In [4]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes
!pip install rouge_score

In [5]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",          # Phi-3 2x faster!d
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.43.4.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [6]:
import pandas as pd

In [7]:
alpaca_prompt = """Below is a instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{}
### Input:
{}
### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    instructions = examples["instruction"] #"Generate a subject for the email body defined in Input section"
    inputs       = examples["text"]
    outputs      = examples["subject"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }


In [8]:
#load test data and retrieve results/metrics for pretrained model
test_df = pd.read_csv('email/enron_subject_test.csv')


In [9]:
test_df = test_df.head(5)
test_df = test_df[['text', 'subject']]
test_subjects = test_df['subject']
test_df['subject'] = ''
#test_df['instruction'] = "Generate a subject for the email body defined in Input section in not more than 50 words"
#test_df['instruction'] = "Generate a summary for the text in Input section in not more than 50 words"
test_df['instruction'] = "Input contains email text, generate a subject for that email text in not 10 words"
from datasets import Dataset
test_dataset = Dataset.from_pandas(test_df)
test_dataset = test_dataset.map(formatting_prompts_func, batched = True,)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['subject'] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['instruction'] = "Input contains email text, generate a subject for that email text in not 10 words"


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [10]:
def get_output(email_text):
  #"Generate a summary for the Input section in not more than 50 words"
  inputs = tokenizer(
                        [
                            alpaca_prompt.format(
                                # "Generate a subject for the email body defined in Input section in not more than 50 words", # instruction
                                "Generate a subject in for that email text defined in #Input in not more than 10 words",
                                email_text, # input
                                "", # output - leave this blank for generation!
                            )
                        ], return_tensors = "pt").to("cuda")

  from transformers import TextStreamer
  text_streamer = TextStreamer(tokenizer)
  result = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)
  decoded = tokenizer.batch_decode(result)
  response_text = decoded[0].split("### Response:")[-1].strip().replace('<|end_of_text|>','').replace('<|begin_of_text|>:// ','').replace('<|end_of_text|>','')
  return response_text

In [34]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.padding_side = "left"
# inputs = tokenizer(test_dataset['text'], return_tensors = "pt", padding=True).to("cuda")

In [35]:
model_output=[]
for i, row in test_df.iterrows():
  email_text = row['text']
  model_output.append(get_output(email_text))

<|begin_of_text|>Below is a instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
Generate a subject in for that email text defined in #Input in not more than 10 words
### Input:
Phillip,   Could you please do me a favor?
I would like  to read your current title policy to see what it says about easements.
You  should have received a copy during your closing.
I don't know how many  pages it will be but let me know how you want to handle getting a copy  made.
I'll be happy to make the copy, or whatever makes it easy for  you.
Thanks,
### Response:
Subject:  Easement Title Policy
Dear Phillip,
Thank you for your email. I have attached a copy of my current title  policy for your review. Please let me know if you have any questions.
Regards,
<|end_of_text|>
<|begin_of_text|>Below is a instruction that describes a task, paired with an input that provides further context. Write a respo

In [37]:
from datasets import load_metric

def get_rouge_scores(model_output, test_subjects):

  # Load the ROUGE metric
  rouge_metric = load_metric('rouge')

  # Add references and predictions to the metric
  for reference, prediction in zip(model_output, list(test_subjects)):
      rouge_metric.add(prediction=prediction, reference=reference)

  # Calculate the scores
  rouge_result = rouge_metric.compute()


  return rouge_result

In [37]:
rouge_results = get_rouge_scores(model_output, test_subjects)
# Display the results
for rouge_type in rouge_results:
    print(f"{rouge_type}: {rouge_results[rouge_type].mid}")

rouge1: Score(precision=0.29500000000000004, recall=0.02685483870967742, fmeasure=0.047619047619047616)
rouge2: Score(precision=0.18333333333333332, recall=0.011729957805907174, fmeasure=0.021520803443328552)
rougeL: Score(precision=0.29500000000000004, recall=0.02685483870967742, fmeasure=0.047619047619047616)
rougeLsum: Score(precision=0.29500000000000004, recall=0.02685483870967742, fmeasure=0.047619047619047616)


In [11]:
FastLanguageModel.for_training(model)

In [12]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


<a name="Data"></a>
### Data Prep
We now use the Alpaca dataset from [yahma](https://huggingface.co/datasets/yahma/alpaca-cleaned), which is a filtered version of 52K of the original [Alpaca dataset](https://crfm.stanford.edu/2023/03/13/alpaca.html). You can replace this code section with your own data prep.

**[NOTE]** To train only on completions (ignoring the user's input) read TRL's docs [here](https://huggingface.co/docs/trl/sft_trainer#train-on-completions-only).

**[NOTE]** Remember to add the **EOS_TOKEN** to the tokenized output!! Otherwise you'll get infinite generations!

If you want to use the `llama-3` template for ShareGPT datasets, try our conversational [notebook](https://colab.research.google.com/drive/1XamvWYinY6FOSX9GLvnqSjjsNflxdhNc?usp=sharing).

For text completions like novel writing, try this [notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing).

In [13]:
import pandas as pd
train_df = pd.read_csv('email/enron_subject_train.csv')

In [14]:
train_df.shape

(14436, 5)

In [15]:
train_df = train_df.head(1000)

In [16]:
train_df = train_df[['text', 'subject']]
train_df['instruction'] = "Generate a subject in for that email text defined in #Input in not more than 10 words"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['instruction'] = "Generate a subject in for that email text defined in #Input in not more than 10 words"


In [17]:
from datasets import Dataset
dataset = Dataset.from_pandas(train_df)

In [18]:
dataset

Dataset({
    features: ['text', 'subject', 'instruction'],
    num_rows: 1000
})

In [19]:
dataset = dataset.map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [20]:
dataset[0]

{'text': 'Below is a instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n### Instruction:\nGenerate a subject in for that email text defined in #Input in not more than 10 words\n### Input:\nGreg/Phillip,  Attached is the Grande Communications Service Agreement.\nThe business points can be found in Exhibit C.  I Can get the Non-Disturbance agreement after it has been executed by you and Grande.\nI will fill in the Legal description of the property one I have received it.\nPlease execute and send to:  Grande Communications, 401 Carlson Circle, San Marcos Texas, 78666 Attention Hunter Williams.\n<<Bishopscontract.doc>>\n### Response:\nService Agreement<|end_of_text|>',
 'subject': 'Service Agreement',
 'instruction': 'Generate a subject in for that email text defined in #Input in not more than 10 words'}

In [21]:
dataset

Dataset({
    features: ['text', 'subject', 'instruction'],
    num_rows: 1000
})

<a name="Train"></a>
### Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We do 60 steps to speed things up, but you can set `num_train_epochs=1` for a full run, and turn off `max_steps=None`. We also support TRL's `DPOTrainer`!

In [22]:
from google.colab import userdata
hf_token = userdata.get('HF_TOKEN')

In [23]:
from huggingface_hub import login
login(token=hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [24]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        #push_to_hub = True,
        #push_to_hub_model_id = "group13_llama31",
        #num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 20,# 60,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

Map (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [25]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
5.984 GB of memory reserved.


In [26]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 20
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,3.7009
2,3.1304
3,3.1131
4,3.529
5,3.1308
6,3.1423
7,2.8214
8,2.3721
9,2.6178
10,2.3195


In [27]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

165.6915 seconds used for training.
2.76 minutes used for training.
Peak reserved memory = 9.436 GB.
Peak reserved memory for training = 3.452 GB.
Peak reserved memory % of max memory = 63.982 %.
Peak reserved memory for training % of max memory = 23.407 %.


<a name="Inference"></a>
### Inference
Let's run the model! You can change the instruction and input - leave the output blank!

**[NEW] Try 2x faster inference in a free Colab for Llama-3.1 8b Instruct [here](https://colab.research.google.com/drive/1T-YBVfnphoVc8E2E854qF3jdia2Ll2W2?usp=sharing)**

 You can also use a `TextStreamer` for continuous inference - so you can see the generation token by token, instead of waiting the whole time!

In [28]:
model.save_pretrained("esubjectgen_llama31_clean1") # Local saving
tokenizer.save_pretrained("esubjectgen_llama31_clean1")

('esubjectgen_llama31_clean1/tokenizer_config.json',
 'esubjectgen_llama31_clean1/special_tokens_map.json',
 'esubjectgen_llama31_clean1/tokenizer.json')

In [29]:
from huggingface_hub import notebook_login
notebook_login()

model.push_to_hub("esubjectgen_llama31_clean", token = hf_token) # Online saving
tokenizer.push_to_hub("esubjectgen_llama31_clean", token = hf_token) # Online saving

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

Saved model to https://huggingface.co/esubjectgen_llama31_clean


In [30]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

In [32]:
sub_set = test_df.head(5)

In [33]:
model_output=[]
for i, row in sub_set.iterrows():
  email_text = row['text']
  model_output.append(get_output(email_text))

<|begin_of_text|>Below is a instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
Generate a subject in for that email text defined in #Input in not more than 10 words
### Input:
Phillip,   Could you please do me a favor?
I would like  to read your current title policy to see what it says about easements.
You  should have received a copy during your closing.
I don't know how many  pages it will be but let me know how you want to handle getting a copy  made.
I'll be happy to make the copy, or whatever makes it easy for  you.
Thanks,
### Response:
Title Policy<|end_of_text|>
<|begin_of_text|>Below is a instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
Generate a subject in for that email text defined in #Input in not more than 10 words
### Input:
The following reports h

In [34]:
model_output


['Title Policy',
 'Reports',
 'Netco Re-start/Integration Plans',
 'Regulatory Issues and Corporate Governance',
 'Distribution of PSA balances']

In [35]:
test_subjects = test_subjects[:len(model_output)]

In [38]:
rouge_results = get_rouge_scores(model_output, test_subjects)
# Display the results
for rouge_type in rouge_results:
    print(f"{rouge_type}: {rouge_results[rouge_type].mid}")

  rouge_metric = load_metric('rouge')


rouge1: Score(precision=0.19, recall=0.32, fmeasure=0.19999999999999998)
rouge2: Score(precision=0.13333333333333333, recall=0.1, fmeasure=0.1142857142857143)
rougeL: Score(precision=0.19, recall=0.32, fmeasure=0.19999999999999998)
rougeLsum: Score(precision=0.19, recall=0.32, fmeasure=0.19999999999999998)


In [None]:
# rouge_results = get_rouge_scores(model_output, test_subjects)
# # Display the results
# for rouge_type in rouge_results:
#     print(f"{rouge_type}: {rouge_results[rouge_type].mid}")

rouge1: Score(precision=0.175, recall=0.2333333333333333, fmeasure=0.19444444444444445)
rouge2: Score(precision=0.06666666666666667, recall=0.06666666666666667, fmeasure=0.06666666666666667)
rougeL: Score(precision=0.155, recall=0.20833333333333331, fmeasure=0.1765873015873016)
rougeLsum: Score(precision=0.15499999999999997, recall=0.2, fmeasure=0.1746031746031746)


In [39]:
email_text = """Plove is going to go to Dallas.
We are going to leave next Friday when he  gets done (7ish) and go up for the game.
The game is at 11 in the morning,  so we will come home directly after it.
Plove says he has a friend who has a  place in Dallas that we can crash at if we don't want to pay for a hotel.
Do you want to go?
        """
result = get_output(email_text)


<|begin_of_text|>Below is a instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
Generate a subject in for that email text defined in #Input in not more than 10 words
### Input:
Plove is going to go to Dallas.
We are going to leave next Friday when he  gets done (7ish) and go up for the game.
The game is at 11 in the morning,  so we will come home directly after it.
Plove says he has a friend who has a  place in Dallas that we can crash at if we don't want to pay for a hotel.
Do you want to go?
        
### Response:
Dallas game<|end_of_text|>


In [40]:
result

'Dallas game'