In [1]:
#Copy data to collab
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import shutil

In [5]:
destination_path = '/content/emailsubjects'
source_path = '/content/drive/My Drive/emailsubjects/'

# Copy the file
shutil.copytree(source_path, destination_path,dirs_exist_ok=True)

'/content/emailsubjects'

In [2]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes
!pip install rouge_score

* We support Llama, Mistral, Phi-3, Gemma, Yi, DeepSeek, Qwen, TinyLlama, Vicuna, Open Hermes etc
* We support 16bit LoRA or 4bit QLoRA. Both 2x faster.
* `max_seq_length` can be set to anything, since we do automatic RoPE Scaling via [kaiokendev's](https://kaiokendev.github.io/til) method.
* [**NEW**] We make Gemma-2 9b / 27b **2x faster**! See our [Gemma-2 9b notebook](https://colab.research.google.com/drive/1vIrqH5uYDQwsJ4-OO3DErvuv4pBgVwk4?usp=sharing)
* [**NEW**] To finetune and auto export to Ollama, try our [Ollama notebook](https://colab.research.google.com/drive/1WZDi7APtQ9VsvOrQSSC5DDtxq159j8iZ?usp=sharing)
* [**NEW**] We make Mistral NeMo 12B 2x faster and fit in under 12GB of VRAM! [Mistral NeMo notebook](https://colab.research.google.com/drive/17d3U-CAIwzmbDRqbZ9NnpHxCkmXB6LZ0?usp=sharing)

In [115]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/mistral-7b-v0.3-bnb-4bit",      # New Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",           # Llama-3 15 trillion tokens model 2x faster!
    "unsloth/llama-3-8b-Instruct-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",        # Phi-3 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",          # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/gemma-7b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth 2024.8: Fast Gemma patching. Transformers = 4.44.0.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [116]:
import pandas as pd

In [117]:
# alpaca_prompt = """Below is a instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

# ### Instruction:
# {}

# ### Input:
# {}

# ### Response:
# {}"""

#email_prompt = "Generate a concise and relevant subject line for the following email body:\n\n{}\n\nSubject Line:"


alpaca_prompt = """Below is a instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    instructions = examples["instruction"] #"Generate a subject for the email body defined in Input section"
    inputs       = examples["text"]
    outputs      = examples["subject"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }


In [181]:
#load test data and retrieve results/metrics for pretrained model
test_df = pd.read_csv('emailsubjects/enron_subject_test.csv')
test_df = test_df[['text', 'subject']]
test_subjects = test_df['subject']
test_df['subject'] = ''
#test_df['instruction'] = "Generate a subject for the email body defined in Input section in not more than 50 words"
test_df['instruction'] = "Generate a summary for the text in Input section in not more than 50 words"
from datasets import Dataset
test_dataset = Dataset.from_pandas(test_df)
test_dataset = test_dataset.map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/1906 [00:00<?, ? examples/s]

In [136]:
test_df = test_df.head(15)

In [137]:
def get_test_output(email_text):
  #"Generate a summary for the Input section in not more than 50 words"
  email_prompt = "Generate a subject for the email below,:\n\n{}\n\nSubject Line:"

  inputs = tokenizer(
                        [
                            email_prompt.format(
                                email_text # input
                            )
                        ], return_tensors = "pt").to("cuda")

  from transformers import TextStreamer
  text_streamer = TextStreamer(tokenizer)
  result = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

  decoded = tokenizer.decode(result[0], skip_special_tokens=True)
  return decoded



In [138]:
def get_output(email_text):
  #"Generate a summary for the Input section in not more than 50 words"
  inputs = tokenizer(
                        [
                            alpaca_prompt.format(
                                # "Generate a subject for the email body defined in Input section in not more than 50 words", # instruction
                                "Generate a subject line for the following email",
                                email_text, # input
                                "", # output - leave this blank for generation!
                            )
                        ], return_tensors = "pt").to("cuda")

  from transformers import TextStreamer
  text_streamer = TextStreamer(tokenizer)
  # result = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)


  result = model.generate(    **inputs,
    #streamer = text_streamer,
    max_new_tokens=128,  # Limit length to encourage short outputs
    num_return_sequences=1,  # Generate one sequence
    no_repeat_ngram_size=2,  # Avoid repeating phrases
    early_stopping=True  # Stop when the model thinks it’s done
    )

  decoded = tokenizer.batch_decode(result, skip_special_tokens=True)
  response_text = decoded[0].split("### Response:")[-1].strip().replace('<|end_of_text|>','').replace('<|begin_of_text|>:// ','').replace('Subject: ','').split('\n')[0]
  return response_text[0] if isinstance(response_text, list) else response_text


In [152]:
email_text = """As our last day is Friday, November 30th, we would love to toast the good times and special memories that we have shared with you over the past five years.
Please join us at Teala's (W. Dallas) on Thursday, November 29th, beginning at 5pm.
Looking forward to being with you,   Lara and Janel     Lara Leibman
        """
result = get_output(email_text)

In [139]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.padding_side = "left"
# inputs = tokenizer(test_dataset['text'], return_tensors = "pt", padding=True).to("cuda")

In [151]:
test_df.iloc[6]['text']

"As our last day is Friday, November 30th, we would love to toast the good times and special memories that we have shared with you over the past five years.\nPlease join us at Teala's (W. Dallas) on Thursday, November 29th, beginning at 5pm.\nLooking forward to being with you,   Lara and Janel     Lara Leibman"

In [None]:
model_output=[]
for i, row in test_df.iterrows():
  email_text = row['text']
  # print(email_text)

  model_output.append(get_output(email_text))


In [None]:
model_output

In [143]:
from datasets import load_metric

def get_rouge_scores(model_output, test_subjects):

  # Load the ROUGE metric
  rouge_metric = load_metric('rouge')

  # Add references and predictions to the metric
  for reference, prediction in zip(model_output, list(test_subjects)):
      rouge_metric.add(prediction=prediction, reference=reference)

  # Calculate the scores
  rouge_result = rouge_metric.compute()


  return rouge_result

In [144]:
#model_output

In [145]:
rouge_results = get_rouge_scores(model_output, test_subjects)
# Display the results
for rouge_type in rouge_results:
    print(f"{rouge_type}: {rouge_results[rouge_type].mid}")

rouge1: Score(precision=0.2236111111111111, recall=0.17531746031746032, fmeasure=0.16869565217391305)
rouge2: Score(precision=0.05555555555555555, recall=0.06829268292682927, fmeasure=0.04747474747474747)
rougeL: Score(precision=0.21333333333333335, recall=0.16777777777777778, fmeasure=0.16135265700483092)
rougeLsum: Score(precision=0.21555555555555556, recall=0.1657142857142857, fmeasure=0.16091787439613525)


In [20]:
FastLanguageModel.for_training(model)

In [154]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = True, # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.8 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


<a name="Data"></a>
### Data Prep
We now use the Alpaca dataset from [yahma](https://huggingface.co/datasets/yahma/alpaca-cleaned), which is a filtered version of 52K of the original [Alpaca dataset](https://crfm.stanford.edu/2023/03/13/alpaca.html). You can replace this code section with your own data prep.

**[NOTE]** To train only on completions (ignoring the user's input) read TRL's docs [here](https://huggingface.co/docs/trl/sft_trainer#train-on-completions-only).

**[NOTE]** Remember to add the **EOS_TOKEN** to the tokenized output!! Otherwise you'll get infinite generations!

If you want to use the `llama-3` template for ShareGPT datasets, try our conversational [notebook](https://colab.research.google.com/drive/1XamvWYinY6FOSX9GLvnqSjjsNflxdhNc?usp=sharing).

For text completions like novel writing, try this [notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing).

In [155]:
import pandas as pd
train_df = pd.read_csv('emailsubjects/enron_subject_train.csv')

In [156]:
#train_df.shape

In [157]:
train_df = train_df.head(1000)

In [158]:
train_df = train_df[['text', 'subject']]
train_df['instruction'] = "Generate a subject for the email body defined in Input section"

In [159]:
train_df.head(5)

Unnamed: 0,text,subject,instruction
0,"Greg/Phillip, Attached is the Grande Communic...",Service Agreement,Generate a subject for the email body defined ...
1,Phillip & Keith Attached is the first draw re...,Bishops Corner,Generate a subject for the email body defined ...
2,Your Internet Banking accounts are now setup a...,Internet Banking,Generate a subject for the email body defined ...
3,To our IBS Customers that are still hanging in...,Internet Banking,Generate a subject for the email body defined ...
4,Phillip Good Morning!\nI hope you had a wonder...,SMEs for expert stories,Generate a subject for the email body defined ...


In [160]:
from datasets import Dataset
dataset = Dataset.from_pandas(train_df)

In [161]:
dataset

Dataset({
    features: ['text', 'subject', 'instruction'],
    num_rows: 1000
})

In [162]:
dataset = dataset.map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [163]:
dataset[0]['text']

'Below is a instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nGenerate a subject for the email body defined in Input section\n\n### Input:\nGreg/Phillip,  Attached is the Grande Communications Service Agreement.\nThe business points can be found in Exhibit C.  I Can get the Non-Disturbance agreement after it has been executed by you and Grande.\nI will fill in the Legal description of the property one I have received it.\nPlease execute and send to:  Grande Communications, 401 Carlson Circle, San Marcos Texas, 78666 Attention Hunter Williams.\n<<Bishopscontract.doc>>\n\n### Response:\nService Agreement<eos>'

In [164]:
dataset[0]

{'text': 'Below is a instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nGenerate a subject for the email body defined in Input section\n\n### Input:\nGreg/Phillip,  Attached is the Grande Communications Service Agreement.\nThe business points can be found in Exhibit C.  I Can get the Non-Disturbance agreement after it has been executed by you and Grande.\nI will fill in the Legal description of the property one I have received it.\nPlease execute and send to:  Grande Communications, 401 Carlson Circle, San Marcos Texas, 78666 Attention Hunter Williams.\n<<Bishopscontract.doc>>\n\n### Response:\nService Agreement<eos>',
 'subject': 'Service Agreement',
 'instruction': 'Generate a subject for the email body defined in Input section'}

<a name="Train"></a>
### Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We do 60 steps to speed things up, but you can set `num_train_epochs=1` for a full run, and turn off `max_steps=None`. We also support TRL's `DPOTrainer`!

In [165]:
from google.colab import userdata
hf_token = userdata.get('HF_TOKEN')

In [166]:
from huggingface_hub import login
login(token=hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [167]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 2,
        warmup_steps = 5,
        #push_to_hub = True,
        #push_to_hub_model_id = "group13_llama31",
        #num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 50,# 60,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

Map (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [35]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
7.381 GB of memory reserved.


In [None]:
trainer_stats = trainer.train()

In [None]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

<a name="Inference"></a>
### Inference
Let's run the model! You can change the instruction and input - leave the output blank!

**[NEW] Try 2x faster inference in a free Colab for Llama-3.1 8b Instruct [here](https://colab.research.google.com/drive/1T-YBVfnphoVc8E2E854qF3jdia2Ll2W2?usp=sharing)**

 You can also use a `TextStreamer` for continuous inference - so you can see the generation token by token, instead of waiting the whole time!

In [171]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

In [182]:
sub_set = test_df.head(150)

In [183]:
model_output=[]
for i, row in sub_set.iterrows():
  email_text = row['text']
  model_output.append(get_output(email_text))

In [184]:
model_output


['Title Policy',
 'Reports',
 'Netco ReStart/ Integration Plans',
 'Minor Comments',
 'Single Sum Distribution',
 'BC Seats',
 'Teala Toast',
 'Slide',
 'ENA Participation in Analyst and Associate Recruitment',
 'Enron Tower Security',
 'Enron',
 'Netco Start Up',
 'Potential deal',
 'ConEd Peaking',
 'UBS Relocation Policy',
 'Important Information Concerning the EnRON Stock Fund in EnRon Corp. Savings & ESop Plans',
 'UBS - EnRON -  Contract',
 'UBS Warbug Energy Document/Data Copy',
 'Annual Benefit',
 'Server Maintenance',
 'Enron/Dynegy Merger',
 'Enron Mail Sweeper',
 'Index Products',
 'IR',
 'Dues',
 'Dues',
 'Trade',
 'Draft Order',
 'Draft date',
 'Game tonight',
 'Bryan Hull',
 '9801 P & L',
 'Fire Drill',
 'End-Of-Holiday Sale',
 'ARCH',
 'Re:  Change of Control',
 'No Migration',
 'Southwest Airlines Vacations',
 'Fire House New Years',
 'EGM Employee Reinstatements',
 'Plan Headcount',
 'ClickAt Home',
 'Officialization',
 'EA Staff meeting',
 'Weekly Summary Report',
 'M

In [None]:
#test_subjects

In [186]:
test_subjects = test_subjects[:len(model_output)]

In [187]:
rouge_results = get_rouge_scores(model_output, test_subjects)
# Display the results
for rouge_type in rouge_results:
    print(f"{rouge_type}: {rouge_results[rouge_type].mid}")

rouge1: Score(precision=0.2909603637103637, recall=0.4126650386650388, fmeasure=0.31159777259777244)
rouge2: Score(precision=0.12151587301587302, recall=0.17447222222222222, fmeasure=0.13203683897801544)
rougeL: Score(precision=0.28521548821548826, recall=0.4059546194546195, fmeasure=0.30554160264686564)
rougeLsum: Score(precision=0.28511806711806714, recall=0.4064827024827026, fmeasure=0.3058324628587784)


In [188]:
email_text = """Plove is going to go to Dallas.
We are going to leave next Friday when he  gets done (7ish) and go up for the game.
The game is at 11 in the morning,  so we will come home directly after it.
Plove says he has a friend who has a  place in Dallas that we can crash at if we don't want to pay for a hotel.
Do you want to go?
        """
result = get_output(email_text)


In [189]:
result

'Dallas'

<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [190]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [192]:
model.push_to_hub("email_gemma", token = hf_token) # Online saving
tokenizer.push_to_hub("email_gemma", token = hf_token) # Online saving

README.md:   0%|          | 0.00/577 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/200M [00:00<?, ?B/s]

Saved model to https://huggingface.co/email_gemma


  0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]