# Email subject generation - Instruct Version of Gemma-2b

In [None]:
%%capture
# Install Unsloth from its GitHub repository, then the training stack used below.
# The output of this cell is hidden by %%capture.
# NOTE(review): the Unsloth install is not pinned to a tag or commit, so a rerun may
# pick up a different version — confirm whether a fixed revision is wanted.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# --no-deps stops pip from pulling in the dependencies of these packages; xformers
# and trl are capped by the version specifiers given here.
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

In [None]:
from unsloth import FastLanguageModel
import torch
# Maximum sequence length handed to the model loader and the trainer
# (per the Unsloth template, RoPE scaling is supported automatically).
max_seq_length = 2048
# None lets the loader auto-detect the dtype; use float16 on a Tesla T4.
dtype = None
# Load 4-bit quantized weights to reduce memory usage.
load_in_4bit = True

# Pre-quantized 4-bit checkpoints offered by Unsloth (faster download, fewer OOMs).
# More models at https://huggingface.co/unsloth
fourbit_models = [
    "unsloth/mistral-7b-v0.3-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",
    "unsloth/llama-3-8b-Instruct-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",
]


In [None]:
# Load the 4-bit Gemma-2b checkpoint and its tokenizer, using the sequence length,
# dtype and quantization flag set in the configuration cell.
# NOTE(review): the notebook title mentions the "Instruct Version" of Gemma-2b, but
# this checkpoint name carries no instruct suffix — confirm this is the intended model.
model, tokenizer = FastLanguageModel.from_pretrained(
              model_name = "unsloth/gemma-2b-bnb-4bit",
              max_seq_length = max_seq_length,
              dtype = dtype,
              load_in_4bit = load_in_4bit,
              # token = "hf_...", # For gated models (when using tokens to access the org specific models)
)

==((====))==  Unsloth 2024.8: Fast Gemma patching. Transformers = 4.44.0.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


## **LoRA adapters**

In [None]:
# Attach LoRA adapters to the loaded model (per the Unsloth template, this updates
# roughly 1 to 10% of all parameters). `model` is rebound to the wrapped model.
model = FastLanguageModel.get_peft_model(
                        model,
                        r = 16, # LoRA rank; the template suggests 8, 16, 32, 64 or 128
                        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj",],
                        lora_alpha = 16,
                        lora_dropout = 0, # 0 is optimized
                        bias = "none",    # "none" is optimized
                        use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context; the template claims "unsloth" uses 30% less VRAM
                        random_state = 5000, # previously 3407
                        use_rslora = False,  # rank-stabilized LoRA is switched off
                        loftq_config = None, # LoftQ is not used
                      )

In [None]:
!git clone https://github.com/ryanzhumich/AESLC.git

fatal: destination path 'AESLC' already exists and is not an empty directory.


In [None]:
#Generate JSON file from email dataset
import os
import json
import pandas as pd

# Folder holding the AESLC training files (one email per ".subject" file).
folder_path = '/content/AESLC/enron_subject_line/train'

# One record per email, in the instruction/input/output layout used for fine-tuning.
data = []
instruction = 'Please help summarize the provided email body and generate email subject'

# os.listdir returns names in arbitrary order; sorting keeps the record order (and
# therefore the dataset handed to the trainer) the same from run to run.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".subject"):
        continue
    with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
        content = file.read()
    # Files without the marker have no subject to learn from and are skipped.
    if '@subject' not in content:
        continue
    # Split on the first marker only, so a second "@subject" inside the text cannot
    # break the two-way unpacking.
    body_text, subject_text = content.split('@subject', 1)
    data.append({
        'instruction': instruction,
        'input': body_text.strip(),
        'output': subject_text.strip()
    })

# Save the data to a JSON file
json_path = '/content/dataset.json'
with open(json_path, 'w', encoding='utf-8') as json_file:
    json.dump(data, json_file, ensure_ascii=False, indent=4)

print(f"JSON file saved to {json_path}")

JSON file saved to /content/dataset.json


## Data preparation


In [None]:
email_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""


# Appended to every training text so the model learns where a response ends.
EOS_TOKEN = tokenizer.eos_token


def formatting_prompts_func(examples):
    """Build the training text for a batch of records.

    `examples` is a batch with "instruction", "input" and "output" columns.
    Each row is rendered through `email_prompt` and terminated with EOS_TOKEN.
    Returns a dict with a single "text" column holding the rendered strings.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    return {
        "text": [
            email_prompt.format(task, email_body, subject) + EOS_TOKEN
            for task, email_body, subject in rows
        ],
    }

from datasets import Dataset
import json
# Read the JSON records from disk and wrap them in a Hugging Face Dataset.
json_path = '/content/dataset.json'

with open(json_path, 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Alternative loader, kept for reference:
#   from datasets import load_dataset
#   dataset = load_dataset("/content/dataset.json", split = "train")
# Add the "text" column with the fully rendered prompt for every record.
dataset = Dataset.from_list(data).map(formatting_prompts_func, batched = True)


Map:   0%|          | 0/14436 [00:00<?, ? examples/s]

## Train the model



In [None]:
# Supervised fine-tuning with Hugging Face TRL's `SFTTrainer`
# (docs: https://huggingface.co/docs/trl/sft_trainer).

from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Use bf16 when the GPU supports it and fall back to fp16 otherwise.
use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    warmup_steps = 5,
    max_steps = 40,  # tweaked from 60
    learning_rate = 2e-4,
    fp16 = not use_bf16,
    bf16 = use_bf16,
    logging_steps = 1,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 5000,  # previously 3407
    output_dir = "outputs",
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,  # packing can make training 5x faster for short sequences
    args = training_args,
)

# For a full run, set `num_train_epochs = 1` and drop `max_steps` (a given max_steps
# overrides num_train_epochs). TRL's `DPOTrainer` is supported as well.


Map (num_proc=2):   0%|          | 0/14436 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


### Current memory stats

In [None]:
def _bytes_to_gib(num_bytes):
    """Convert a byte count to GiB, rounded to three decimals."""
    return round(num_bytes / 1024 / 1024 / 1024, 3)


print("Current Memory Stats:")
gpu_stats = torch.cuda.get_device_properties(0)
# Baseline reading taken before training; the post-training report subtracts it.
start_gpu_memory = _bytes_to_gib(torch.cuda.max_memory_reserved())
max_memory = _bytes_to_gib(gpu_stats.total_memory)
for stats_line in (
    f"GPU = {gpu_stats.name}",
    f"Max memory = {max_memory} GB",
    f"Memory reserved = {start_gpu_memory} GB",
    "-----------------------------------------------------------------",
):
    print(stats_line)

Current Memory Stats:
GPU = Tesla T4
Max memory = 14.748 GB
Memory reserved = 4.506 GB
-----------------------------------------------------------------


In [None]:
# Run the fine-tuning loop; the returned object's `metrics` are read by the
# post-training report for the runtime figures.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,436 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 40
 "-____-"     Number of trainable parameters = 19,611,648


Step,Training Loss
1,3.259
2,3.4076
3,3.2123
4,3.2756
5,2.9801
6,2.5915
7,3.1101
8,2.9284
9,2.8553
10,2.6431


In [None]:
#@title Show final memory and time stats
def _share_of_max(gigabytes):
    """Express a GB figure as a percentage of the GPU's total memory."""
    return round(gigabytes /max_memory*100, 3)


# Peak reserved memory so far, and how much of it was added since the baseline reading.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = _share_of_max(used_memory)
lora_percentage = _share_of_max(used_memory_for_lora)

print("Training Memory Stats:")
runtime_seconds = trainer_stats.metrics['train_runtime']
print(f"{round(runtime_seconds/60, 2)} minutes used for training.")
print(f"{runtime_seconds} seconds used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
print("-----------------------------------------------------------------------------------------")

Training Memory Stats:
2.03 minutes used for training.
121.5326 seconds used for training.
Peak reserved memory = 8.48 GB.
Peak reserved memory for training = 6.179 GB.
Peak reserved memory % of max memory = 57.499 %.
Peak reserved memory for training % of max memory = 41.897 %.
-----------------------------------------------------------------------------------------


<a name="Inference"></a>
### Inference

In [None]:
# Generate a subject for one sample email. Change the instruction and the email body
# as needed; the response slot of the prompt stays empty so the model fills it in.
# `email_prompt` is the template defined in the data-preparation cell.
FastLanguageModel.for_inference(model)  # enable Unsloth's faster inference mode

sample_instruction = "Please help summarize the provided email body and generate email subject"
sample_email = "Kevin Presto is requesting that you attend a meeting regarding Organizing an Action Plan for the Start-up of Netco.\nThe meeting will be held in ECS 06716 at 9:30 am, Wednesday, January 2, 2002.\nFor Tim and Chris, could you please call 713-584-2067.\nThis is the telephone number in the conference room.\nIf you should have any questions, please call T Jae Black at 3-5800.\nThanks"

sample_prompt = email_prompt.format(sample_instruction, sample_email, "")
inputs = tokenizer([sample_prompt], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

# TODO: take a substring of the decoded text so that only the response is shown.


['<bos>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nPlease help summarize the provided email body and generate email subject\n\n### Input:\nKevin Presto is requesting that you attend a meeting regarding Organizing an Action Plan for the Start-up of Netco.\nThe meeting will be held in ECS 06716 at 9:30 am, Wednesday, January 2, 2002.\nFor Tim and Chris, could you please call 713-584-2067.\nThis is the telephone number in the conference room.\nIf you should have any questions, please call T Jae Black at 3-5800.\nThanks\n\n### Response:\nKevin Presto is requesting that you attend a meeting regarding Organizing an Action Plan for the Start-up of Netco.<eos>']

In [None]:
# Stream the generation token by token with `TextStreamer` instead of waiting for
# the whole output. `email_prompt` is the template defined in the data-preparation cell.
from transformers import TextStreamer

FastLanguageModel.for_inference(model)  # enable Unsloth's faster inference mode

stream_instruction = "Please help summarize the provided email body and generate email subject"
stream_email = "Kevin Presto is requesting that you attend a meeting regarding Organizing an Action Plan for the Start-up of Netco.\nThe meeting will be held in ECS 06716 at 9:30 am, Wednesday, January 2, 2002.\nFor Tim and Chris, could you please call 713-584-2067.\nThis is the telephone number in the conference room.\nIf you should have any questions, please call T Jae Black at 3-5800.\nThanks"
# Alternative sample email:
# stream_email = "Phillip,   Could you please do me a favor?I would like  to read your current title policy to see what it says about easements.You  should have received a copy during your closing.I don't know how many  pages it will be but let me know how you want to handle getting a copy  made.I'll be happy to make the copy, or whatever makes it easy for  you.Thanks,"

# The response slot is left empty so the model generates it.
stream_prompt = email_prompt.format(stream_instruction, stream_email, "")
inputs = tokenizer([stream_prompt], return_tensors = "pt").to("cuda")

text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)


<bos>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Please help summarize the provided email body and generate email subject

### Input:
Kevin Presto is requesting that you attend a meeting regarding Organizing an Action Plan for the Start-up of Netco.
The meeting will be held in ECS 06716 at 9:30 am, Wednesday, January 2, 2002.
For Tim and Chris, could you please call 713-584-2067.
This is the telephone number in the conference room.
If you should have any questions, please call T Jae Black at 3-5800.
Thanks

### Response:
Kevin Presto is requesting that you attend a meeting regarding Organizing an Action Plan for the Start-up of Netco.<eos>


<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, use `save_pretrained` to save locally (otherwise Huggingface's `push_to_hub` to save online).


In [None]:
# Save the LoRA adapters and the tokenizer files into a local folder; the reload
# cell below loads the model back from this same folder name.
model.save_pretrained("EmailSubGen_Gemma2b_lora_model") # Local saving
tokenizer.save_pretrained("EmailSubGen_Gemma2b_lora_model")
# To publish to the Hugging Face Hub instead, uncomment these lines and fill in the
# repository name and access token:
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

('EmailSubGen_Gemma2b_lora_model/tokenizer_config.json',
 'EmailSubGen_Gemma2b_lora_model/special_tokens_map.json',
 'EmailSubGen_Gemma2b_lora_model/tokenizer.model',
 'EmailSubGen_Gemma2b_lora_model/added_tokens.json',
 'EmailSubGen_Gemma2b_lora_model/tokenizer.json')

Now if you want to load the LoRA adapters we just saved for inference, set `False` to `True`:

In [None]:
if True:
    # Reload the locally saved LoRA model and its tokenizer for inference.
    from unsloth import FastLanguageModel
    reload_kwargs = dict(
        model_name = "EmailSubGen_Gemma2b_lora_model",  # folder written by save_pretrained
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
        # load_in_8bit_fp32_cpu_offload=True,  # uncomment to enable CPU offloading
        device_map = {"":0},  # place the whole model on GPU 0
    )
    model, tokenizer = FastLanguageModel.from_pretrained(**reload_kwargs)
    FastLanguageModel.for_inference(model)  # enable Unsloth's faster inference mode

# `email_prompt` is the template defined in the data-preparation cell.
reload_instruction = "Please help summarize the provided email body and generate email subject"
reload_email = "The following reports have been waiting for your approval for more than 4 days.Please review.Owner: James W Reitmeyer Report Name: JReitmeyer 10/24/01 Days In Mgr.Queue: 5"

# The response slot is left empty so the model generates it.
reload_prompt = email_prompt.format(reload_instruction, reload_email, "")
inputs = tokenizer([reload_prompt], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

==((====))==  Unsloth 2024.8: Fast Gemma patching. Transformers = 4.44.0.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


['<bos>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nPlease help summarize the provided email body and generate email subject\n\n### Input:\nThe following reports have been waiting for your approval for more than 4 days.Please review.Owner: James W Reitmeyer Report Name: JReitmeyer 10/24/01 Days In Mgr.Queue: 5\n\n### Response:\nJReitmeyer 10/24/01 Days In Mgr.Queue: 5<eos>']

## Save model - zip the LoRA folder and copy it to Google Drive

In [None]:
# prompt: zip folder /content/EmailSubGen_Gemma2_lora_model and upload to google drive

!zip -r /content/EmailSubGen_Gemma2_lora_model.zip /content/EmailSubGen_Gemma2_lora_model
from google.colab import drive
drive.mount('/content/drive')
!cp /content/EmailSubGen_Gemma2_lora_model.zip /content/drive/MyDrive


zip error: Nothing to do! (try: zip -r /content/EmailSubGen_Gemma2_lora_model.zip . -i /content/EmailSubGen_Gemma2_lora_model)
Mounted at /content/drive
cp: cannot stat '/content/EmailSubGen_Gemma2_lora_model.zip': No such file or directory


In [None]:
import os
import json
import pandas as pd

# Folder holding the AESLC test files (one email per ".subject" file).
folder_path = '/content/AESLC/enron_subject_line/test'

# One record per email; 'output' is a list holding the subject line plus any
# annotation lines that follow it in the file.
data = []
instruction = 'Please help summarize the provided email body and generate email subject'

# os.listdir returns names in arbitrary order; sorting keeps the record order, and
# therefore the slice evaluated later, the same from run to run.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".subject"):
        continue
    with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
        content = file.read()
    # Files without the marker have no reference subject and are skipped.
    if '@subject' not in content:
        continue
    # Split on the first marker only, so a second "@subject" inside the text cannot
    # break the two-way unpacking.
    body_text, subject_text = content.split('@subject', 1)

    output = []
    for line in subject_text.strip().splitlines():
        line = line.strip()
        if not line:
            continue
        if line.startswith("@"):
            # Marker line such as "@ann0": drop the marker itself and keep any text
            # that follows it on the same line.
            annotation = line.split()[1:]
            if annotation:
                # Join with spaces; joining with "" ran the words together.
                output.append(" ".join(annotation))
        else:
            # Subject line or annotation text on its own line.
            output.append(line)
    data.append({
        'instruction': instruction,
        'input': body_text.strip(),
        'output': output
    })

# Save the data to a JSON file
json_path = '/content/AESLC/enron_subject_line/test/testdataset.json'
with open(json_path, 'w', encoding='utf-8') as json_file:
    json.dump(data, json_file, ensure_ascii=False, indent=4)

print(f"JSON file saved to {json_path}")


JSON file saved to /content/AESLC/enron_subject_line/test/testdataset.json


In [None]:
email_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""


# Appended to every formatted text so that a response has a clear end marker.
EOS_TOKEN = tokenizer.eos_token


def formatting_prompts_func(examples):
    """Build the formatted text for a batch of records.

    `examples` is a batch with "instruction", "input" and "output" columns.
    Each row is rendered through `email_prompt` and terminated with EOS_TOKEN.
    Returns a dict with a single "text" column holding the rendered strings.
    """
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, email_body, output in zip(instructions, inputs, outputs):
        text = email_prompt.format(instruction, email_body, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }


from datasets import Dataset
import json
# Load the test records written by the previous cell.
json_path = '/content/AESLC/enron_subject_line/test/testdataset.json'

with open(json_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Keep the first 100 test records for evaluation.
testdataset = Dataset.from_list(data[:100])
# Map the TEST set. Mapping `dataset` here replaced `testdataset` with the 14,436
# training records, so the evaluation ran on training data.
testdataset = testdataset.map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/14436 [00:00<?, ? examples/s]

# ROUGE evaluation

In [None]:
%%capture
# Install the evaluation dependencies; the output of this cell is hidden by %%capture.
# NOTE(review): the cells below import only `evaluate`; `rouge_score` is presumably
# the backend its "rouge" metric needs, and `rouge` may be unused — confirm.
!pip install rouge
!pip install evaluate
!pip install rouge_score

In [None]:
from transformers import pipeline
from evaluate import load

# Load the ROUGE metric
rouge = load("rouge")

# Create a text generation pipeline around the fine-tuned model.
generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

# Render every test email through the same prompt template the model was trained
# on, leaving the response slot empty. Sending the bare email body gives the model
# a prompt format it never saw during fine-tuning.
eval_batch = testdataset[:2000]
eval_prompts = [
    email_prompt.format(task, email_body, "")
    for task, email_body in zip(eval_batch["instruction"], eval_batch["input"])
]

# return_full_text=False keeps only the newly generated tokens. With the default,
# every prediction starts with the whole prompt and ROUGE ends up scoring the
# email text rather than the generated subject.
predictions = generator(
    eval_prompts,
    max_new_tokens=8,
    num_beams=1,
    return_full_text=False,
)


The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'Mamba2ForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausal

In [None]:
# The pipeline returns one list of candidates per input; keep the text of the first
# candidate. Run this cell once per generation run, since it overwrites `predictions`.
first_candidates = (candidates[0] for candidates in predictions)
predictions = [candidate['generated_text'] for candidate in first_candidates]


In [None]:
# Score the generated subjects against the reference "output" column with ROUGE.
reference_subjects = testdataset[:2000]["output"]
results = rouge.compute(predictions=predictions, references=reference_subjects)
print(results)

In [None]:
##Observations - AUG 8th-9th:
##--------------------------------------------------
# <4min   max_steps = 50    testdataset[:100]["input"],max_new_tokens=20,  ----  {'rouge1': 0.04703338794882978, 'rouge2': 0.018094740811992135, 'rougeL': 0.04291153191710728, 'rougeLsum': 0.044421997423522985}
# 4mins   max_steps = 50    testdataset[:300]["input"],max_new_tokens=20,  ----  {'rouge1': 0.04763686796739476, 'rouge2': 0.018590431359526753, 'rougeL': 0.04346346295047031, 'rougeLsum': 0.04518140308108524}
# 5mins   max_steps = 50    testdataset[:400]["input"],max_new_tokens=20,  ----  {'rouge1': 0.04841968575892602, 'rouge2': 0.018794301094831288, 'rougeL': 0.04393371452433795, 'rougeLsum': 0.04536474990806128}
# 12mins   max_steps = 50    testdataset[:800]["input"],max_new_tokens=20,  ---- {'rouge1': 0.04890688956561573, 'rouge2': 0.018216384865342265, 'rougeL': 0.0442624056719164, 'rougeLsum': 0.04646280874733754}
# 10mins   max_steps = 40, seed=5000    testdataset[:800]["input"],max_new_tokens=20,  ----  {'rouge1': 0.05063502156641153, 'rouge2': 0.018875734537863367, 'rougeL': 0.04545903906916432, 'rougeLsum': 0.04815762881014813}
# 8mins   max_steps = 40, seed=5000    testdataset[:800]["input"],max_new_tokens=15,  ----  {'rouge1': 0.052136084070747046, 'rouge2': 0.019510195273507815, 'rougeL': 0.04683298782296529, 'rougeLsum': 0.04945309030336453}
# 14mins   max_steps = 40, seed=5000   testdataset[:1200]["input"],max_new_tokens=20,  ----  {'rouge1': 0.05058661057227369, 'rouge2': 0.019035910262427347, 'rougeL': 0.04597646530256609, 'rougeLsum': 0.04820730672046365}
# 7mins   max_steps = 40, seed=5000    testdataset[:1200]["input"],max_new_tokens=15,  ----  {'rouge1': 0.05206475367626734, 'rouge2': 0.019660981842852962, 'rougeL': 0.047334418031740016, 'rougeLsum': 0.04955124524476833}
# 7mins   max_steps = 40, seed=5000    testdataset[:1200]["input"],max_new_tokens=15,  ----  {'rouge1': 0.05378414875142114,  'rouge2': 0.020305912892399518, 'rougeL': 0.04887783094017359,  'rougeLsum': 0.051091497309520784}
# 14mins   max_steps = 40, seed=5000   testdataset[:2000]["input"],max_new_tokens=10,  ----  {'rouge1': 0.052309203096362944, 'rouge2': 0.019196742044162142, 'rougeL': 0.04750068460138787, 'rougeLsum': 0.04933446927139551}

# max_steps = 50    testdataset[:100]["input"],max_new_tokens=10,  ----  {'rouge1': 0.05097113185293092, 'rouge2': 0.01915434455190274, 'rougeL': 0.046507767697487296, 'rougeLsum': 0.0481098915021589}


{'rouge1': 0.052309203096362944, 'rouge2': 0.019196742044162142, 'rougeL': 0.04750068460138787, 'rougeLsum': 0.04933446927139551}


You can also use Hugging Face's `AutoPeftModelForCausalLM`. Only use this if you do not have `unsloth` installed. It can be hopelessly slow, since `4bit` model downloading is not supported, and Unsloth's **inference is 2x faster**.

In [None]:
if False:
    # Fallback loader for environments without Unsloth. Not recommended when
    # Unsloth is available; flip the condition to True to use it.
    from peft import AutoPeftModelForCausalLM
    from transformers import AutoTokenizer
    # Folder written by `save_pretrained` earlier in this notebook. The template's
    # "lora_model" placeholder does not exist here.
    lora_dir = "EmailSubGen_Gemma2b_lora_model"
    model = AutoPeftModelForCausalLM.from_pretrained(
        lora_dir,
        load_in_4bit = load_in_4bit,
    )
    tokenizer = AutoTokenizer.from_pretrained(lora_dir)

### GGUF / llama.cpp Conversion
* `q4_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.


In [None]:
# Export the model and tokenizer to GGUF using q4_k_m quantization.
# "model" is presumably the output directory (a later cell copies a file from
# /content/model) — confirm against the Unsloth documentation.
model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")

Now, use the `model-unsloth.gguf` file or `model-unsloth-Q4_K_M.gguf` file in `llama.cpp` or a UI based system like `GPT4All`. You can install GPT4All by going [here](https://gpt4all.io/index.html).

In [None]:
# Mount Google Drive and copy the exported GGUF file into it.
# NOTE(review): the markdown above names the export `model-unsloth-Q4_K_M.gguf`, while
# this command copies `unsloth.Q4_K_M.gguf` — confirm the actual file name in /content/model.

from google.colab import drive
drive.mount('/content/drive')

!cp /content/model/unsloth.Q4_K_M.gguf /content/drive/MyDrive

# Gradio App

In [None]:
%%capture
# Install Gradio for the demo app; the output of this cell is hidden by %%capture.
!pip install gradio

In [None]:
import gradio as gr

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import nltk
nltk.download('punkt')

from functools import lru_cache


@lru_cache(maxsize=None)
def _load_seq2seq(model_name):
  """Load the tokenizer and seq2seq model for `model_name`, once per name.

  The pair is memoized so that repeated requests for the same model do not load
  it again on every call.
  """
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
  return tokenizer, model


def generate_subject(model_name,email_body):
  """Generate an email subject for `email_body` with the selected model.

  Args:
    model_name: model id or local path loadable by `AutoModelForSeq2SeqLM`.
    email_body: text of the email to summarize.

  Returns:
    The first sentence of the decoded output, or "" when the model produced no text.

  NOTE(review): the app's dropdown also offers "EmailSubGen_Gemma2b_lora_model",
  which is a causal-LM LoRA adapter; loading it through `AutoModelForSeq2SeqLM`
  presumably fails — confirm, and route it through a causal-LM loader if it is
  meant to be selectable.
  """
  tokenizer, model = _load_seq2seq(model_name)
  inputs = ["provide email subject: " + email_body]
  inputs = tokenizer(inputs, max_length=512, truncation=True, return_tensors="pt")
  output = model.generate(**inputs, num_beams=8, do_sample=True, min_length=1, max_length=10)
  decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
  # sent_tokenize returns an empty list for empty text; indexing [0] on it raised
  # an IndexError in the app.
  sentences = nltk.sent_tokenize(decoded_output.strip())
  return sentences[0] if sentences else ""

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Models offered in the demo. CHECK - more options to be added to Gradio.
model_choices = [
    "EmailSubGen_Gemma2b_lora_model",
    "anukvma/t5-base-medium-email-subject-generation-v2",
    "anukvma/bart-base-medium-email-subject-generation-v5",
]

# Input widgets are passed to `generate_subject` in this order: model name, email body.
model_selector = gr.Dropdown(choices=model_choices, label="Select Model")
email_body_box = gr.Textbox(lines=5, label="Email Body")
subject_box = gr.Textbox(label="Email Subject")

iface = gr.Interface(
    fn=generate_subject,
    inputs=[model_selector, email_body_box],
    outputs=subject_box,
)

iface.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://46ee291de79d4d049b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


