In [1]:
!pip install -q -U pyarrow==14.0.1
!pip install -q -U fsspec==2023.10.0
!pip install -q -U bitsandbytes==0.42.0
!pip install -q -U peft==0.8.2
!pip install -q -U trl==0.7.10
!pip install -q -U accelerate==0.27.1
!pip install -q -U datasets==2.17.0
!pip install -q -U transformers==4.38.0
!pip install python-dotenv

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.0/38.0 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m166.4/166.4 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.6.1 requires fsspec==2024.6.1, but you have fsspec 2023.10.0 which is incompatible.[0m[31m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.4/183.4 kB[0m [31m502.4 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.9/150.9 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.2/109.2 kB[0m [31m9.0 MB/s[0m eta [36m0

In [2]:
from huggingface_hub import login

login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# # If HF token is stored in .env:
# import os
# os.environ["HF_TOKEN"] = os.getenv('HF_TOKEN')

### Tech Stack Used:
* Transformers
* Peft
* BitsAndBytes
* Accelerate
* TRL

### Basic steps Involved in fine-tuning:
1. Load the base model
2. Train the base model
3. Save the LoRA adapter
4. Reload the base model at half/full precision
5. Merge the LoRA weights with the base model

## Load Model

In [3]:
import accelerate
import bitsandbytes as bnb

print(f"Accelerate version: {accelerate.__version__}")
print(f"BitsAndBytes version: {bnb.__version__}")

Accelerate version: 0.27.1
BitsAndBytes version: 0.42.0


In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Set the qunatization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

#Load the model and Tokenizer
model_id = "google/gemma-2b-it"

model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})
tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

## Dataset Preparation

### Loading the Dataset

I used this [Amod/mental_health_counseling_conversations](https://huggingface.co/datasets/Amod/mental_health_counseling_conversations) dataset from HF.

In [5]:
from datasets import load_dataset

dataset = load_dataset("Amod/mental_health_counseling_conversations")
dataset

Downloading readme:   0%|          | 0.00/2.82k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.79M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Context', 'Response'],
        num_rows: 3512
    })
})

In [6]:
# Convert HF dataset to pandas Dataframe

df = dataset["train"].to_pandas()
df.sample(10)

Unnamed: 0,Context,Response
252,"Two years ago, I was separated from the milita...",It sounds like being separated from the armed ...
2018,What are the basic skills a good counselor nee...,"To be non-judgemental, know who they themselve..."
2327,My wife is trying to leave. She agreed to come...,I'm sorry that you lost your first born child....
2796,I have four children. One of them is in her 20...,Of what do you feel you'd be guilty?Family vac...
643,She cried because she lost all trust in me. No...,"You felt bad when you lied, so you told the tr..."
712,My daughter didn't see her biological father f...,"Hi Dillon,I'm from Canada, so I don't know the..."
1313,"I'm planning to have baby, so I have to quit s...",Breaking any habit is no easy feat. Cutting ...
1132,He told me he started to back away because I w...,Have you asked your boyfriend directly this qu...
2777,I am not sure if I am depressed. I don't know ...,Depending on your relationship with your paren...
1190,"My boyfriend shows affection, but I just push ...",Are you sure it is only the birth control pill...


In [7]:
# Get the total number of NaN values in the entire DataFrame
total_na = df.isna().sum().sum()

# Print the total number of NaN values
print(f"Total number of NaN values: {total_na}")

# Optional: Show NaN values per column
na_per_column = df.isna().sum()
print("\nNaN values per column:")
print(na_per_column)

Total number of NaN values: 0

NaN values per column:
Context     0
Response    0
dtype: int64


In [8]:
def generate_prompt(data_point):
    prefix_text = 'Pretend you are a therapist. Write a response which matches the context below and is befetting of a therapist please.\n\n'
    text = f"<start_of_turn>User: {prefix_text} {data_point['Context']} <end_of_turn>\n\n<start_of_turn>Model: {data_point['Response']} <end_of_turn>"

    return text


# Add the "prompt" column in the dataset
text_column = [generate_prompt(data_point) for data_point in dataset["train"]]
dataset = dataset["train"].add_column("prompt", text_column)
dataset

Dataset({
    features: ['Context', 'Response', 'prompt'],
    num_rows: 3512
})

In [9]:
print(dataset[0]['prompt'])

<start_of_turn>User: Pretend you are a therapist. Write a response which matches the context below and is befetting of a therapist please.

 I'm going through some things with my feelings and myself. I barely sleep and I do nothing but think about how I'm worthless and how I shouldn't be here.
   I've never tried or contemplated suicide. I've always wanted to fix my issues, but I never get around to it.
   How can I change my feeling of being worthless to everyone? <end_of_turn>

<start_of_turn>Model: If everyone thinks you're worthless, then maybe you need to find new people to hang out with.Seriously, the social context in which a person lives is a big influence in self-esteem.Otherwise, you can go round and round trying to understand why you're not worthless, then go back to the same crowd and be knocked down again.There are many inspirational messages you can find in social media.  Maybe read some of the ones which state that no person is worthless, and that everyone has a good pur

In [None]:
dataset = dataset.shuffle(seed=1234)  # Shuffle dataset
dataset = dataset.map(lambda samples: tokenizer(samples["prompt"]), batched=True)

Map:   0%|          | 0/3702 [00:00<?, ? examples/s]

Train-Test Split

In [10]:
dataset = dataset.train_test_split(test_size=0.1)
train_data = dataset["train"]
test_data = dataset["test"]

print(train_data)
print(test_data)

Dataset({
    features: ['Context', 'Response', 'prompt'],
    num_rows: 3160
})
Dataset({
    features: ['Context', 'Response', 'prompt'],
    num_rows: 352
})


In [11]:
# Automated selection of target modules
import bitsandbytes as bnb

def find_all_linear_names(model):
  cls = bnb.nn.Linear4bit #if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
  lora_module_names = set()
  for name, module in model.named_modules():
    if isinstance(module, cls):
      names = name.split('.')
      lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names: # needed for 16-bit
      lora_module_names.remove('lm_head')
  return list(lora_module_names)


modules = find_all_linear_names(model)
print(modules)

['gate_proj', 'down_proj', 'v_proj', 'o_proj', 'k_proj', 'up_proj', 'q_proj']


In [12]:
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

print(model)

lora_config = LoraConfig(
    r=64,
    lora_alpha=32,
    target_modules=modules,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM" # Causal Language Modeling (e.g., autoregressive models like GPT)
)

model = get_peft_model(model, lora_config)

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear4bit(in_features=16384, out_features=2048, bias=False)
          (act_fn): GELUActivation()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()
     

In [13]:
# Number of trainable parameters
trainable, total = model.get_nb_trainable_parameters()
print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable/total*100:.4f}%")

Trainable: 78446592 | total: 2584619008 | Percentage: 3.0351%


## Train Model

### Push the trained model to HF

In [14]:
import transformers
from trl import SFTTrainer

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side='right'
torch.cuda.empty_cache()

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=test_data,
    dataset_text_field="prompt",
    peft_config=lora_config,
    max_seq_length=2500,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        num_train_epochs=2,
        warmup_steps=0.03,
        max_steps=100,
        learning_rate=2e-4,
        logging_steps=20,
        output_dir="outputs",
        optim="paged_adamw_8bit",
        save_strategy="epoch",
        report_to="none"
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

model.config.use_cache = False
trainer.train()



Map:   0%|          | 0/3160 [00:00<?, ? examples/s]

Map:   0%|          | 0/352 [00:00<?, ? examples/s]

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
20,3.0146
40,2.3357
60,2.3037
80,2.1913
100,2.2588


TrainOutput(global_step=100, training_loss=2.4208424377441404, metrics={'train_runtime': 747.8923, 'train_samples_per_second': 0.535, 'train_steps_per_second': 0.134, 'total_flos': 1532589582286848.0, 'train_loss': 2.4208424377441404, 'epoch': 0.13})

In [15]:
new_model = "gemma-2b-instruct-ft-My-TherAIpist"

trainer.model.save_pretrained(new_model)

base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 0},
)
merged_model= PeftModel.from_pretrained(base_model, new_model)
merged_model= merged_model.merge_and_unload()

# Save the merged model locally
# save_adapter=True, save_config=True
merged_model.save_pretrained("merged_model",safe_serialization=True)
tokenizer.save_pretrained("merged_model")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Push the model and tokenizer to the Hugging Face Model Hub
merged_model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/asocastro/gemma-2b-instruct-ft-My-TherAIpist/commit/6987fcde07f4b69bf6d6c9ca3e7840c758a4fd65', commit_message='Upload tokenizer', commit_description='', oid='6987fcde07f4b69bf6d6c9ca3e7840c758a4fd65', pr_url=None, pr_revision=None, pr_num=None)

## Evaluation Metrics

In [17]:
# Load/define base (non-finetuned) vs finetuned models for comparison

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from math import exp
import bitsandbytes as bnb  # Ensure bitsandbytes is installed if using quantization
from peft import PeftModel  # If using PEFT models

# Define your Hugging Face username and model IDs
username = "asocastro"
finetuned_model_id = f"{username}/gemma-2b-instruct-ft-My-TherAIpist"
base_model_id = "google/gemma-2b-it"

# Set the qunatization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model_id = "google/gemma-2b-it"

base_model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})
base_tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)

# # Load the Finetuned Model and Tokenizer from local directory
# finetuned_model_path = "merged_model"  # Path where you saved the merged model
# finetuned_model = AutoModelForCausalLM.from_pretrained(
#     finetuned_model_path,
#     torch_dtype=torch.float16,
#     device_map="auto"
# )
# finetuned_tokenizer = AutoTokenizer.from_pretrained(finetuned_model_path)

# Load the Finetuned Model and Tokenizer from HF
finetuned_tokenizer = AutoTokenizer.from_pretrained(finetuned_model_id)
finetuned_model = AutoModelForCausalLM.from_pretrained(
                  model_id,
                  torch_dtype=torch.float16,
                  device_map="auto"
)

# Ensure the model is in evaluation mode
base_model.eval()
finetuned_model.eval()



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/40.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/522 [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear(in_features=16384, out_features=2048, bias=False)
          (act_fn): GELUActivation()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()
      )
    )
    (norm): GemmaRM

### Perplexity

Perplexity is a measurement derived from the probability distribution that a language model assigns to a sequence of words (i.e., a sentence or a corpus). Specifically, it quantifies how well a probability model predicts a sample. In the realm of language modeling, perplexity serves as an indicator of how "confused" the model is when predicting the next word in a sequence.
* Lower Perplexity: Indicates that the model is better at predicting the sample. The model is less "perplexed" by the data.
* Higher Perplexity: Suggests that the model is less certain about its predictions, implying poorer performance.

In [18]:
test_data

Dataset({
    features: ['Context', 'Response', 'prompt'],
    num_rows: 352
})

In [19]:
test_data_subset = test_data.shuffle(seed=42).select(range(10))  # Get 10 random rows

In [33]:
print(test_data_subset[:5])  # Print the first 5 elements to check the structure


{'Context': ["I hate everything I see in the mirror. I don't like being in pictures and always scribble out my face. It's stressing me out. I don't trust my parents enough to tell them and I don't know what to do.", "I am pretty sure I have depression and anxiety. I also have voices in my head. I have problems sleeping too. I've already been diagnosed with attention-deficit disorder and obsessive-compulsive disorder. I have self-harmed in the last and used to be suicidal. How do I tell them this and ask for therapy?", 'We have been together over a year. We spend time together every day no matter how busy. He started to be unusually fatigued and losing weight. He also began to be distant and sexually selfish. We had a argument, and he confided he has late stage stomach cancer. He wont treat it.', "I was raped a couple months ago, Since then, along with other unfortunately events that have occurred, I have been having trouble feeling emotions. It's almost as if I'm a sociopath lacking an

In [34]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from math import exp

def calculate_perplexity(model, tokenizer, dataset, batch_size=8):
    model.eval()  # Set model to evaluation mode
    total_loss = 0
    total_tokens = 0

    # Process dataset in batches
    with torch.no_grad():
        for i in range(0, len(dataset['prompt']), batch_size):
            batch = dataset['prompt'][i:i + batch_size]

            # Tokenize the batch of prompts (strings)
            inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)

            # Automatically handle device placement for inputs
            inputs = {key: value.to(model.device) for key, value in inputs.items()}

            # Labels should also be moved to the same device as inputs
            labels = inputs["input_ids"].clone().to(model.device)

            # Compute model outputs and loss
            outputs = model(**inputs, labels=labels)
            loss = outputs.loss

            # Update total loss and tokens processed
            total_loss += loss.item() * inputs["input_ids"].size(1)
            total_tokens += inputs["input_ids"].size(1)

    # Calculate perplexity
    perplexity = exp(total_loss / total_tokens)
    return perplexity

# Assuming `test_data_subset` contains fields like 'Context', 'Response', and 'prompt'
base_perplexity = calculate_perplexity(base_model, base_tokenizer, test_data_subset)
finetuned_perplexity = calculate_perplexity(finetuned_model, finetuned_tokenizer, test_data_subset)

print(f"Non-finetuned Perplexity: {base_perplexity}")
print(f"Finetuned Perplexity: {finetuned_perplexity}\n")


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU 0 has a total capacity of 14.75 GiB of which 2.96 GiB is free. Process 2309 has 11.70 GiB memory in use. Of the allocated memory 11.39 GiB is allocated by PyTorch, and 176.33 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

### Semantic Similarity
Semantic Similarity evaluates how close the model's generated responses are to the expected answers in terms of meaning.

Implementation Steps:
* Choose a Semantic Similarity Metric: Common choices include Cosine Similarity, BERTScore, or Sentence Transformers embeddings.
* Compute Similarity Scores: Compare the generated responses with the ground truth.

In [35]:
import torch
from torch.nn.functional import cosine_similarity

def calculate_semantic_similarity(model, tokenizer, dataset):
    similarities = []
    model.eval()

    with torch.no_grad():
        for example in dataset:
            # Encode the input prompt
            input_ids = tokenizer.encode(example["prompt"], return_tensors="pt").to("cuda")
            # Generate output from the model
            output_ids = model.generate(input_ids, max_length=2500)
            generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

            # Tokenize generated and reference texts
            gen_inputs = tokenizer(generated_text, return_tensors='pt').to("cuda")
            ref_inputs = tokenizer(example["output"], return_tensors='pt').to("cuda")

            # Get embeddings from the last hidden state
            gen_outputs = model(**gen_inputs, output_hidden_states=True, return_dict=True)
            ref_outputs = model(**ref_inputs, output_hidden_states=True, return_dict=True)

            # Average pooling of the embeddings
            gen_embedding = gen_outputs.hidden_states[-1].mean(dim=1).squeeze()
            ref_embedding = ref_outputs.hidden_states[-1].mean(dim=1).squeeze()

            # Compute cosine similarity
            cosine_score = cosine_similarity(gen_embedding, ref_embedding, dim=0).item()
            similarities.append(cosine_score)

    average_similarity = sum(similarities) / len(similarities)
    return average_similarity

base_semantic_similarity = calculate_semantic_similarity(base_model, base_tokenizer, test_data_subset)
finetuned_semantic_similarity = calculate_semantic_similarity(finetuned_model, finetuned_tokenizer, test_data_subset)
print(f"Non-finetuned Semantic Similarity: {base_semantic_similarity:.4f}")
print(f"Finetuned Semantic Similarity: {finetuned_semantic_similarity:.4f}\n")

KeyError: 'output'

### BLEU (Bilingual Evaluation Understudy)

BLEU measures the n-gram overlap between the generated text and reference text. It's widely used in machine translation but can be applied to other text generation tasks.

In [None]:
from datasets import load_metric

bleu = load_metric("bleu")

def calculate_bleu(model, tokenizer, dataset):
    references = []
    predictions = []
    model.eval()

    with torch.no_grad():
        for example in dataset:
            input_ids = tokenizer.encode(example["prompt"], return_tensors="pt").to("cuda")
            output_ids = model.generate(input_ids, max_length=2500)
            generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

            references.append([example["output"].split()])
            predictions.append(generated_text.split())

    bleu_score = bleu.compute(predictions=predictions, references=references)
    return bleu_score

base_bleu = calculate_bleu(base_model, base_tokenizer, test_data_subset)
finetuned_bleu = calculate_bleu(finetuned_model, finetuned_tokenizer, test_data_subset)
print(f"Non-finetuned BLEU Score: {base_bleu['bleu']:.4f}")
print(f"Finetuned BLEU Score: {finetuned_bleu['bleu']:.4f}\n")

  bleu = load_metric("bleu")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Non-finetuned BLEU Score: 0.4129
Finetuned BLEU Score: 0.4138



### ROUGE (Recall-Oriented Understudy for Gisting Evaluation)

ROUGE focuses on the overlap of n-grams, word sequences, and word pairs between the generated and reference texts, emphasizing recall.

In [None]:
from datasets import load_metric

rouge = load_metric("rouge")

def compute_rouge(model, tokenizer, dataset):
    references = []
    predictions = []
    model.eval()

    with torch.no_grad():
        for example in dataset:
            input_ids = tokenizer.encode(example["prompt"], return_tensors="pt").to("cuda")
            output_ids = model.generate(input_ids, max_length=2500)
            generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

            references.append(example["output"])
            predictions.append(generated_text)

    rouge_score = rouge.compute(predictions=predictions, references=references)
    return rouge_score

base_rouge = compute_rouge(base_model, base_tokenizer, test_data)
finetuned_rouge = compute_rouge(finetuned_model, finetuned_tokenizer, test_data)
print(f"Non-finetuned ROUGE Scores: {base_rouge}")
print(f"Finetuned ROUGE Scores: {finetuned_rouge}")

### METEOR (Metric for Evaluation of Translation with Explicit ORdering)

METEOR evaluates translation by considering synonymy and stemming, making it more flexible than BLEU.

In [None]:
from datasets import load_metric

meteor = load_metric("meteor")

def compute_meteor(model, tokenizer, dataset):
    references = []
    predictions = []
    model.eval()

    with torch.no_grad():
        for example in dataset:
            input_ids = tokenizer.encode(example["prompt"], return_tensors="pt").to("cuda")
            output_ids = model.generate(input_ids, max_length=2500)
            generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

            references.append(example["output"])
            predictions.append(generated_text)

    meteor_score = meteor.compute(predictions=predictions, references=references)
    return meteor_score

base_meteor = compute_meteor(base_model, base_tokenizer, test_data)
finetuned_meteor = compute_meteor(finetuned_model, finetuned_tokenizer, test_data)
print(f"METEOR Score: {base_meteor}")
print(f"Finetuned METEOR Score: {finetuned_meteor}")