In [None]:
# Install dataset library
!pip install datasets

Collecting datasets
  Downloading datasets-3.3.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.0-py3-none-any.whl (484 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.9/484.9 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [None]:
# Importing necessary libraries
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model
import torch

In [None]:
"""
Fine-Tuning DistilGPT-2 with LoRA on the PubMedQA Dataset

This script fine-tunes the DistilGPT-2 model using LoRA (Low-Rank Adaptation)
for medical question-answering tasks using the PubMedQA dataset.

Steps:
1. Load and preprocess the dataset
2. Initialize the tokenizer and set padding token
3. Define a tokenization function
4. Load the pretrained model and apply LoRA configuration
5. Define training arguments
6. Train the model using Hugging Face's Trainer API
7. Save the fine-tuned model for later use
8. Load Base and Fine-Tuned Models for Evaluation


"""

"\nFine-Tuning DistilGPT-2 with LoRA on the PubMedQA Dataset\n\nThis script fine-tunes the DistilGPT-2 model using LoRA (Low-Rank Adaptation)\nfor medical question-answering tasks using the PubMedQA dataset.\n\nSteps:\n1. Load and preprocess the dataset\n2. Initialize the tokenizer and set padding token\n3. Define a tokenization function\n4. Load the pretrained model and apply LoRA configuration\n5. Define training arguments\n6. Train the model using Hugging Face's Trainer API\n7. Save the fine-tuned model for later use\n\n"

In [None]:
# Step 1: Load the labeled PubMedQA configuration from the Hugging Face Hub
dataset = load_dataset(
    path="pubmed_qa",
    name="pqa_labeled",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/5.19k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# Show the first training record to inspect the available fields
train_split = dataset["train"]
sample = train_split[0]
print(sample)

{'pubid': 21645374, 'question': 'Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?', 'context': {'contexts': ['Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cells at the center of these areoles and progresses outwards, stopping approximately five cells from the vasculature. The role of mitochondria during PCD has been recognized in animals; however, it has been less studied during PCD in plants.', 'The following paper elucidates the role of mitochondrial dynamics during developmentally regulated PCD in vivo in A. madagascariensis. A single areole within a window stage leaf (PCD is occurring) was divided into three areas based on the progression of PCD; cells that will not undergo PCD (NPCD), cells i

In [None]:
# Split dataset into training (80%) and test (20%) sets.
# A fixed seed keeps the split -- and every metric computed on it -- reproducible
# between runs.
SPLIT_SEED = 42

# Guard against re-running this cell: once `dataset` already holds a "test"
# split, splitting again would carve up the already reduced training portion.
if "test" not in dataset:
    dataset = dataset["train"].train_test_split(test_size=0.2, seed=SPLIT_SEED)

train_dataset = dataset["train"]
test_dataset = dataset["test"]

In [None]:
# Step 2: Load tokenizer and set padding token
BASE_CHECKPOINT = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(BASE_CHECKPOINT)

# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Step 3: Tokenization helper applied to every batch of examples
def tokenize_function(examples):
    """
    Encode each question together with its long answer as a text pair.

    Sequences longer than 512 tokens are truncated and shorter ones are
    padded, so every encoded example has the same fixed length of 512.
    """
    questions = examples["question"]
    encoding_options = {
        "text_pair": examples["long_answer"],
        "truncation": True,
        "padding": "max_length",
        "max_length": 512,
    }
    return tokenizer(questions, **encoding_options)

# Tokenize both splits in batches (training split first, then test split)
train_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [None]:
# Collator that batches the tokenized examples and handles the labels.
# mlm=False selects causal (next-token) language modeling instead of masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# Step 4: Load Pretrained Model and Apply LoRA
lora_config = LoraConfig(
    r=8,                    # Rank parameter
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",  # Task type for causal language modeling
)

# Wrap the pretrained DistilGPT-2 with the LoRA modifications
model = get_peft_model(
    AutoModelForCausalLM.from_pretrained("distilgpt2"),
    lora_config,
)


model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



In [None]:
# Step 5: Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results",          # Directory to save model checkpoints
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated in recent transformers releases.
    eval_strategy="epoch",           # Evaluate model at the end of each epoch
    learning_rate=5e-5,              # Learning rate for optimization
    per_device_train_batch_size=1,   # Training batch size per device
    per_device_eval_batch_size=1,    # Evaluation batch size per device
    num_train_epochs=3,              # Number of training epochs
    weight_decay=0.01,               # Weight decay for regularization
    logging_dir="./logs",            # Directory for logging training metrics
    save_strategy="epoch",           # Save model at the end of each epoch
    report_to="none",                # Do not report to external experiment trackers
)



In [None]:
# Build the Trainer that will fine-tune the LoRA-wrapped model
# (training itself is started in the next cell).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # `processing_class` replaces the deprecated `tokenizer` argument of Trainer;
    # passing the tokenizer here keeps it saved alongside the checkpoints.
    processing_class=tokenizer,
    data_collator=data_collator,  # Include data collator to handle labels
)


  trainer = Trainer(


In [None]:
# Step 6: Train the model
trainer.train()
# In the recorded run the validation loss decreases every epoch
# (3.850 -> 3.802 -> 3.792), while the training loss drops after the first epoch
# (4.133 -> 3.912) and then plateaus (3.924 at epoch 3). The model is learning,
# but the gains over three epochs are modest.

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,4.1329,3.849595
2,3.9117,3.801927
3,3.9235,3.792291


TrainOutput(global_step=2400, training_loss=3.974607645670573, metrics={'train_runtime': 190.052, 'train_samples_per_second': 12.628, 'train_steps_per_second': 12.628, 'total_flos': 314643264307200.0, 'train_loss': 3.974607645670573, 'epoch': 3.0})

In [None]:
# Step 7: Save the fine-tuned model and its tokenizer into the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained("./fine_tuned_model")

print("Fine-tuning complete!")


Fine-tuning complete!


In [None]:
# Step 8: Load Base and Fine-Tuned Models for Evaluation
# Fine-tuned artifacts come from the directory written in Step 7 ...
fine_tuned_tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_model")
fine_tuned_model = AutoModelForCausalLM.from_pretrained("./fine_tuned_model")

# ... while the baseline is the untouched pretrained checkpoint.
base_model = AutoModelForCausalLM.from_pretrained("distilgpt2")


In [None]:
import numpy as np

# Performing Perplexity Evaluation
def compute_perplexity(model, tokenizer, dataset, num_samples=100):
    """
    Estimate a model's perplexity on the questions of a dataset.

    Only the "question" field of each row is scored: it is tokenized and fed
    to the model with the input ids as labels, and the model's loss is
    recorded. The result is exp(mean of the per-question losses).

    Args:
        model: Causal language model. It is called as
            `model(**inputs, labels=...)` and must return an object with a
            scalar `.loss` tensor.
        tokenizer: Callable tokenizer returning PyTorch tensors.
        dataset: Dataset supporting `len()` and `.select(indices)` whose rows
            contain a "question" string.
        num_samples: Maximum number of rows to evaluate (default 100, kept
            small for efficiency). Clamped to the size of the dataset.

    Returns:
        The perplexity as a NumPy float.

    Raises:
        ValueError: If there is no row to evaluate.
    """
    # Clamp so datasets smaller than `num_samples` do not fail in `.select`.
    sample_count = min(num_samples, len(dataset))
    if sample_count <= 0:
        raise ValueError("compute_perplexity needs at least one sample to evaluate")

    model.eval()
    # Feed inputs on the model's own device (if it exposes one), so a model
    # living on an accelerator can be evaluated as well.
    device = getattr(model, "device", None)

    losses = []
    with torch.no_grad():
        for sample in dataset.select(range(sample_count)):
            inputs = tokenizer(sample["question"], return_tensors="pt", truncation=True, padding=True, max_length=512)
            if device is not None:
                inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
            outputs = model(**inputs, labels=inputs["input_ids"])
            losses.append(outputs.loss.item())
    return np.exp(np.mean(losses))  # Perplexity computation



In [None]:
# Score the baseline and the fine-tuned model on the same test split
perplexity_base, perplexity_fine_tuned = (
    compute_perplexity(candidate, candidate_tokenizer, test_dataset)
    for candidate, candidate_tokenizer in (
        (base_model, tokenizer),
        (fine_tuned_model, fine_tuned_tokenizer),
    )
)

# Report both values, one per line
print(
    f"Perplexity (Base Model): {perplexity_base}",
    f"Perplexity (Fine-Tuned Model): {perplexity_fine_tuned}",
    sep="\n",
)

Perplexity (Base Model): 289.5865947225306
Perplexity (Fine-Tuned Model): 125.887103806811


In [None]:
#Base Model Perplexity: 289.59 (Higher = Less Fluent)
#Fine-Tuned Model Perplexity: 125.89 (Lower = More Fluent)
#The perplexity scores show a significant improvement after fine-tuning.
#The fine-tuned model has learned from the medical dataset (PubMedQA) and is now better at predicting the next words in a sequence.
#Lower perplexity suggests that the model generates more coherent, fluent, and relevant responses.

In [None]:
# Build the prompt for a medical question and turn it into input ids
input_text = "Question: What are the symptoms of diabetes? Answer:"
input_ids = fine_tuned_tokenizer(input_text, return_tensors="pt").input_ids


In [None]:
# Generate the answer
output = fine_tuned_model.generate(
    input_ids,
    # The prompt is a single, unpadded sequence, so every position is a real
    # token; passing the all-ones mask explicitly avoids the
    # "attention mask ... not set" warning.
    attention_mask=torch.ones_like(input_ids),
    max_length=100,
    num_return_sequences=1,
    repetition_penalty=1.5,
    # Same id generate() falls back to on its own, stated explicitly so no
    # warning is emitted.
    pad_token_id=fine_tuned_tokenizer.eos_token_id,
)
generated_text = fine_tuned_tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated Answer:", generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Answer: Question: What are the symptoms of diabetes? Answer: Diabetes is a chronic disease that affects both body mass index and blood pressure. The prevalence rate for this condition varies widely among people with type 2 diabetic mellitus, but it has been shown to be associated with increased risk of developing Type 1 diabetics in general (1-2). This association was confirmed by an analysis conducted on 631 patients who were enrolled at University Hospital London between January 2011–December 2012; however, there have not yet


In [None]:
pip install evaluate datasets



In [None]:
pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=dd493b3e70f753f133b9383bf26842d69c7761d5a280d28d09727069ee7800c9
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
# Perform BLEU and ROUGE Evaluation
from evaluate import load

# Load Fine Tuned Model for Evaluation
fine_tuned_tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_model")
fine_tuned_model = AutoModelForCausalLM.from_pretrained("./fine_tuned_model")

# Metric objects used by evaluate_bleu_rouge
bleu, rouge = load("bleu"), load("rouge")

def evaluate_bleu_rouge(model, tokenizer, dataset, num_samples=100):
    """
    Computes BLEU and ROUGE scores for a given model on the test dataset.

    For each of the first `num_samples` rows the "question" is used as the
    prompt, the model generates a continuation, and that continuation alone
    (the prompt tokens are stripped) is compared with the row's "long_answer".

    Args:
        model: Causal language model exposing `.generate(...)`.
        tokenizer: Tokenizer used both to encode the prompt and to decode the
            generated ids; its `pad_token_id` is forwarded to `generate`.
        dataset: Dataset supporting `len()` and `.select(indices)` whose rows
            contain "question" and "long_answer" strings.
        num_samples: Maximum number of rows to evaluate. Clamped to the size
            of the dataset.

    Returns:
        Tuple `(bleu_score, rouge_score)` with the result dictionaries of the
        module-level `bleu` and `rouge` metrics.

    Raises:
        ValueError: If there is no row to evaluate.
    """
    # Clamp so datasets smaller than `num_samples` do not fail in `.select`.
    sample_count = min(num_samples, len(dataset))
    if sample_count <= 0:
        raise ValueError("evaluate_bleu_rouge needs at least one sample to evaluate")

    # Feed inputs on the model's own device when it exposes one.
    device = getattr(model, "device", None)

    predictions, references = [], []
    for sample in dataset.select(range(sample_count)):
        encoded = tokenizer(sample["question"], return_tensors="pt", truncation=True, padding=True, max_length=512)
        input_ids = encoded.input_ids
        attention_mask = encoded.attention_mask
        if device is not None:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

        output = model.generate(
            input_ids,
            attention_mask=attention_mask,  # explicit mask: avoids the generate() warning
            max_length=100,
            num_return_sequences=1,
            pad_token_id=tokenizer.pad_token_id,
        )

        # generate() returns prompt + continuation. Score only the continuation,
        # otherwise the metrics largely measure question/answer overlap.
        prompt_length = input_ids.shape[1]
        generated_text = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
        predictions.append(generated_text)
        references.append([sample["long_answer"]])

    try:
        bleu_score = bleu.compute(predictions=predictions, references=references)
    except ZeroDivisionError:
        # NOTE(review): BLEU's brevity penalty appears to divide by the total
        # prediction length, so a run where every continuation is empty can
        # raise here -- report a zero score instead of aborting the evaluation.
        bleu_score = {"bleu": 0.0}
    rouge_score = rouge.compute(predictions=predictions, references=references)

    return bleu_score, rouge_score


In [None]:
# Score the baseline first, then the fine-tuned model, on the same test split
bleu_score_before, rouge_score_before = evaluate_bleu_rouge(base_model, tokenizer, test_dataset)
bleu_score_after, rouge_score_after = evaluate_bleu_rouge(fine_tuned_model, fine_tuned_tokenizer, test_dataset)

# Print the four result dictionaries, one per line
for report_line in (
    f"BLEU Score Before Fine-Tuning: {bleu_score_before}",
    f"ROUGE Score Before Fine-Tuning: {rouge_score_before}",
    f"BLEU Score After Fine-Tuning: {bleu_score_after}",
    f"ROUGE Score After Fine-Tuning: {rouge_score_after}",
):
    print(report_line)

BLEU Score Before Fine-Tuning: {'bleu': 0.01675170881365391, 'precisions': [0.39445628997867804, 0.162203519510329, 0.07207953603976802, 0.03974706413730804], 'brevity_penalty': 0.14396769983030616, 'length_ratio': 0.34034833091436867, 'translation_length': 1407, 'reference_length': 4134}
ROUGE Score Before Fine-Tuning: {'rouge1': 0.2491973702374648, 'rouge2': 0.10075880993091005, 'rougeL': 0.20436195006178232, 'rougeLsum': 0.2049282447246988}
BLEU Score After Fine-Tuning: {'bleu': 0.026376196634327893, 'precisions': [0.14163090128755365, 0.03699565487274985, 0.013953488372093023, 0.006619987269255252], 'brevity_penalty': 1.0, 'length_ratio': 1.9726656990807934, 'translation_length': 8155, 'reference_length': 4134}
ROUGE Score After Fine-Tuning: {'rouge1': 0.18404297302195577, 'rouge2': 0.05361513451199873, 'rougeL': 0.15112670371355105, 'rougeLsum': 0.15102511729918638}


In [None]:
#Analysis of results before and after fine-tuning:
#Perplexity Before: 289.5865947225306 and After: 125.887103806811
#BLEU Score Before: 0.0168 and After: 0.0264
#ROUGE-1 Before: 0.249 and After: 0.184
#ROUGE-2 Before: 0.101 and After: 0.054
#Perplexity and BLEU improved after fine-tuning, while ROUGE-1 and ROUGE-2 decreased.