In [1]:
# transfomers library from HuggingFace to load models
# datasets library from HuggingFace for the SQuAD Dataset
# evaluate library for loading rouge and bleu metrics
# peft library to load the PEFT adapter
# rouge_score library for loading rouge metrics
# rouge = Recall oriented understudy for Gisting Evaluation
# bleu = Bilingual evaluation understudy
!pip install transformers datasets evaluate peft rouge_score bleu

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bleu
  Downloading bleu-0.3.tar.gz (5.2 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting efficiency (from bleu)
  Downloading efficiency-2.0-py3-none-any.whl.met

In [2]:
# random library generates a random number to load random 10 samples for comparison
import random
import pandas as pd
from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration
from peft import PeftModel
from evaluate import load

In [3]:
# loading just the validation part of the dataset on which the models weren't trained
squad = load_dataset("squad")["validation"]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [4]:
# randomly sampling 10 questions
random_questions = random.sample(list(squad), 10)

In [5]:
# Loading the google flan-t5-base model
# FLAN = Finetuned Language Network
# T5 = Text to Text Transfer Transformer
model_name = "google/flan-t5-base"

In [6]:
# Loading the model tokenizer
tokenizer = T5Tokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [7]:
# Loading the base model from HuggingFace.
# This model will be used for the baseline evaluation as well as used with LoRA Adapter
base_model = T5ForConditionalGeneration.from_pretrained(model_name)

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [8]:
# Loading the Instruction fine tuned version of FLAN T5
finetuned_model_path = "/content/drive/MyDrive/T5-FineTuned/T5/fine_tuned_t5"
finetuned_model = T5ForConditionalGeneration.from_pretrained(finetuned_model_path)

In [9]:
# Loading the PEFT Adapter
adapter_model_path = "/content/drive/MyDrive/T5-PEFT/T5/PEFT_FLANT5_SQUAD"
peft_model = PeftModel.from_pretrained(base_model, adapter_model_path)



In [10]:
# Loading Rouge and BLEU metric.
# Lambdas for calculating EM - Exact Match and F1 scores
rouge = load("rouge")
bleu = load("bleu")
exact_match = lambda ref, pred: int(ref.strip() == pred.strip())
f1_score = lambda ref, pred: 2 * (rouge.compute(predictions=[pred], references=[ref])['rougeL'] /
                                   (1 + rouge.compute(predictions=[pred], references=[ref])['rougeL']))

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [11]:
# Generates the output
# model = Version of FLANT5
# tokenize = T5Tokenizer
# question = Question to be asked
# context = Context to answer the question from
# The model was finetuned to answer questions based on some context.
def generate_predictions(model, tokenizer, question, context):
    input_text = f"question: {question} context: {context}"
    # input_text = Question asked by user
    # return_tensor = Returning Pytorch Tensors
    # max_length = deciding the truncation threshold for the input
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)

    # Use `base_model` for the PEFT adapter
    if hasattr(model, "base_model"):
        outputs = model.base_model.generate(
            # num_beams = Number of combinations of inputs tried before picking the
            # one with max cumulative probability.
            inputs["input_ids"], max_length=50, num_beams=4, early_stopping=True
        )
    else:
        outputs = model.generate(
            inputs["input_ids"], max_length=50, num_beams=4, early_stopping=True
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [12]:
results = []
for sample in random_questions:
    question = sample["question"]
    context = sample["context"]
    answer = sample["answers"]["text"][0]

    base_output = generate_predictions(base_model, tokenizer, question, context)
    finetuned_output = generate_predictions(finetuned_model, tokenizer, question, context)
    adapter_output = generate_predictions(peft_model, tokenizer, question, context)

    results.append({
        "Question": question,
        "Context": context,
        "Base Output": base_output,
        "Finetuned Output": finetuned_output,
        "Adapter Output": adapter_output,
        "Answer": answer,
        "Base EM": exact_match(answer, base_output),
        "Base F1": f1_score(answer, base_output),
        "Finetuned EM": exact_match(answer, finetuned_output),
        "Finetuned F1": f1_score(answer, finetuned_output),
        "Adapter EM": exact_match(answer, adapter_output),
        "Adapter F1": f1_score(answer, adapter_output),
    })

# Convert results to a dataframe
df_outputs = pd.DataFrame({
    "Question": [r["Question"] for r in results],
    "Context": [r["Context"] for r in results],
    "Base Output": [r["Base Output"] for r in results],
    "Finetuned Output": [r["Finetuned Output"] for r in results],
    "Adapter Output": [r["Adapter Output"] for r in results],
})

In [13]:
metrics = {
    "Model": ["Base", "Finetuned", "Adapter"],
    "EM": [
        sum(r["Base EM"] for r in results) / len(results),
        sum(r["Finetuned EM"] for r in results) / len(results),
        sum(r["Adapter EM"] for r in results) / len(results),
    ],
    "F1": [
        sum(r["Base F1"] for r in results) / len(results),
        sum(r["Finetuned F1"] for r in results) / len(results),
        sum(r["Adapter F1"] for r in results) / len(results),
    ],
    "ROUGE": [
        rouge.compute(predictions=[r["Base Output"] for r in results], references=[r["Answer"] for r in results]),
        rouge.compute(predictions=[r["Finetuned Output"] for r in results], references=[r["Answer"] for r in results]),
        rouge.compute(predictions=[r["Adapter Output"] for r in results], references=[r["Answer"] for r in results]),
    ],
    "BLEU": [
        bleu.compute(predictions=[r["Base Output"] for r in results], references=[[r["Answer"]] for r in results]),
        bleu.compute(predictions=[r["Finetuned Output"] for r in results], references=[[r["Answer"]] for r in results]),
        bleu.compute(predictions=[r["Adapter Output"] for r in results], references=[[r["Answer"]] for r in results]),
    ],
}

df_metrics = pd.DataFrame(metrics)

In [14]:
# Set the maximum number of rows to display (use None for no limit)
pd.set_option('display.max_rows', None)
# Set the maximum number of columns to display (use None for no limit)
pd.set_option('display.max_columns', None)
# Set the maximum width of each column to display the full content
pd.set_option('display.max_colwidth', None)
# Prevent truncation of numbers in scientific notation
pd.set_option('display.float_format', lambda x: f'{x:.2f}')

df_outputs.head(10)

Unnamed: 0,Question,Context,Base Output,Finetuned Output,Adapter Output
0,What is the main reason consulting pharmacists are increasingly working directly with patients?,"Consultant pharmacy practice focuses more on medication regimen review (i.e. ""cognitive services"") than on actual dispensing of drugs. Consultant pharmacists most typically work in nursing homes, but are increasingly branching into other institutions and non-institutional settings. Traditionally consultant pharmacists were usually independent business owners, though in the United States many now work for several large pharmacy management companies (primarily Omnicare, Kindred Healthcare and PharMerica). This trend may be gradually reversing as consultant pharmacists begin to work directly with patients, primarily because many elderly people are now taking numerous medications but continue to live outside of institutional settings. Some community pharmacies employ consultant pharmacists and/or provide consulting services.",many elderly people are now taking numerous medications,many elderly people are now taking numerous medications,many elderly people are now taking numerous medications
1,How many square kilometres of the Amazon forest was lost by 1991?,"Between 1991 and 2000, the total area of forest lost in the Amazon rose from 415,000 to 587,000 square kilometres (160,000 to 227,000 sq mi), with most of the lost forest becoming pasture for cattle. Seventy percent of formerly forested land in the Amazon, and 91% of land deforested since 1970, is used for livestock pasture. Currently, Brazil is the second-largest global producer of soybeans after the United States. New research however, conducted by Leydimere Oliveira et al., has shown that the more rainforest is logged in the Amazon, the less precipitation reaches the area and so the lower the yield per hectare becomes. So despite the popular perception, there has been no economical advantage for Brazil from logging rainforest zones and converting these to pastoral fields.",415000,"415,000 to 587,000",415000
2,What type of compounds does oxygen most commonly form?,"Oxygen is a chemical element with symbol O and atomic number 8. It is a member of the chalcogen group on the periodic table and is a highly reactive nonmetal and oxidizing agent that readily forms compounds (notably oxides) with most elements. By mass, oxygen is the third-most abundant element in the universe, after hydrogen and helium. At standard temperature and pressure, two atoms of the element bind to form dioxygen, a colorless and odorless diatomic gas with the formula O\n2. Diatomic oxygen gas constitutes 20.8% of the Earth's atmosphere. However, monitoring of atmospheric oxygen levels show a global downward trend, because of fossil-fuel burning. Oxygen is the most abundant element by mass in the Earth's crust as part of oxide compounds such as silicon dioxide, making up almost half of the crust's mass.",oxides,oxides,oxides
3,When was Doctor Who created?,"Since the creation of the Doctor Who character by BBC Television in the early 1960s, a myriad of stories have been published about Doctor Who, in different media: apart from the actual television episodes that continue to be produced by the BBC, there have also been novels, comics, short stories, audio books, radio plays, interactive video games, game books, webcasts, DVD extras, and even stage performances. In this respect it is noteworthy that the BBC takes no position on the canonicity of any of such stories, and producers of the show have expressed distaste for the idea.",early 1960s,early 1960s,early 1960s
4,"Other than T cells, what other immune cells express CYP27B1?","When a T-cell encounters a foreign pathogen, it extends a vitamin D receptor. This is essentially a signaling device that allows the T-cell to bind to the active form of vitamin D, the steroid hormone calcitriol. T-cells have a symbiotic relationship with vitamin D. Not only does the T-cell extend a vitamin D receptor, in essence asking to bind to the steroid hormone version of vitamin D, calcitriol, but the T-cell expresses the gene CYP27B1, which is the gene responsible for converting the pre-hormone version of vitamin D, calcidiol into the steroid hormone version, calcitriol. Only after binding to calcitriol can T-cells perform their intended function. Other immune system cells that are known to express CYP27B1 and thus activate vitamin D calcidiol, are dendritic cells, keratinocytes and macrophages.","dendritic cells, keratinocytes and macrophages","dendritic cells, keratinocytes and macrophages","dendritic cells, keratinocytes and macrophages"
5,How many tons of Saharan dust falls on the Amazon Basin each year?,"NASA's CALIPSO satellite has measured the amount of dust transported by wind from the Sahara to the Amazon: an average 182 million tons of dust are windblown out of the Sahara each year, at 15 degrees west longitude, across 1,600 miles (2,600 km) over the Atlantic Ocean (some dust falls into the Atlantic), then at 35 degrees West longitude at the eastern coast of South America, 27.7 million tons (15%) of dust fall over the Amazon basin, 132 million tons of dust remain in the air, 43 million tons of dust are windblown and falls on the Caribbean Sea, past 75 degrees west longitude.",27.7 million tons (15%),27.7 million,27.7 million tons (15%)
6,At what temperature will oxygen condense?,"Oxygen condenses at 90.20 K (−182.95 °C, −297.31 °F), and freezes at 54.36 K (−218.79 °C, −361.82 °F). Both liquid and solid O\n2 are clear substances with a light sky-blue color caused by absorption in the red (in contrast with the blue color of the sky, which is due to Rayleigh scattering of blue light). High-purity liquid O\n2 is usually obtained by the fractional distillation of liquefied air. Liquid oxygen may also be produced by condensation out of air, using liquid nitrogen as a coolant. It is a highly reactive substance and must be segregated from combustible materials.",90.20 K,90.20 K,90.20 K
7,What political party is strongest in Melbourne's working class suburbs?,"The centre-left Australian Labor Party (ALP), the centre-right Liberal Party of Australia, the rural-based National Party of Australia, and the environmentalist Australian Greens are Victoria's main political parties. Traditionally, Labor is strongest in Melbourne's working class western and northern suburbs, and the regional cities of Ballarat, Bendigo and Geelong. The Liberals' main support lies in Melbourne's more affluent eastern and outer suburbs, and some rural and regional centres. The Nationals are strongest in Victoria's North Western and Eastern rural regional areas. The Greens, who won their first lower house seats in 2014, are strongest in inner Melbourne.",Labor,Labor,Labor
8,What type of combustion does the slow reaction of triplet oxygen prevent?,"This combination of cancellations and σ and π overlaps results in dioxygen's double bond character and reactivity, and a triplet electronic ground state. An electron configuration with two unpaired electrons as found in dioxygen (see the filled π* orbitals in the diagram), orbitals that are of equal energy—i.e., degenerate—is a configuration termed a spin triplet state. Hence, the ground state of the O\n2 molecule is referred to as triplet oxygen.[b] The highest energy, partially filled orbitals are antibonding, and so their filling weakens the bond order from three to two. Because of its unpaired electrons, triplet oxygen reacts only slowly with most organic molecules, which have paired electron spins; this prevents spontaneous combustion.",spontaneous combustion,spontaneous combustion,spontaneous combustion
9,What happened to the crew onboard during the plugs-out test?,"The plugs-out test began on the morning of January 27, 1967, and immediately was plagued with problems. First the crew noticed a strange odor in their spacesuits, which delayed the sealing of the hatch. Then, communications problems frustrated the astronauts and forced a hold in the simulated countdown. During this hold, an electrical fire began in the cabin, and spread quickly in the high pressure, 100% oxygen atmosphere. Pressure rose high enough from the fire that the cabin burst and the fire erupted onto the pad area, frustrating attempts to rescue the crew. The astronauts were asphyxiated before the hatch could be opened.",asphyxiated,asphyxiated,asphyxiated


In [15]:
df_metrics

Unnamed: 0,Model,EM,F1,ROUGE,BLEU
0,Base,0.5,0.91,"{'rouge1': 0.8495555555555555, 'rouge2': 0.5132505175983437, 'rougeL': 0.8557777777777777, 'rougeLsum': 0.8518888888888889}","{'bleu': 0.5965050590819698, 'precisions': [0.8387096774193549, 0.7619047619047619, 0.7333333333333333, 0.6666666666666666], 'brevity_penalty': 0.7978725136602237, 'length_ratio': 0.8157894736842105, 'translation_length': 31, 'reference_length': 38}"
1,Finetuned,0.4,0.88,"{'rouge1': 0.8082857142857144, 'rouge2': 0.44130434782608696, 'rougeL': 0.8057619047619047, 'rougeLsum': 0.8018095238095239}","{'bleu': 0.6345222264220886, 'precisions': [0.8928571428571429, 0.8333333333333334, 0.9090909090909091, 1.0], 'brevity_penalty': 0.6996725373751302, 'length_ratio': 0.7368421052631579, 'translation_length': 28, 'reference_length': 38}"
2,Adapter,0.5,0.91,"{'rouge1': 0.8495555555555555, 'rouge2': 0.5132505175983437, 'rougeL': 0.8557777777777777, 'rougeLsum': 0.8518888888888889}","{'bleu': 0.5965050590819698, 'precisions': [0.8387096774193549, 0.7619047619047619, 0.7333333333333333, 0.6666666666666666], 'brevity_penalty': 0.7978725136602237, 'length_ratio': 0.8157894736842105, 'translation_length': 31, 'reference_length': 38}"


In [16]:
df_outputs.to_excel("Comparison_of_outputs.xlsx")

In [17]:
df_metrics.to_excel("EvaluationMetrics.xlsx")