In [1]:
!pip install torch torchvision datasets evaluate transformers accelerate -U
!pip install pandas scikit-learn tqdm bert-score


Collecting datasets
  Downloading datasets-3.4.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate
  Downloading accelerate-1.5.2-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading

In [2]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import T5TokenizerFast, T5ForConditionalGeneration, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
import evaluate
import numpy as np
from tqdm import tqdm
from bert_score import score

In [3]:
train_df = pd.read_parquet('train.parquet')
test_df = pd.read_parquet('test_without_label.parquet')

In [4]:
train_df.head(5)

Unnamed: 0,context,question,answer
9983,The world's first institution of technology or...,What year was the Banská Akadémia founded?,1735
43267,The standard specifies how speed ratings shoul...,What is another speed that can also be reporte...,SOS-based speed
81021,The most impressive and famous of Sumerian bui...,Where were the use of advanced materials and t...,Sumerian temples and palaces
49374,Ann Arbor has a council-manager form of govern...,Who is elected every even numbered year?,mayor
53414,"Shortly before his death, when he was already ...",What was the purpose of top secret ICBM commit...,decide on the feasibility of building an ICBM ...


In [5]:
test_df.head()

Unnamed: 0,context,question,answer
63695,Perhaps the most famous raid by Oeselian pirat...,What important figure was killed in the raid?,?
80051,"Following a peak in growth in 1979, the Liberi...",In 2011 Liberia's economy was considered what?,?
32271,A plethora of anti-aircraft gun systems of sma...,The combat batteries of an Army AAA battalion ...,?
52439,Avicenna's legacy in classical psychology is p...,What subject is seen throughout Avicenna's Boo...,?
33889,"The desire to explore, record and systematize ...",In what year was Charles Burney's A General Hi...,?


In [6]:
train_df.shape, test_df.shape

((20000, 3), (10000, 3))

In [7]:
train_data = train_df
test_data = test_df

train_set, val_set = train_test_split(train_data, test_size=0.2, random_state=42)
train_set.reset_index(drop=True, inplace=True)
val_set.reset_index(drop=True, inplace=True)

train_dataset = Dataset.from_pandas(train_set)
val_dataset = Dataset.from_pandas(val_set)
test_dataset = Dataset.from_pandas(test_data)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = T5TokenizerFast.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base").to(device)

def preprocess_samples(samples):
    inputs = [f"question: {q.strip()} context: {c.strip()}" for q, c in zip(samples["question"], samples["context"])]
    targets = [a.strip() for a in samples.get("answer", [""] * len(inputs))]

    tokenized_inputs = tokenizer(inputs, truncation=True, padding="max_length", max_length=512)
    tokenized_targets = tokenizer(targets, truncation=True, padding="max_length", max_length=128)

    tokenized_inputs["labels"] = tokenized_targets["input_ids"]
    return tokenized_inputs

train_dataset = train_dataset.map(preprocess_samples, batched=True)
val_dataset = val_dataset.map(preprocess_samples, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

In [8]:
training_args = TrainingArguments(
    output_dir="./t5-base-qa-model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=6,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    fp16=True,
    gradient_accumulation_steps=2,
    warmup_ratio=0.1,
    lr_scheduler_type="linear",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
)

trainer.train()
trainer.save_model("t5_base_QA")

results = trainer.evaluate()
print("Evaluation Results:", results)

  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.0102,0.007419
2,0.0104,0.007205
3,0.0097,0.007797


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Evaluation Results: {'eval_loss': 0.007204995024949312, 'eval_runtime': 105.425, 'eval_samples_per_second': 37.942, 'eval_steps_per_second': 4.743, 'epoch': 3.0}


In [9]:
def generate_answer(question, context):
    input_text = f"question: {question} context: {context}"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512).to(device)

    with torch.no_grad():
        output = model.generate(**inputs, max_length=128, num_beams=3)

    return tokenizer.decode(output[0], skip_special_tokens=True)

val_results = trainer.evaluate()
print(f"Validation Loss: {val_results['eval_loss']:.4f}")

metric = evaluate.load("squad")
val_set["predicted_answer"] = val_set.apply(lambda row: generate_answer(row["question"], row["context"]), axis=1)

formatted_predictions = [
    {"id": str(i), "prediction_text": row["predicted_answer"].strip().lower()}
    for i, row in val_set.iterrows()
]
formatted_references = [
    {
        "id": str(i),
        "answers": {"text": [row["answer"].strip().lower()], "answer_start": [row["context"].find(row["answer"].strip())]},
    }
    for i, row in val_set.iterrows()
]

qa_results = metric.compute(predictions=formatted_predictions, references=formatted_references)
print(f"Exact Match (EM): {qa_results['exact_match']:.2f}%")
print(f"F1 Score: {qa_results['f1']:.2f}%")



Validation Loss: 0.0072


Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

Exact Match (EM): 78.85%
F1 Score: 89.71%


In [10]:
model.eval()
val_predictions = []
val_true_answers = []

for _, row in tqdm(val_set.iterrows(), total=len(val_set)):
    input_text1 = f"question: {row['question']} context: {row['context']}"
    encoding = tokenizer(input_text1, return_tensors='pt', truncation=True, padding=True, max_length=512).to(device)

    with torch.no_grad():
        output = model.generate(**encoding, max_length=128, num_beams=3)

    pred_answer = tokenizer.decode(output[0], skip_special_tokens=True)
    val_predictions.append(pred_answer)
    val_true_answers.append(row['answer'])

exact_matches = [1 if pred.strip().lower() == true.strip().lower() else 0 for pred, true in zip(val_predictions, val_true_answers)]
accuracy = np.mean(exact_matches)

def compute_f1(pred, true):
    pred_tokens = pred.lower().split()
    true_tokens = true.lower().split()
    if not true_tokens:
        return 1.0 if not pred_tokens else 0.0
    common_tokens = set(pred_tokens) & set(true_tokens)
    if not common_tokens:
        return 0.0
    precision = len(common_tokens) / len(pred_tokens)
    recall = len(common_tokens) / len(true_tokens)
    return 2 * (precision * recall) / (precision + recall)

f1_scores = [compute_f1(pred, true) for pred, true in zip(val_predictions, val_true_answers)]
f1 = np.mean(f1_scores)

P, R, F1 = score(val_predictions, val_true_answers, lang="en", model_type="microsoft/deberta-xlarge-mnli")
bert_f1 = F1.mean().item()

print(f"Validation Accuracy (EM): {accuracy * 100:.2f}%")
print(f"Validation F1 Score: {f1:.4f}")
print(f"Validation BERTScore F1: {bert_f1:.4f}")



100%|██████████| 4000/4000 [25:32<00:00,  2.61it/s]


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/792 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.04G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.04G [00:00<?, ?B/s]

Validation Accuracy (EM): 73.70%
Validation F1 Score: 0.8709
Validation BERTScore F1: 0.9285


In [11]:
test_predictions = []

for _, row in tqdm(test_data.iterrows(), total=len(test_data)):

    input_texts = f"question: {row['question']} context: {row['context']}"
    encoding = tokenizer(input_texts, return_tensors='pt', truncation=True, padding=True, max_length=512).to(device)
    output = model.generate(**encoding)
    answer = tokenizer.decode(output[0], skip_special_tokens=True)
    test_predictions.append(answer)

test_data['answer'] = test_predictions
test_data.to_parquet('test_with_answers.parquet', index=False)

print("Predictions saved to test_with_answers.parquet")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 42%|████▏     | 4208/10000 [11:31<17:04,  5.65it/s][A
 42%|████▏     | 4210/10000 [11:32<21:52,  4.41it/s][A
 42%|████▏     | 4211/10000 [11:32<19:48,  4.87it/s][A
 42%|████▏     | 4212/10000 [11:32<18:18,  5.27it/s][A
 42%|████▏     | 4214/10000 [11:32<13:30,  7.14it/s][A
 42%|████▏     | 4215/10000 [11:32<13:34,  7.10it/s][A
 42%|████▏     | 4217/10000 [11:33<12:24,  7.77it/s][A
 42%|████▏     | 4218/10000 [11:33<16:26,  5.86it/s][A
 42%|████▏     | 4219/10000 [11:33<23:19,  4.13it/s][A
 42%|████▏     | 4221/10000 [11:33<17:15,  5.58it/s][A
 42%|████▏     | 4222/10000 [11:34<24:29,  3.93it/s][A
 42%|████▏     | 4223/10000 [11:34<29:32,  3.26it/s][A
 42%|████▏     | 4224/10000 [11:35<25:56,  3.71it/s][A
 42%|████▏     | 4225/10000 [11:35<21:50,  4.41it/s][A
 42%|████▏     | 4227/10000 [11:35<16:06,  5.97it/s][A
 42%|████▏     | 4229/10000 [11:35<14:41,  6.55it/s][A
 42%|████▏     | 4230/10000 [11:35<16:3

Predictions saved to test_with_answers.parquet





In [12]:
# Save the trained model and tokenizer
model.save_pretrained("./t5_base_qa_model")
tokenizer.save_pretrained("./t5_base_qa_tokenizer")

print("Model and tokenizer saved for Streamlit!")


Model and tokenizer saved for Streamlit!


In [13]:
import shutil

# Create a zip file of the model and tokenizer directories
shutil.make_archive('t5_base_qa_model_and_tokenizer', 'zip', './t5_base_qa_model')
shutil.make_archive('t5_base_qa_model_and_tokenizer_tokenizer', 'zip', './t5_base_qa_tokenizer')


'/content/t5_base_qa_model_and_tokenizer_tokenizer.zip'