Preprocess


In [2]:
pip install datasets



GPT2


In [3]:
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt_tab')


def load_and_split_data():
    squad = load_dataset("squad")

    print("One data sample:")
    import json
    example = squad['train'][1000]
    print(json.dumps(example, indent=2, ensure_ascii=False))

    # Load train/val dataset
    # train_data = squad['train'].select(range(5000))
    # test_ds = squad['validation'].select(range(1000))  # test set

    train_data = squad['train']
    test_ds = squad['validation']   # test set

    # Split training data into train and val
    train_ds, val_ds = train_test_split(train_data.to_list(), test_size=0.1, random_state=42)


    # Check context length - now not truncate
    context_lengths = [len(word_tokenize(example["context"])) for example in squad['train']]
    context_lengths_series = pd.Series(context_lengths)
    print("\nStats for training data:")
    print(context_lengths_series.describe(percentiles=[.5, .9, .95, .99]))

    return train_ds, val_ds, test_ds



comparison_df = pd.DataFrame(columns=[
    'model_name',
    'best_hyperparameters',
    'val_loss',
    'training_time'
])

# test dataset: EM and F1-score


# Plots function
def summarize_diagnostics(history, model_name, folder_path="plots/"):
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

    fig, ax = plt.subplots(1, 2, figsize=(20, 10))
    fig.suptitle(f"Training Diagnostics for {model_name}", fontsize=24, fontweight='bold')
    epochs = list(range(len(history['train_loss'])))

    # Plot loss curves
    ax[0].set_title('Loss Curves', fontsize=20)
    ax[0].plot(history['train_loss'], label='train')
    ax[0].plot(history['val_loss'], label='validation')
    ax[0].set_xticks(epochs)
    ax[0].set_xlabel('Epochs', fontsize=15)
    ax[0].set_ylabel('Loss', fontsize=15)
    ax[0].legend(fontsize=15)
    # Plot accuracy curves
    ax[1].set_title('Accuracy Curves', fontsize=20)
    ax[1].plot(history['train_accuracy'], label='train')
    ax[1].plot(history['val_accuracy'], label='validation')
    ax[1].set_xticks(epochs)
    ax[1].set_xlabel('Epochs', fontsize=15)
    ax[1].set_ylabel('Accuracy', fontsize=15)
    ax[1].legend(fontsize=15)

    plot_filename = os.path.join(folder_path, f"{model_name}_loss_accuracy.png")
    plt.savefig(plot_filename)
    print(f"Plot saved to {plot_filename}")

    plt.show()
    plt.close(fig)



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [4]:
train,valid,test=load_and_split_data()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


One data sample:
{
  "id": "56bfe7eaa10cfb1400551389",
  "title": "Beyoncé",
  "context": "After Hurricane Katrina in 2005, Beyoncé and Rowland founded the Survivor Foundation to provide transitional housing for victims in the Houston area, to which Beyoncé contributed an initial $250,000. The foundation has since expanded to work with other charities in the city, and also provided relief following Hurricane Ike three years later.",
  "question": "How much did Beyonce initially contribute to the foundation?",
  "answers": {
    "text": [
      "$250,000"
    ],
    "answer_start": [
      190
    ]
  }
}

Stats for training data:
count    87599.000000
mean       137.876380
std         56.854154
min         22.000000
50%        127.000000
90%        211.000000
95%        245.000000
99%        325.000000
max        766.000000
dtype: float64


In [5]:
from datasets import load_dataset
from transformers import GPT2Tokenizer
from torch.utils.data import Dataset, DataLoader
import torch

class SQuADDataset(Dataset):
    def __init__(self, data, tokenizer, max_len=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        context = item["context"]
        question = item["question"]
        answer_text = item["answers"]["text"][0]

        # Concatenate question and context like GPT format
        input_text = f"Question: {question}\nContext: {context}\nAnswer:"
        tokenized = self.tokenizer(input_text , padding='max_length',  max_length=256,  return_tensors="pt", truncation=True)
        input_ids = tokenized["input_ids"]

        return input_ids,answer_text


In [6]:
from sentence_transformers import SentenceTransformer, util
sim_model = SentenceTransformer('all-MiniLM-L6-v2')

In [6]:
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch


#Load GPT-2 model and tokenizer
GPT2tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
GPT2tokenizer.pad_token = GPT2tokenizer.eos_token

GPT2model = GPT2LMHeadModel.from_pretrained("gpt2")
GPT2model.config.pad_token_id = GPT2tokenizer.pad_token_id
GPT2model.eval()

# GPT-2 doesn’t have a pad token by default
GPT2tokenizer.pad_token = GPT2tokenizer.eos_token
GPT2model.config.pad_token_id = GPT2tokenizer.eos_token_id

GPT2Test_data=SQuADDataset(test,GPT2tokenizer)
GPT2Test_data=DataLoader(GPT2Test_data,batch_size=64 , shuffle=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
GPT2model = GPT2model.to(device)



tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [14]:
import re
from tqdm import tqdm
F1score = 0
count = 0
decision_list=[]
for batch in tqdm(GPT2Test_data):
    input_ids, answers = batch  # Assuming your Dataset returns (input_ids, answer_texts)
    input_ids = input_ids.squeeze(0).squeeze(1).to(GPT2model.device)
    with torch.no_grad():
        output = GPT2model.generate(
            input_ids=input_ids,
            max_new_tokens=50,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=GPT2tokenizer.pad_token_id
        )

    generated_text = GPT2tokenizer.decode(output[0], skip_special_tokens=True)
    match = re.search(r'Answer:\s*(.*)', generated_text)
    if match:
            answer_only = match.group(1)
    else:
            answer_only = generated_text.strip()
    yesno_prompt = f"Q: Do the following two answers have the same meaning?\nA: {answer_only}\nB: {answers[0]}\nAnswer:"
    print(yesno_prompt)

  0%|          | 0/166 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  1%|          | 1/166 [00:00<02:44,  1.00it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Q: Do the following two answers have the same meaning?
A: It's not quite time to call it a day but, at least for me, it's not like we have to worry about that. I have been in the market for a new iPhone for a few months now and it has been a great ride
B: conscription
Answer:


  1%|          | 2/166 [00:01<02:40,  1.02it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Q: Do the following two answers have the same meaning?
A: The biggest question in the world of college athletics is whether to go to college. It's a question that has been asked countless times in college and college, but few people ask the question in their first year of college.
B: Santa Clara, California
Answer:


  2%|▏         | 3/166 [00:02<02:37,  1.03it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Q: Do the following two answers have the same meaning?
A: We're excited to announce that our new project is now live on Kickstarter.
B: Vision 2030
Answer:


  2%|▏         | 3/166 [00:03<03:02,  1.12s/it]


KeyboardInterrupt: 

In [11]:
answers, answer_only

(('tundra',
  'the most cost efficient bidder',
  '‘combs’ – groups of cilia',
  'Shirley and Johnson.',
  'Kuviasungnerk/Kangeiko',
  'Mission Impossible,',
  'enthusiastic',
  'westward',
  'road white jerseys',
  'Cam Newton',
  'By mass, oxygen is the third-most abundant element in the universe, after hydrogen and helium',
  "civil servants' salary index scale (Bundesbesoldungsordnung)",
  '2007',
  'planned to attack Fort Niagara',
  'No Child Left Behind',
  'a Committee of Independent Experts',
  'government-owned',
  '86',
  'two',
  'aerospace',
  'elected MSPs',
  '30',
  'to legalize importation of medications from Canada and other countries',
  'Abe Silverstein',
  'Warsaw',
  'Confucian propriety and ancestor veneration',
  'the Philippines',
  '244',
  'the web',
  'Complexity measures',
  'electrostatic force',
  'August 2010',
  "efforts to fortify Oswego were bogged down in logistical difficulties, exacerbated by Shirley's inexperience",
  "plans leaked to France well 

In [12]:
 answer_only

'Image copyright Reuters Image caption Thousands of people were detained in Greece in the last four years'

In [21]:

F1score = 0
count = 0
decision_list=[]
for batch in tqdm(GPT2Test_data):
    input_ids, answers = batch  # Assuming your Dataset returns (input_ids, answer_texts)
    input_ids = input_ids.squeeze(0).squeeze(1).to(GPT2model.device)
    with torch.no_grad():
        output = GPT2model.generate(
            input_ids=input_ids,
            max_new_tokens=50,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=GPT2tokenizer.pad_token_id
        )
    for i in range(len(output)):
      generated_text = GPT2tokenizer.decode(output[i], skip_special_tokens=True)
      #print(generated_text)
      match = re.search(r'Answer:\s*(.*)', generated_text)
      if match:
              answer_only = match.group(1)
      else:
              answer_only = generated_text.strip()
          #print("\nanswer:",answer_only)
      yesno_prompt = f"Q: Do the following two answers have the same meaning?\nA: {answer_only}\nB: {answers[i]}\nAnswer:"
      inputs = tokenizer(yesno_prompt, return_tensors="pt").to(model.device)
      #print(yesno_prompt)
      with torch.no_grad():
              eval_output = model.generate(**inputs, max_new_tokens=10)

      decision = tokenizer.decode(eval_output[0], skip_special_tokens=True).lower()
      #print("\ndecision:",decision)
      decision_list.append(decision)
      sim = util.cos_sim(sim_model.encode(answer_only), sim_model.encode(answers[i]))
        #  print("\ndecision:",decision)
      if 'yes' in decision or sim>0.6 :
            F1score += 1
      count += 1
    print(f'Accuracy-like score: {F1score / count:.4f}')
print(f'\n Accuracy-like score: {F1score / count:.4f}')


  0%|          | 0/166 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Question: How and when did the first variant of y. pestis enter Europe?
Context: The study also found that there were two previously unknown but related clades (genetic branches) of the Y. pestis genome associated with medieval mass graves. These clades (which are thought to be extinct) were found to be ancestral to modern isolates of the modern Y. pestis strains Y. p. orientalis and Y. p. medievalis, suggesting the plague may have entered Europe in two waves. Surveys of plague pit remains in France and England indicate the first variant entered Europe through the port of Marseille around November 1347 and spread through France over the next two years, eventually reaching England in the spring of 1349, where it spread through the country in three epidemics. Surveys of plague pit remains from the Dutch town of Bergen op Zoom showed the Y. pestis genotype responsible for the pandemic that spread through the Low Countries from 1350 differed from that found in Britain and France, implying 

  0%|          | 0/166 [00:04<?, ?it/s]

Question: What radio station did the Doctor Who theme reach the charts on in 2011?
Context: A new arrangement of the theme, once again by Gold, was introduced in the 2007 Christmas special episode, "Voyage of the Damned"; Gold returned as composer for the 2010 series. He was responsible for a new version of the theme which was reported to have had a hostile reception from some viewers. In 2011, the theme tune charted at number 228 of radio station Classic FM's Hall of Fame, a survey of classical music tastes. A revised version of Gold's 2010 arrangement had its debut over the opening titles of the 2012 Christmas special "The Snowmen", and a further revision of the arrangement was made for the 50th Anniversary special "The Day of the Doctor" in November 2013.[citation needed]
Answer:The Canadian Press

TORONTO — A new federal law that bans discrimination against gay people in Canada has sparked outrage from some Canadians who say the law is discriminatory, and in some cases, discriminat




KeyboardInterrupt: 

google/flan-t5-base

In [10]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

In [None]:
from torch.utils.data import DataLoader
import re
from tqdm import tqdm

Test_data = SQuADDataset(test, tokenizer)
Test_data = DataLoader(Test_data, batch_size=64, shuffle=True)

F1score = 0
count = 0
answer_list=[]
for batch in tqdm(Test_data):
    input_ids, answers = batch  # Assuming your Dataset returns (input_ids, answer_texts)
    input_ids = input_ids.squeeze(0).squeeze(1).to(model.device)
    attention_mask = (input_ids != tokenizer.pad_token_id).long()

    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=50,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.pad_token_id
        )

    for i in range(len(outputs)):
        generated_text = tokenizer.decode(outputs[i], skip_special_tokens=True)

        match = re.search(r'Answer:\s*(.*)', generated_text)
        if match:
            answer_only = match.group(1)
        else:
            answer_only = generated_text.strip()
        #print("\nanswer:",answer_only)

        yesno_prompt = f"""Do the following two answers have the same meaning?
Answer A: {answer_only}
Answer B: {answers[i]}
Respond with 'Yes' or 'No'."""
        inputs = tokenizer(yesno_prompt, return_tensors="pt").to(model.device)
        answer_list.append(yesno_prompt)
        with torch.no_grad():
            #eval_output = model.generate(**inputs, max_new_tokens=50)
            eval_output = model.generate(
              **inputs,
              max_new_tokens=10,
              temperature=0.0,
              do_sample=False,
              pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id,
              eos_token_id=tokenizer.eos_token_id,
          )
        decision = tokenizer.decode(eval_output[0], skip_special_tokens=True).lower()
        #print(decision)
        answer_list.append([yesno_prompt,decision] )
        sim = util.cos_sim(sim_model.encode(answer_only), sim_model.encode(answers[i]))
      #  print("\ndecision:",decision)
        if 'yes' in decision or sim>0.6 :
            F1score += 1
        count += 1
        #print(f'Accuracy-like score: {F1score / count:.4f}')
print(f'Accuracy-like score: {F1score / count:.4f}')

100%|██████████| 166/166 [15:11<00:00,  5.49s/it]

Accuracy-like score: 0.8535





In [None]:
answers

('underground leader Piłsudski',
 '8–4–4 system',
 'concern',
 'establish, equip, manage and maintain national and public libraries in the country',
 'particular skills',
 'number one',
 'taking on debt',
 'four-course rate average')

In [None]:
answer_list=[]
for i in range(len(outputs)):
        generated_text = tokenizer.decode(outputs[i], skip_special_tokens=True)

        match = re.search(r'Answer:\s*(.*)', generated_text)
        if match:
            answer_only = match.group(1)
        else:
            answer_only = generated_text.strip()

        answer_list.append(answer_only)

In [None]:
decision_list

['no', 'no', 'yes', 'no', 'no', 'no', 'no', 'no', 'no', 'yes', 'no', 'no']

In [None]:
generated_text

"Question: Colonialism often means a country doing what?\nContext: Imperialism and colonialism both dictate the political and economic advantage over a land and the indigenous populations they control, yet scholars sometimes find it difficult to illustrate the difference between the two. Although imperialism and colonialism focus on the suppression of an other, if colonialism refers to the process of a country taking physical control of another, imperialism refers to the political and monetary dominance, either formally or informally. Colonialism is seen to be the architect deciding how to start dominating areas and then imperialism can be seen as creating the idea behind conquest cooperating with colonialism. Colonialism is when the imperial nation begins a conquest over an area and then eventually is able to rule over the areas the previous nation had controlled. Colonialism's core meaning is the exploitation of the valuable assets and supplies of the nation that was conquered and th

DeepSeek


In [7]:
!pip install transformers accelerate bitsandbytes

from transformers import AutoTokenizer, AutoModelForCausalLM
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "deepseek-ai/deepseek-llm-7b-chat"

# Load tokenizer and model
DS_tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
DS_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",  # Uses GPU if available
    torch_dtype="auto",
    trust_remote_code=True
)

# Prompt
input_text = "Context: The Eiffel Tower is in Paris.\nQuestion: Where is the Eiffel Tower?\nAnswer:"
inputs = DS_tokenizer(input_text, return_tensors="pt").to(device)

# Generate
outputs = DS_model.generate(**inputs, max_new_tokens=50)

print(DS_tokenizer.decode(outputs[0], skip_special_tokens=True))



Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:  12%|#2        | 1.22G/9.97G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:  28%|##7       | 1.07G/3.85G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


Context: The Eiffel Tower is in Paris.
Question: Where is the Eiffel Tower?
Answer: The Eiffel Tower is in Paris.


In [12]:
from torch.utils.data import DataLoader
import re
from tqdm import tqdm

Test_data = SQuADDataset(test, DS_tokenizer)
Test_data = DataLoader(Test_data, batch_size=64, shuffle=True)

F1score = 0
count = 0

for batch in tqdm(Test_data):
    input_ids, answers = batch  # Assuming your Dataset returns (input_ids, answer_texts)
    input_ids = input_ids.squeeze(0).squeeze(1).to(model.device)
    attention_mask = (input_ids != DS_tokenizer.pad_token_id).long()

    with torch.no_grad():
        outputs = DS_model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=50,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=DS_tokenizer.pad_token_id
        )

    for i in range(len(outputs)):
        generated_text = DS_tokenizer.decode(outputs[i], skip_special_tokens=True)
        print(generated_text)
        match = re.search(r'Answer:\s*(.*)', generated_text)
        if match:
            answer_only = match.group(1)
        else:
            answer_only = generated_text.strip()
        #print("\nanswer:",answer_only)
        yesno_prompt = f"""Do the following two answers have the same meaning?
Answer A: {answer_only}
Answer B: {answers[i]}
Respond with 'Yes' or 'No'."""
        inputs = tokenizer(yesno_prompt, return_tensors="pt").to(model.device)
        print(yesno_prompt)
        with torch.no_grad():
            eval_output = model.generate(**inputs, max_new_tokens=10)

        decision = tokenizer.decode(eval_output[0], skip_special_tokens=True).lower()
        #print("\ndecision:",decision)
        sim = util.cos_sim(sim_model.encode(answer_only), sim_model.encode(answers[i]))
      #  print("\ndecision:",decision)
        if 'yes' in decision or sim>0.6 :
            F1score += 1
        count += 1

print(f'Accuracy-like score: {F1score / count:.4f}')

  0%|          | 0/166 [00:00<?, ?it/s]

Question: What is the NASUWT?
Context: Teachers in Wales can be registered members of trade unions such as ATL, NUT or NASUWT and reports in recent years suggest that the average age of teachers in Wales is falling with teachers being younger than in previous years. A growing cause of concern are that attacks on teachers in Welsh schools which reached an all-time high between 2005 and 2010.
Answer: The NASUWT is a trade union representing teachers and education support staff in the United Kingdom. It was established in 1876 and has over 45,000 members. The NASUWT's main objectives are to
Do the following two answers have the same meaning?
Answer A: The NASUWT is a trade union representing teachers and education support staff in the United Kingdom. It was established in 1876 and has over 45,000 members. The NASUWT's main objectives are to
Answer B: trade unions
Respond with 'Yes' or 'No'.
Question: What did Tem�jin promise his followers in exchange for their obedience?
Context: As an in

  0%|          | 0/166 [00:05<?, ?it/s]


KeyboardInterrupt: 