In [1]:
!pip install transformers datasets farm-haystack

from datasets import load_dataset
from transformers import AutoTokenizer, GPT2LMHeadModel, TrainingArguments, Trainer, AutoModelForSeq2SeqLM
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import BM25Retriever

# Load dataset and model
ds = load_dataset("rajpurkar/squad")
model_name = "gpt2-medium"
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side='left')
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(model_name)


Collecting farm-haystack
  Downloading farm_haystack-1.26.2-py3-none-any.whl.metadata (31 kB)
Collecting boilerpy3 (from farm-haystack)
  Downloading boilerpy3-1.0.7-py3-none-any.whl.metadata (5.8 kB)
Collecting events (from farm-haystack)
  Downloading Events-0.5-py3-none-any.whl.metadata (3.9 kB)
Collecting lazy-imports==0.3.1 (from farm-haystack)
  Downloading lazy_imports-0.3.1-py3-none-any.whl.metadata (10 kB)
Collecting posthog (from farm-haystack)
  Downloading posthog-3.5.0-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting prompthub-py==4.0.0 (from farm-haystack)
  Downloading prompthub_py-4.0.0-py3-none-any.whl.metadata (2.2 kB)
Collecting pydantic<2 (from farm-haystack)
  Downloading pydantic-1.10.17-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (151 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m151.6/151.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting quantulum3 (from farm-haystack)
  Downloading q

2024-08-17 19:51:40.711843: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-17 19:51:40.711965: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-17 19:51:40.821773: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-17 19:51:52,765	INFO util.py:124 -- Outdated packages:
  ipywidgets==7.7.1 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


Downloading readme:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]



tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [2]:
# Preprocess dataset
def preprocess_function(examples):
    inputs = [q + " " + c for q, c in zip(examples['question'], examples['context'])]
    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding='max_length')
    model_inputs['labels'] = model_inputs['input_ids'].copy()
    return model_inputs

tokenized_dataset = ds.map(preprocess_function, batched=True)
train_dataset = tokenized_dataset['train'].select(range(5000))
eval_dataset = tokenized_dataset['validation'].select(range(500))

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [3]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-medium-squad",
    per_device_train_batch_size=4,  # Increased batch size
    per_device_eval_batch_size=4,   # Match evaluation batch size
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=20,
    evaluation_strategy="steps",
    save_steps=100,
    eval_steps=100,
    save_total_limit=1,
    gradient_accumulation_steps=1,  # Adjusted for larger batch size
    fp16=True,
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    warmup_steps=50  # Optimized warmup steps
)


# Train and evaluate model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
trainer.train()
results = trainer.evaluate()
print("Evaluation results:", results)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Step,Training Loss,Validation Loss
100,3.0011,1.877101
200,2.1363,1.522436
300,1.8795,1.442597
400,1.9424,1.43558
500,1.7947,1.436409
600,1.8395,1.432944




Evaluation results: {'eval_loss': 1.4333617687225342, 'eval_runtime': 28.6565, 'eval_samples_per_second': 17.448, 'eval_steps_per_second': 2.198, 'epoch': 1.0}


In [7]:

# RAG Setup
generator_tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
generator_model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")
document_store = InMemoryDocumentStore(use_bm25=True)
retriever = BM25Retriever(document_store=document_store)

# Add documents to document store
documents = [{"content": ctx} for ctx in ds['train']['context'][:1000]]
document_store.write_documents(documents)

Updating BM25 representation...: 100%|██████████| 119/119 [00:00<00:00, 11617.76 docs/s]


In [8]:
# Test the RAG pipeline with sample data
test_data = [
    {
        "context": "The mitochondrion is known as the powerhouse of the cell. It is an organelle found in the cytoplasm of eukaryotic cells. The mitochondrion is responsible for producing the energy currency of the cell, ATP, through a process known as oxidative phosphorylation.",
        "question": "What is the primary function of the mitochondrion?"
    },
    {
        "context": "The Great Wall of China is a series of fortifications made of various materials including brick, tamped earth, wood, and stone. It was built to protect the northern borders of the Chinese Empire from invading Mongol tribes. Construction of the wall began in the 7th century BC and continued until the 16th century.",
        "question": "What was the primary purpose of the Great Wall of China?"
    },
    {
        "context": "Albert Einstein was a theoretical physicist who developed the theory of relativity. Born in Ulm, in the Kingdom of Württemberg in the German Empire, in 1879, Einstein was awarded the Nobel Prize in Physics in 1921 for his discovery of the photoelectric effect.",
        "question": "What is Albert Einstein best known for?"
    },
    {
        "context": "The novel 'To Kill a Mockingbird' was published in 1960 and written by Harper Lee. It deals with serious issues like racial inequality and moral growth. The story is set in the American South during the 1930s and is narrated by a young girl named Scout Finch.",
        "question": "Who is the author of 'To Kill a Mockingbird'?"
    },
    ]


In [9]:
# Test RAG and GPT-2 models
for item in test_data:
    context = item['context']
    question = item['question']

    # Tokenize and generate for RAG
    rag_inputs = generator_tokenizer(question, context, return_tensors='pt', truncation=True, padding='max_length', max_length=256)
    # Move the model to GPU
    generator_model = generator_model.to('cuda')
    rag_outputs = generator_model.generate(input_ids=rag_inputs['input_ids'].to('cuda'), attention_mask=rag_inputs['attention_mask'].to('cuda'), max_length=50)
    rag_answer = generator_tokenizer.decode(rag_outputs[0], skip_special_tokens=True)

    # Tokenize and generate for GPT-2
    gpt2_inputs = tokenizer(question + " " + context, return_tensors='pt', truncation=True, padding='max_length', max_length=256)
    # Move the model to GPU if it's not already there
    model = model.to('cuda')
    gpt2_outputs = model.generate(input_ids=gpt2_inputs['input_ids'].to('cuda'), attention_mask=gpt2_inputs['attention_mask'].to('cuda'), max_new_tokens=50, pad_token_id=tokenizer.pad_token_id)
    gpt2_answer = tokenizer.decode(gpt2_outputs[0], skip_special_tokens=True)

    # Print comparison
    print(f"Question: {question}")
    print(f"Context: {context}")
    print(f"RAG Answer: {rag_answer}")
    print(f"GPT-2 Answer: {gpt2_answer}")
    print("-" * 80)

Question: What is the primary function of the mitochondrion?
Context: The mitochondrion is known as the powerhouse of the cell. It is an organelle found in the cytoplasm of eukaryotic cells. The mitochondrion is responsible for producing the energy currency of the cell, ATP, through a process known as oxidative phosphorylation.
RAG Answer:  The mitochondrion is an organelle found in the cytoplasm of eukaryotic cells. It is responsible for producing the energy currency of the cell, ATP, through a process known as oxidative phosphorylation. The
GPT-2 Answer: What is the primary function of the mitochondrion? The mitochondrion is known as the powerhouse of the cell. It is an organelle found in the cytoplasm of eukaryotic cells. The mitochondrion is responsible for producing the energy currency of the cell, ATP, through a process known as oxidative phosphorylation. The mitochondrion is also responsible for the synthesis of the essential amino acids needed for the synthesis of proteins, lip

In [1]:
from sklearn.metrics import f1_score

def compute_exact_match(predictions, references):
    exact_matches = [pred.strip() == ref.strip() for pred, ref in zip(predictions, references)]
    return sum(exact_matches) / len(exact_matches) * 100

def compute_f1_score(predictions, references):
    # Tokenize the predictions and references for F1 score calculation
    def tokenize(text):
        return text.lower().split()

    def f1_score_for_instance(pred, ref):
        pred_tokens = tokenize(pred)
        ref_tokens = tokenize(ref)
        common = set(pred_tokens) & set(ref_tokens)
        if len(common) == 0:
            return 0.0
        precision = len(common) / len(pred_tokens)
        recall = len(common) / len(ref_tokens)
        return (2 * precision * recall) / (precision + recall)

    f1_scores = [f1_score_for_instance(pred, ref) for pred, ref in zip(predictions, references)]
    return sum(f1_scores) / len(f1_scores) * 100

# Collect predictions
rag_predictions = [
    "The mitochondrion is an organelle found in the cytoplasm of eukaryotic cells. It is responsible for producing the energy currency of the cell, ATP, through a process known as oxidative phosphorylation.",
    "The Great Wall of China was built to protect the northern borders of the Chinese Empire from invading Mongol tribes.",
    "Albert Einstein was born in Ulm, in the Kingdom of Württemberg, in 1879. He was awarded the Nobel Prize in Physics in 1921 for his discovery of the photoelectric effect.",
    "'To Kill a Mockingbird' was published in 1960 and written by Harper Lee. It deals with serious issues like racial inequality and moral growth."
]

gpt2_predictions = [
    "What is the primary function of the mitochondrion? The mitochondrion is known as the powerhouse of the cell. It is an organelle found in the cytoplasm of eukaryotic cells. The mitochondrion is responsible for producing the energy currency of the cell, ATP, through a process known as oxidative phosphorylation. The mitochondrion is also responsible for the synthesis of the essential amino acids needed for the synthesis of proteins, lipids, and DNA. The mitochondrion is also involved in the synthesis of the essential enzymes needed for the synthesis of many other enzymes",
    "What was the primary purpose of the Great Wall of China? The Great Wall of China is a series of fortifications made of various materials including brick, tamped earth, wood, and stone. It was built to protect the northern borders of the Chinese Empire from invading Mongol tribes. Construction of the wall began in the 7th century BC and continued until the 16th century. The Great Wall of China is the largest and most complex of its kind in the world. It is the largest continuous wall in the world, and the largest in the world by area. The Great Wall of China is the largest continuous wall in the world",
    "What is Albert Einstein best known for? Albert Einstein was a theoretical physicist who developed the theory of relativity. Born in Ulm, in the Kingdom of Württemberg in the German Empire, in 1879, Einstein was awarded the Nobel Prize in Physics in 1921 for his discovery of the photoelectric effect. He was also awarded the Nobel Prize in Physics in 1928 for his theory of general relativity. He was the first person to use the term 'general relativity' to describe the theory of gravity. He was also the first person to use the term 'general",
    "Who is the author of 'To Kill a Mockingbird'? The novel 'To Kill a Mockingbird' was published in 1960 and written by Harper Lee. It deals with serious issues like racial inequality and moral growth. The story is set in the American South during the 1930s and is narrated by a young girl named Scout Finch. The novel is considered by many to be the most influential book of the 20th century. It is considered by many to be the most influential book of the 20th century. It is considered by many to be the most influential book of the 20th"
]

# Ground truths
ground_truths = [
    "The mitochondrion is known as the powerhouse of the cell. It is an organelle found in the cytoplasm of eukaryotic cells. The mitochondrion is responsible for producing the energy currency of the cell, ATP, through a process known as oxidative phosphorylation.",
    "The Great Wall of China was built to protect the northern borders of the Chinese Empire from invading Mongol tribes. Construction of the wall began in the 7th century BC and continued until the 16th century.",
    "Albert Einstein was a theoretical physicist who developed the theory of relativity. Born in Ulm, in the Kingdom of Württemberg in the German Empire, in 1879, Einstein was awarded the Nobel Prize in Physics in 1921 for his discovery of the photoelectric effect.",
    "The novel 'To Kill a Mockingbird' was published in 1960 and written by Harper Lee. It deals with serious issues like racial inequality and moral growth. The story is set in the American South during the 1930s and is narrated by a young girl named Scout Finch."
]

# Calculate Exact Match and F1 Scores
rag_em = compute_exact_match(rag_predictions, ground_truths)
rag_f1 = compute_f1_score(rag_predictions, ground_truths)
gpt2_em = compute_exact_match(gpt2_predictions, ground_truths)
gpt2_f1 = compute_f1_score(gpt2_predictions, ground_truths)

print(f"RAG Exact Match (EM): {rag_em:.2f}%")
print(f"RAG F1 Score: {rag_f1:.2f}%")
print(f"GPT-2 Exact Match (EM): {gpt2_em:.2f}%")
print(f"GPT-2 F1 Score: {gpt2_f1:.2f}%")


RAG Exact Match (EM): 0.00%
RAG F1 Score: 62.45%
GPT-2 Exact Match (EM): 0.00%
GPT-2 F1 Score: 43.84%
