In [1]:
!pip install transformers datasets scikit-learn

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.8 MB/s[0m eta [36m0:00

In [2]:
from datasets import load_dataset
squad_dataset = load_dataset("squad", split="train[:10%]")  # Load 10% of the dataset (about 10,000 examples)
contexts = [item['context'] for item in squad_dataset]
questions = [item['question'] for item in squad_dataset]
answers = [item['answers']['text'][0] for item in squad_dataset]  # First answer for evaluation


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [3]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the T5 model and tokenizer
t5_model = T5ForConditionalGeneration.from_pretrained("t5-small")
t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Fit a TF-IDF vectorizer on the contexts
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(contexts)

def tfidf_retriever(query, contexts, top_k=3):
    """
    Retrieve the top-k most similar contexts based on TF-IDF cosine similarity.
    """
    query_vector = vectorizer.transform([query])
    cosine_similarities = np.dot(query_vector, tfidf_matrix.T).toarray()[0]
    top_indices = np.argsort(cosine_similarities)[-top_k:][::-1]  # Get indices of top-k contexts
    return [contexts[i] for i in top_indices]


In [5]:
def generate_answer(retrieved_contexts, query):
    input_text = "question: " + query + " context: " + " ".join(retrieved_contexts)
    inputs = t5_tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
    outputs = t5_model.generate(inputs, max_length=150, num_beams=5, early_stopping=True)
    answer = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

In [6]:
def rag_pipeline(query, contexts, top_k=3):
    # Step 1: Retrieve top-k relevant contexts using TF-IDF
    top_contexts = tfidf_retriever(query, contexts, top_k=top_k)

    # Step 2: Generate the answer based on the retrieved contexts
    answer = generate_answer(top_contexts, query)

    return {
        "query": query,
        "retrieved_documents": top_contexts,
        "generated_answer": answer
    }


In [12]:
import re
from sklearn.metrics import f1_score

def normalize_answer(s):
    """Normalize the answer by lowering the case and removing punctuation."""
    return re.sub(r'[^a-z0-9]', '', s.lower())

def evaluate_model(query, actual_answer):
    result = rag_pipeline(query, contexts)
    generated_answer = result["generated_answer"]

    # Normalize answers for better comparison
    normalized_actual_answer = normalize_answer(actual_answer)
    normalized_generated_answer = normalize_answer(generated_answer)

    # Calculate F1 score
    f1 = f1_score([normalized_actual_answer], [normalized_generated_answer], average='macro')

    return {
        "query": query,
        "retrieved_documents": result["retrieved_documents"],
        "generated_answer": generated_answer,
        "f1_score": f1
    }

# Step 9: Test the pipeline with the same example query
query = "Who is the president of the United States?"
actual_answer = answers[0]  # Use the first answer from the dataset for evaluation
result = evaluate_model(query, actual_answer)

# Display the results
print("Query:", result["query"])
print("Retrieved Documents:", result["retrieved_documents"])
print("Generated Answer:", result["generated_answer"])
print("F1 Score:", result["f1_score"])


Query: Who is the president of the United States?
Retrieved Documents: ['In 1785, the assembly of the Congress of the Confederation made New York the national capital shortly after the war. New York was the last capital of the U.S. under the Articles of Confederation and the first capital under the Constitution of the United States. In 1789, the first President of the United States, George Washington, was inaugurated; the first United States Congress and the Supreme Court of the United States each assembled for the first time, and the United States Bill of Rights was drafted, all at Federal Hall on Wall Street. By 1790, New York had surpassed Philadelphia as the largest city in the United States.', 'In 1785, the assembly of the Congress of the Confederation made New York the national capital shortly after the war. New York was the last capital of the U.S. under the Articles of Confederation and the first capital under the Constitution of the United States. In 1789, the first President 