<a href="https://colab.research.google.com/github/bothananth-vb/LIME-ML/blob/main/RAG(simplified).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
!pip install transformers datasets scikit-learn



In [18]:
from datasets import load_dataset
squad_dataset = load_dataset("squad", split="train[:10%]")  # Load 10% of the dataset (about 10,000 examples)



In [19]:
contexts = [item['context'] for item in squad_dataset]
questions = [item['question'] for item in squad_dataset]

from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the T5 model and tokenizer
t5_model = T5ForConditionalGeneration.from_pretrained("t5-small")
t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")



In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Fit a TF-IDF vectorizer on the contexts
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(contexts)

def tfidf_retriever(query, contexts, top_k=3):
    """
    Retrieve the top-k most similar contexts based on TF-IDF cosine similarity.
    """
    query_vector = vectorizer.transform([query])
    cosine_similarities = np.dot(query_vector, tfidf_matrix.T).toarray()[0]
    top_indices = np.argsort(cosine_similarities)[-top_k:][::-1]  # Get indices of top-k contexts
    return [contexts[i] for i in top_indices]

In [21]:
def generate_answer(retrieved_contexts, query):
    input_text = "question: " + query + " context: " + " ".join(retrieved_contexts)
    inputs = t5_tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
    outputs = t5_model.generate(inputs, max_length=150, early_stopping=True)
    answer = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

In [22]:
def tfidf_rag_pipeline(query, contexts, top_k=3):
    # Step 1: Retrieve top-k relevant contexts using TF-IDF
    top_contexts = tfidf_retriever(query, contexts, top_k=top_k)

    # Step 2: Generate the answer based on the retrieved contexts
    answer = generate_answer(top_contexts, query)

    return {
        "query": query,
        "retrieved_documents": top_contexts,
        "generated_answer": answer
    }

In [23]:
query = "Who is the president of the United States?"
result = tfidf_rag_pipeline(query, contexts)

# Display the results
print("Query:", result["query"])
print("Retrieved Documents:", result["retrieved_documents"])
print("Generated Answer:", result["generated_answer"])

Query: Who is the president of the United States?
Retrieved Documents: ['In 1785, the assembly of the Congress of the Confederation made New York the national capital shortly after the war. New York was the last capital of the U.S. under the Articles of Confederation and the first capital under the Constitution of the United States. In 1789, the first President of the United States, George Washington, was inaugurated; the first United States Congress and the Supreme Court of the United States each assembled for the first time, and the United States Bill of Rights was drafted, all at Federal Hall on Wall Street. By 1790, New York had surpassed Philadelphia as the largest city in the United States.', 'In 1785, the assembly of the Congress of the Confederation made New York the national capital shortly after the war. New York was the last capital of the U.S. under the Articles of Confederation and the first capital under the Constitution of the United States. In 1789, the first President 