<a href="https://colab.research.google.com/github/bhargavi1973/RAG_Pipeline_for_LLMs/blob/main/RAGPipelineLLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install wikipedia
!pip install transformers
!pip install sentence_transformers




In [7]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [4]:
import wikipedia
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

In [52]:
# -----------------------
# 1. Disambiguation function
def preprocess_query(query: str) -> str:
    """Expand short ambiguous tokens in ``query`` using ``DISAMBIGUATION_MAP``.

    The query is split on whitespace. A token of at most three characters
    whose upper-cased form is a key of ``DISAMBIGUATION_MAP`` is replaced by
    the mapped value. Punctuation attached to a token (e.g. ``"EV?"`` or
    ``"(AI)"``) is ignored for the lookup and re-attached around the
    replacement, so abbreviations at the end of a question are expanded too.

    Args:
        query: Free-text topic or question.

    Returns:
        The query with matching tokens expanded, re-joined with single spaces
        (runs of whitespace are therefore collapsed).
    """
    # NOTE(review): DISAMBIGUATION_MAP is not defined in any cell visible in
    # this notebook; it must exist (with upper-case keys) before this function
    # is called, otherwise the lookup below raises NameError on a fresh kernel.
    edge_punctuation = ".,;:!?\"'()[]{}"
    expanded_words = []
    for word in query.split():
        core = word.strip(edge_punctuation)
        if word.upper() in DISAMBIGUATION_MAP and len(word) <= 3:
            # Exact token match: same behaviour as before.
            expanded_words.append(DISAMBIGUATION_MAP[word.upper()])
        elif core and len(core) <= 3 and core.upper() in DISAMBIGUATION_MAP:
            # Match after removing surrounding punctuation; keep that
            # punctuation in place around the expansion.
            prefix_len = len(word) - len(word.lstrip(edge_punctuation))
            prefix = word[:prefix_len]
            suffix = word[prefix_len + len(core):]
            expanded_words.append(prefix + DISAMBIGUATION_MAP[core.upper()] + suffix)
        else:
            expanded_words.append(word)
    return " ".join(expanded_words)

In [53]:
# -----------------------
# 2. Fetch Wikipedia content
def get_wikipedia_content(topic):
    """Fetch the text content of the Wikipedia page for ``topic``.

    The topic is first passed through ``preprocess_query``. If the lookup is
    ambiguous, the first option offered by the disambiguation error is tried
    once as a fallback.

    Args:
        topic: Topic name entered by the user.

    Returns:
        A ``(content, name)`` tuple. ``content`` is the page text, or ``None``
        when no page could be loaded. ``name`` is the preprocessed topic,
        except on the disambiguation fallback path where it is the title of
        the page that was actually loaded.
    """
    topic = preprocess_query(topic)  # disambiguate topic
    try:
        page = wikipedia.page(topic)
        return page.content, topic
    except wikipedia.exceptions.PageError:
        print("Page not found.")
        return None, topic
    except wikipedia.exceptions.DisambiguationError as e:
        # Guard against an empty option list so the fallback cannot raise
        # IndexError from inside this handler.
        if not e.options:
            print("Ambiguous topic with no options to fall back on.")
            return None, topic
        print(f"Ambiguous topic. Options: {e.options[:5]} ... Using first option: {e.options[0]}")
        try:
            page = wikipedia.page(e.options[0])
        except (wikipedia.exceptions.PageError, wikipedia.exceptions.DisambiguationError):
            # The fallback title can itself be missing or ambiguous; report it
            # the same way as a missing page instead of crashing the cell.
            print("Could not load the first option.")
            return None, topic
        return page.content, page.title


In [54]:
# -----------------------
# 3. Token-based chunking with overlap
# Tokenizer loaded from the same checkpoint name that the embedding model
# below is created from, so chunk sizes are counted in that model's tokens.
# split_text() uses it only to tokenize text and to turn token slices back
# into strings.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-mpnet-base-v2")

def split_text(text, chunk_size=256, chunk_overlap=20):
    """Split ``text`` into overlapping chunks measured in tokenizer tokens.

    The text is tokenized with the module-level ``tokenizer``; consecutive
    windows of ``chunk_size`` tokens are taken, each starting
    ``chunk_size - chunk_overlap`` tokens after the previous one, and every
    window is converted back to a string. Because chunks are rebuilt from
    tokens, their text reflects the tokenizer's normalization rather than the
    exact original characters.

    Args:
        text: Document text to split.
        chunk_size: Maximum number of tokens per chunk; must be positive.
        chunk_overlap: Number of tokens shared by neighbouring chunks; must
            satisfy ``0 <= chunk_overlap < chunk_size``.

    Returns:
        List of chunk strings in document order; empty when ``text`` yields
        no tokens.

    Raises:
        ValueError: If ``chunk_size`` or ``chunk_overlap`` is out of range.
            Without this check an overlap >= chunk_size would never advance
            the window and the loop would not terminate.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError("chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size")

    tokens = tokenizer.tokenize(text)
    total = len(tokens)
    stride = chunk_size - chunk_overlap  # always >= 1 after validation
    chunks = []
    for start in range(0, total, stride):
        end = min(start + chunk_size, total)
        chunks.append(tokenizer.convert_tokens_to_string(tokens[start:end]))
        if end == total:
            # The last window reached the end of the document; a further
            # window would only repeat the overlapping tail.
            break
    return chunks


In [55]:
# -----------------------
# 4. Initialize embedding model
# Same checkpoint name as the tokenizer used for chunking.
embedding_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# -----------------------
# 5. Initialize QA model
# Question-answering pipeline; called later with a question and the
# retrieved context.
qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")

# -----------------------
# 6. Build knowledge base from topic
# NOTE(review): the topic comes from interactive input, so a "Run All"
# reproduces the recorded outputs only if the same topic is typed again.
topic_input = input("Enter a topic to learn about: ")
document, final_topic = get_wikipedia_content(topic_input)

if not document:
    print("Could not retrieve information.")
    # exit() inside a notebook asks the kernel itself to shut down; raising
    # SystemExit stops this cell (and a "Run All") while leaving the kernel
    # alive for another attempt.
    raise SystemExit(1)

chunks = split_text(document)
print(f"Number of chunks for '{final_topic}': {len(chunks)}")


Device set to use cpu


Enter a topic to learn about: Electric Vehicle


Token indices sequence length is longer than the specified maximum sequence length for this model (10773 > 512). Running this sequence through the model will result in indexing errors


Number of chunks for 'Electric Vehicle': 46


In [56]:
# Create embeddings and FAISS index
# Embeddings are L2-normalized, so the inner product computed by IndexFlatIP
# equals cosine similarity.
embeddings = embedding_model.encode(chunks, normalize_embeddings=True)
dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)  # use cosine similarity with normalized embeddings
# FAISS expects a C-contiguous float32 matrix; enforce that explicitly instead
# of relying on np.array(), which only copies and keeps whatever dtype/layout
# the encoder returned.
index.add(np.ascontiguousarray(embeddings, dtype=np.float32))

# Store chunk metadata
# The position of each entry equals the row of its vector in the index, which
# is how search results are mapped back to text.
chunk_data = [{"text": chunk, "id": i, "source": final_topic} for i, chunk in enumerate(chunks)]


In [57]:
# -----------------------
# 7. Ask a user question
# Read the question interactively from the user.
query = input("\nAsk a question about the topic: ")
# Apply the same abbreviation expansion that is applied to the topic in
# get_wikipedia_content().
processed_query = preprocess_query(query)
# Encode as a one-element batch with normalize_embeddings=True, matching how
# the chunk embeddings were produced, so the inner-product search compares
# like with like.
query_embedding = embedding_model.encode([processed_query], normalize_embeddings=True)



Ask a question about the topic: who invented first electric vehicle?


In [58]:
# -----------------------
# 8. Retrieve top-k chunks
k = 3
# With IndexFlatIP the first return value holds inner-product scores (higher
# means more similar); the second holds row numbers into the index.
distances, indices = index.search(np.array(query_embedding), k)
# When the index holds fewer than k vectors FAISS pads the result with -1.
# Those must be skipped: chunk_data[-1] would otherwise silently return the
# last chunk as if it were a real hit.
retrieved_chunks = [chunk_data[i]["text"] for i in indices[0] if i >= 0]

print("\nRetrieved chunks (preview):")
for chunk in retrieved_chunks:
    print("- " + chunk[:200] + "...")



Retrieved chunks (preview):
- hybrid electric vehicles use electric motors as the primary propulsion method, rather than as a supplement, and did not see any mass production until the late 2000s, and battery electric cars did not ...
- robert davidson built an electric locomotive that attained a speed of four miles per hour ( 6 km / h ). in england, a patent was granted in 1840 for the use of rails as conductors of electric current,...
- an electric vehicle ( ev ) is a motor vehicle whose propulsion is powered fully or mostly by electricity. evs encompass a wide range of transportation modes, including road and rail vehicles, electric...


In [59]:
# -----------------------
# 9. Generate answer using QA model
context = " ".join(retrieved_chunks)
qa_input = {"question": processed_query, "context": context}
answer = qa_model(qa_input)

In [60]:

# Show the answer text under an "Answer:" heading (heading and answer on
# separate lines, preceded by a blank line).
print("\nAnswer:", answer['answer'], sep="\n")


Answer:
robert davidson
