In [1]:
# !nvidia-smi

In [2]:
import warnings
# Installs a blanket "ignore" filter with no category/module restriction, so every
# warning raised from this point on is silenced. This keeps cell output tidy, but it
# also hides deprecation notices from the libraries used below.
warnings.filterwarnings("ignore")   # suppress all warnings for neatness

### Chunking

In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Read the source document. The encoding is pinned to UTF-8 so the decoded text
# (and therefore the chunk boundaries and chunk count) does not depend on the
# platform's locale default encoding.
with open("raw_data/demo.md", encoding="utf-8") as f:
    file_content = f.read()

# Small chunks with a little overlap so neighbouring chunks share some context.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=170, chunk_overlap=20  # overlap to maintain context between chunks
)

chunks = text_splitter.split_text(file_content)  # list of split chunks (plain strings)
print(f"Total chunks: {len(chunks)}")

Total chunks: 359


### Embedding

In [4]:
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model used both to build the index and to embed queries at search time.
hf_model = HuggingFaceEmbeddings(
    model_name="all-mpnet-base-v2",  # chosen by the author over all-MiniLM-L6-v2
    multi_process=False,  # run encode() on multiple GPUs if True
    model_kwargs=dict(device="cpu"),  # inference runs on the CPU
    encode_kwargs=dict(normalize_embeddings=False),  # vectors are stored un-normalized
)

### Vector storing

In [5]:
from langchain_community.vectorstores import FAISS
import os

# Folder that holds the persisted FAISS index.
db_path = "faiss_index"

# Build-once cache: embedding every chunk is the expensive step, so it is skipped
# whenever the folder is already present.
# NOTE: only the directory's existence is checked, so changes to the source document
# or chunking settings are not picked up until the folder is deleted.
if os.path.isdir(db_path):
    print("Database already created.")
else:
    # from_texts embeds the raw chunk strings with the given embedding model
    faiss = FAISS.from_texts(texts=chunks, embedding=hf_model)
    # persist the index so later runs can load it instead of re-embedding
    faiss.save_local(folder_path=db_path)

In [6]:
# Load vector store
# Reuse db_path (set in the index-building cell) so the load location cannot drift
# from the folder the index was saved to.
# SECURITY: allow_dangerous_deserialization=True permits unpickling the stored
# docstore, and pickle can execute arbitrary code. Only load index folders that were
# created locally or come from a trusted source.
db = FAISS.load_local(folder_path=db_path, embeddings=hf_model,
                      allow_dangerous_deserialization=True)

### Retrieval

In [7]:
# user_query = "What is the deadline for spring 2026 semester?"
## FAISS uses hf_model to embed the query, then searches that embedding against the vectors stored in db
## the 2nd arg (k) specifies how many nearest neighbors to retrieve
# res = db.similarity_search(query=user_query, k=2)
# [r.page_content for r in res]

#****************************************************************#
# Reference: https://shorturl.at/qnU8i (LangChain docs)

## Retrieve more documents with higher diversity
## Useful if your dataset has many similar documents
# retriever = db.as_retriever(
#     search_type="mmr",
#     search_kwargs={'k': 6, 'lambda_mult': 0.25}
# )

## Fetch more documents for the MMR algorithm to consider
## But only return the top 5
# retriever = db.as_retriever(
#     search_type="mmr",
#     search_kwargs={'k': 5, 'fetch_k': 50}
# )

## Only retrieve documents that have a relevance score
## Above a certain threshold
# retriever = db.as_retriever(
#     search_type="similarity_score_threshold",
#     search_kwargs={'score_threshold': 0.8}
# )

# Active configuration: return just the one best-matching chunk for each query.
retriever = db.as_retriever(search_kwargs=dict(k=1))

#****************************************************************#

### Generation

In [8]:
# Reference: https://python.langchain.com/docs/integrations/chat/huggingface/

from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint

# Hosted text-generation endpoint for the instruct model.
# NOTE(review): presumably requires a Hugging Face access token in the environment — confirm.
llm = HuggingFaceEndpoint(
    repo_id="Qwen/Qwen2.5-7B-Instruct",
    provider="auto",  # let Hugging Face choose the best provider for you
    task="text-generation",
    do_sample=False,  # sampling disabled
    repetition_penalty=1.03,
    max_new_tokens=512,  # upper bound on the length of each generated answer
)

# Chat wrapper around the endpoint; this is the object the QA chain calls.
chat_model = ChatHuggingFace(llm=llm)

In [9]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt text: the retrieved context is injected at {context} and the user's question
# at {question}. The model is told to answer only from that context and to say it
# doesn't know otherwise.
template = (
    "Answer the question based only on the following context. "
    "If 100% out of context, immediately say you don't know.\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
)

prompt = ChatPromptTemplate.from_template(template)

In [10]:
# Reference: https://python.langchain.com/docs/versions/migrating_chains/retrieval_qa/#lcel

from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

def format_docs(docs):
    """Merge retrieved documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` of every document, in order, separated by a blank
        line. An empty iterable yields an empty string.
    """
    passages = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(passages)


# First stage: fan the incoming question out into the two prompt variables.
#   "context"  -> retrieve documents for the question and flatten them to text
#   "question" -> forward the question unchanged
rag_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Full pipeline: build prompt inputs -> fill the prompt -> call the chat model ->
# reduce the model's message to a plain string.
qa_chain = rag_inputs | prompt | chat_model | StrOutputParser()

# user_query = input("Enter your question: ")
# qa_chain.invoke(input=user_query)

In [11]:
# Question about the university the example answer below draws its deadline from.
question_1 = (
    "When is the deadline for international graduate students "
    "at Northeastern University?"
)
qa_chain.invoke(question_1)

"Based on the provided context, the deadline for international graduate students applying to US campuses for the Fall 2026 semester at Northeastern University is August 30. However, please note that deadlines can change, so it's always a good idea to check the most recent information directly from the university's official website."

In [12]:
# Same question about a different university; the prompt tells the model to say it
# doesn't know when the retrieved context does not answer the question.
question_2 = (
    "When is the deadline for international graduate students "
    "at Harvard University?"
)
qa_chain.invoke(question_2)

"I don't know. The provided context does not include information about deadlines for international graduate students at Harvard University. It only lists deadlines for domestic students at unspecified institutions for the Spring 2026 and Fall 2026 semesters."