In [19]:

import os
from typing import List, Dict, Any
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from rank_bm25 import BM25Okapi
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain_ollama import OllamaLLM
from langchain.chains import RetrievalQA
from langchain.schema import Document, BaseRetriever


In [2]:
def load_and_split_data(directory_path: str) -> List[str]:
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,  # Smaller chunks
        chunk_overlap=100,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    docs = []
    for file in os.listdir(directory_path):
        if file.endswith(".txt"):
            with open(os.path.join(directory_path, file), 'r') as f:
                text = f.read()
                docs.extend(text_splitter.split_text(text))
    return docs

In [3]:
context_docs = load_and_split_data('./data')
print(len(context_docs))

4254


In [4]:
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')
vector_store = FAISS.from_texts(context_docs, embeddings)

In [5]:
reranker_model = AutoModelForSequenceClassification.from_pretrained("cross-encoder/ms-marco-MiniLM-L-12-v2")
reranker_tokenizer = AutoTokenizer.from_pretrained("cross-encoder/ms-marco-MiniLM-L-12-v2")

def rerank_documents(query: str, docs: List[str], top_k: int = 5) -> List[str]:
    pairs = [[query, doc] for doc in docs]
    inputs = reranker_tokenizer(pairs, padding=True, truncation=True, return_tensors="pt", max_length=512)
    
    with torch.no_grad():
        scores = reranker_model(**inputs).logits.squeeze()
    
    ranked_results = sorted(zip(docs, scores), key=lambda x: x[1], reverse=True)
    return [doc for doc, score in ranked_results[:top_k]]


In [6]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

query_expansion_tokenizer = T5Tokenizer.from_pretrained("BeIR/query-gen-msmarco-t5-base-v1")
query_expansion_model = T5ForConditionalGeneration.from_pretrained("BeIR/query-gen-msmarco-t5-base-v1")

def expand_query(query: str, num_expansions: int = 3) -> List[str]:
    input_ids = query_expansion_tokenizer.encode(f"Expand the query: {query}", return_tensors="pt", max_length=64, truncation=True)
    outputs = query_expansion_model.generate(
        input_ids=input_ids,
        max_length=64,
        num_return_sequences=num_expansions,
        num_beams=num_expansions,
        no_repeat_ngram_size=2
    )
    expanded_queries = [query_expansion_tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    return [query] + expanded_queries

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [7]:
bm25_retriever = BM25Retriever.from_texts(context_docs)
bm25_retriever.k = 5  # Set the number of documents to retrieve

faiss_retriever = vector_store.as_retriever(search_kwargs={"k": 5})

ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, faiss_retriever],
    weights=[0.5, 0.5]
)

In [8]:
def improved_retrieval(query: str, top_k: int = 5) -> List[str]:
    # Query expansion
    expanded_queries = expand_query(query)
    
    all_docs = []
    for q in expanded_queries:
        # Hybrid retrieval
        docs = ensemble_retriever.get_relevant_documents(q)
        all_docs.extend([doc.page_content for doc in docs])
    
    # Remove duplicates
    unique_docs = list(set(all_docs))
    
    # Re-ranking
    reranked_docs = rerank_documents(query, unique_docs, top_k)
    
    return reranked_docs

In [14]:
llama_llm = OllamaLLM(model="llama3.2")

In [20]:
class ImprovedRetriever(BaseRetriever):
    def get_relevant_documents(self, query: str) -> List[Document]:
        docs = improved_retrieval(query)
        return [Document(page_content=doc) for doc in docs]

    async def aget_relevant_documents(self, query: str) -> List[Document]:
        return self.get_relevant_documents(query)

improved_retriever = ImprovedRetriever()

  class ImprovedRetriever(BaseRetriever):
  class ImprovedRetriever(BaseRetriever):


In [21]:
from langchain.prompts import FewShotPromptTemplate, PromptTemplate

few_shot_examples = [
    {"question": "What is the name of the annual pickle festival held in Pittsburgh?", "answer": "Picklesburgh"},
    {"question": "When was the Pittsburgh Soul Food Festival established?", "answer": "2019"},
    {"question": "Who is performing the Opera event on 7th November?", "answer": "Cavalleria Rusticana and Pagliacci"},
    {"question": "When is the Syracuse Orange vs. Robert Morris Colonials women’s ice hockey game scheduled?", "answer": "February 8, 2025"},
    {"question": "How many Super Bowls have the Pittsburgh Steelers won?", "answer": "Six"},
    {"question": "Where is the \"Like, Totally Transformative: CMU in the 1980s\" exhibit being held?", "answer": "Hunt Library Gallery"},
#     {"question": "What is the most famous viewpoint in Pittsburgh?", "answer": "Mount Washington"},
#     {"question": "Which musician known for \"Black and Yellow\" hails from Pittsburgh?", "answer": "Wiz Khalifa"},
#     {"question": "What nickname is often used for Pittsburgh?", "answer": "The Steel City"},
#     {"question": "What is the name of the annual music festival in Pittsburgh?", "answer": "Pittsburgh Taco Festival"}
]

example_prompt = PromptTemplate(
    input_variables=["question", "answer"],
    template="Question: {question}\nAnswer: {answer}"
)

few_shot_prompt = FewShotPromptTemplate(
    examples=few_shot_examples,
    example_prompt=example_prompt,
    prefix="Answer the following question based on the context provided. Keep the answer as short as possible. Do not include any other information. Here are some examples:",
    suffix="Context: {context}\nQuestion: {question}\nAnswer:",
    input_variables=["context", "question"],
    example_separator="\n\n"
)

In [22]:
qa_chain = RetrievalQA.from_chain_type(
    llm=llama_llm,
    chain_type="stuff",
    retriever=improved_retriever,
    chain_type_kwargs={"prompt": few_shot_prompt}
)

In [23]:
def get_answer(question):
    result = qa_chain.invoke({"query": question})
    return result["result"]

In [32]:
query = "Which event is happening at Benedum Center on January 7, 2025?"
result = get_answer(query)
print(result)

No events listed


In [None]:
def get_top_k_relevant_documents(query, k=3):
    docs = improved_retriever.get_relevant_documents(query)
    return docs[:k]

top_k_docs = get_top_k_relevant_documents(query, k=5)

print(f"Top {len(top_k_docs)} relevant documents for the query: '{query}'")
for i, doc in enumerate(top_k_docs, 1):
    print(f"\nDocument {i}:")
    print(doc.page_content[:200] + "...")  # Print first 200 characters of each document


In [33]:
# Load questions and answers from separate files
with open("./test/questions.txt", "r") as f:
    questions = [line.strip() for line in f]

with open("./test/reference_answers.txt", "r") as f:
    answers = [line.strip() for line in f]

# Combine questions and answers
test_qa_data = [{"question": q, "answer": a} for q, a in zip(questions, answers)]

In [39]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize
from sklearn.metrics import f1_score
import numpy as np
import string

def calculate_metrics(predicted, actual):
    predicted = predicted.strip().lower()
    actual_answers = [ans.strip().lower() for ans in actual.split(';')]
    
    predicted = predicted.translate(str.maketrans('', '', string.punctuation))
    actual_answers = [ans.translate(str.maketrans('', '', string.punctuation)) for ans in actual_answers]
    
    predicted_tokens = word_tokenize(predicted)
    actual_tokens = [word_tokenize(ans) for ans in actual_answers]
    
    exact_match = any(predicted == ans for ans in actual_answers)
    
    f1_scores = []
    for ans in actual_tokens:
        true_positives = len(set(predicted_tokens) & set(ans))
        precision = true_positives / len(predicted_tokens) if predicted_tokens else 0
        recall = true_positives / len(ans) if ans else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        f1_scores.append(f1)
    
    max_f1 = max(f1_scores)
    
    answer_recall = any(all(word in predicted_tokens for word in ans) for ans in actual_tokens)
    
    return exact_match, max_f1, answer_recall


In [40]:
results = []
for qa in test_qa_data:
    question = qa['question']
    actual_answer = qa['answer']
    predicted_answer = qa_chain.invoke({"query": question})['result']
    
    exact_match, f1, answer_recall = calculate_metrics(predicted_answer, actual_answer)
    
    results.append({
        'question': question,
        'predicted_answer': predicted_answer,
        'actual_answer': actual_answer,
        'exact_match': exact_match,
        'f1_score': f1,
        'answer_recall': answer_recall
    })

average_exact_match = np.mean([r['exact_match'] for r in results])
average_f1 = np.mean([r['f1_score'] for r in results])
average_answer_recall = np.mean([r['answer_recall'] for r in results])

print(f"Average Exact Match: {average_exact_match:.4f}")
print(f"Average F1 Score: {average_f1:.4f}")
print(f"Average Answer Recall: {average_answer_recall:.4f}")

for r in results:
    print(f"\nQuestion: {r['question']}")
    print(f"Predicted Answer: {r['predicted_answer']}")
    print(f"Actual Answer: {r['actual_answer']}")
    print(f"Exact Match: {r['exact_match']}")
    print(f"F1 Score: {r['f1_score']:.4f}")
    print(f"Answer Recall: {r['answer_recall']}")

Average Exact Match: 0.3190
Average F1 Score: 0.5180
Average Answer Recall: 0.5259

Question: What is the name of the Carnegie Mellon University mascot?
Predicted Answer: Scotty
Actual Answer: Scotty;Scotty the Scottish Terrier
Exact Match: True
F1 Score: 1.0000
Answer Recall: True

Question: What conference do the Carnegie Mellon Tartans football team compete in?
Predicted Answer: Presidents' Athletic Conference
Actual Answer: Presidents' Athletic Conference (PAC); Presidents' Athletic Conference (PAC), NCAA Division III; Presidents' Athletic Conference
Exact Match: True
F1 Score: 1.0000
Answer Recall: True

Question: What are the main professional sports teams in Pittsburgh?
Predicted Answer: Pittsburgh Pirates (MLB), Pittsburgh Steelers (NFL), and Pittsburgh Penguins (NHL)
Actual Answer: Pittsburgh Steelers (NFL), Pittsburgh Penguins (NHL), Pittsburgh Pirates (MLB); Pittsburgh Steelers, Pittsburgh Penguins and Pittsburgh Pirates; Steelers, Penguins, Pirates
Exact Match: False
F1 Sco

In [109]:
import pandas as pd

questions_df = pd.read_csv('test_questions.csv')

questions_df['Answer'] = ''

for index, row in questions_df.iterrows():
    query = row['Question']
    result = get_answer(query)
    questions_df.at[index, 'Answer'] = result

questions_df.to_csv('actual_test.csv', index=False)

print(questions_df)

                                              Question  \
0    What bank, which is the 5th largest in the US,...   
1               How many bridges does Pittsburgh have?   
2                    Who named the city of Pittsburgh?   
3    At what park do the three rivers converge in P...   
4         How many neighborhoods does Pittsburgh have?   
..                                                 ...   
569  What is the primary focus of the event at the ...   
570  Where and when is the Pittsburgh Veg Fair held...   
571  How can restaurants get involved with Pittsbur...   
572  What are the benefits of sponsoring the Pittsb...   
573  What is the recommended action for attendees w...   

                                                Answer  
0                                                  PNC  
1                                                 2020  
2                                 General John Forbes.  
3                                    Point State Park.  
4                 