In [1]:
import os
from langchain_community.retrievers import WikipediaRetriever
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI
import torch
import random
import numpy as np
import pandas as pd
import ast
from datasets import load_dataset
import random
from evaluation import EvaluationMetrics
import sys
import time

os.environ["OPENAI_API_KEY"] = "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"




In [2]:
# This part is for normal RAG pipeline

'''
from langchain.embeddings import OpenAIEmbeddings
from scipy.spatial.distance import cosine
import random

class CustomWikipediaRetriever:
    def __init__(self, k=3, top_m=10, embedding_model=None, threshold=0.7):
        self.retriever = WikipediaRetriever(top_k_results=top_m)
        self.embedding_model = embedding_model or OpenAIEmbeddings()
        self.k = k # Number of documents to sample 
        self.docs = None
        self.doc_embeddings = None
        self.threshold = threshold

    def retrieve(self, query):
        docs = self.retriever.get_relevant_documents(query)
        doc_embeddings = [self.embedding_model.embed_documents([doc.page_content])[0] for doc in docs]
        doc_embeddings = [emb / np.linalg.norm(emb) for emb in doc_embeddings]
        
        selected_idx = [random.choice(range(len(docs)))]
        selected_embeddings = [doc_embeddings[selected_idx[0]]]

        remaining_indices = list(set(range(len(docs))) - {selected_idx[0]})
        while len(selected_idx) < self.k and remaining_indices:
            candidate_idx = random.choice(remaining_indices)
            candidate_embedding = doc_embeddings[candidate_idx]

            similarity_ok = all((1+cosine(candidate_embedding, emb))/2.0 < self.threshold for emb in selected_embeddings)

            if similarity_ok:
                selected_idx.append(candidate_idx)
                selected_embeddings.append(candidate_embedding)

            remaining_indices.remove(candidate_idx)

        selected_docs = [docs[idx] for idx in selected_idx]
        return selected_docs


class QAChain:
    def __init__(self, k: int = 3, top_m: int = 10, threshold: float = 0.7, embedding_model=None):
        self.retriever = CustomWikipediaRetriever(top_m=top_m, k=k, threshold=threshold, embedding_model=embedding_model)
        self.prompt = ChatPromptTemplate.from_template(
            """Answer the question based only on the context provided as short as possible.

            Context: {context}

            Question: {question}"""
        )
        self.llm = ChatOpenAI(model="gpt-3.5-turbo")
        
        self.chain = (
            {"context": self.retrieve_docs, "question": RunnablePassthrough()}
            | self.prompt
            | self.llm
            | StrOutputParser()
        ) 

    def retrieve_docs(self, query):
        
        # for normal RAG pipeline
        docs = self.retriever.retrieve(query)
        return "\n\n".join(doc.page_content for doc in docs)
    
    
    def answer(self, question: str):
        return self.chain.invoke(question)

qa_chain = QAChain(k=3)
query = "What is the capital of France?"
answer = qa_chain.answer(query)
print(answer)
'''


'\nfrom langchain.embeddings import OpenAIEmbeddings\nfrom scipy.spatial.distance import cosine\nimport random\n\nclass CustomWikipediaRetriever:\n    def __init__(self, k=3, top_m=10, embedding_model=None, threshold=0.7):\n        self.retriever = WikipediaRetriever(top_k_results=top_m)\n        self.embedding_model = embedding_model or OpenAIEmbeddings()\n        self.k = k # Number of documents to sample \n        self.docs = None\n        self.doc_embeddings = None\n        self.threshold = threshold\n\n    def retrieve(self, query):\n        docs = self.retriever.get_relevant_documents(query)\n        doc_embeddings = [self.embedding_model.embed_documents([doc.page_content])[0] for doc in docs]\n        doc_embeddings = [emb / np.linalg.norm(emb) for emb in doc_embeddings]\n        \n        selected_idx = [random.choice(range(len(docs)))]\n        selected_embeddings = [doc_embeddings[selected_idx[0]]]\n\n        remaining_indices = list(set(range(len(docs))) - {selected_idx[0]})

In [4]:
# For normal RAG pipeline
'''

qa_chain.answer("who is the founder of quantum physics")
'''

'\n\nqa_chain.answer("who is the founder of quantum physics")\n'

Using similarity threshold as parameter:
similarity_ok = all((1+cosine(candidate_embedding, emb))/2.0 < self.similarity_threshold for emb in selected_embeddings) 

Using distance threshold as parameter:
distance_ok = all((1-cosine(candidate_embedding, emb))/2.0 >= self.distance_threshold for emb in selected_embeddings) 

The above two definitions are equivalent:
distance_threshold =  1 - similarity_threshold

In [5]:
# This part is for evaluation only

from langchain.embeddings import OpenAIEmbeddings
from scipy.spatial.distance import cosine
import random

class CustomWikipediaRetriever:
    def __init__(self, k=3, top_m=10, embedding_model=None):
        self.retriever = WikipediaRetriever(top_k_results=top_m)
        self.embedding_model = embedding_model or OpenAIEmbeddings()
        self.k = k # Number of documents to sample 
        self.docs = None
        self.doc_embeddings = None
    
    def retrieve_with_embeddings(self, query):
        docs = self.retriever.get_relevant_documents(query)
        
        doc_embeddings = [self.embedding_model.embed_documents([doc.page_content])[0] for doc in docs]
        doc_embeddings = [emb/np.linalg.norm(emb) for emb in doc_embeddings]
        
        self.docs = docs
        self.doc_embeddings = doc_embeddings

    def retrieve(self, threshold):
        if not self.docs:
            return []
        selected_idx = [random.choice(range(len(self.docs)))]
        selected_embeddings = [self.doc_embeddings[selected_idx[0]]]
        
        remaining_indices = list(set(range(len(self.docs))) - {selected_idx[0]})
        while len(selected_idx) < self.k and remaining_indices:
            candidate_idx = random.choice(remaining_indices)
            candidate_embedding = self.doc_embeddings[candidate_idx]

            similarity_ok = all((1+cosine(candidate_embedding, emb))/2.0 < threshold for emb in selected_embeddings)
            
            if similarity_ok:
                selected_idx.append(candidate_idx)
                selected_embeddings.append(candidate_embedding)

            remaining_indices.remove(candidate_idx)

        selected_docs = [self.docs[idx] for idx in selected_idx]
        return selected_docs


class QAChain:
    def __init__(self, docs=None):
        self.prompt = ChatPromptTemplate.from_template(
            """Answer the question based only on the context provided as short as possible.

            Context: {context}

            Question: {question}"""
        )
        self.llm = ChatOpenAI(model="gpt-3.5-turbo")
        self.docs = docs # for evaluation only
        
        self.chain = (
            {"context": self.retrieve_docs, "question": RunnablePassthrough()}
            | self.prompt
            | self.llm
            | StrOutputParser()
        ) 


    def retrieve_docs(self, query):
        if not self.docs:
            return ''
        return "\n\n".join(doc.page_content for doc in self.docs)

    
    def answer(self, question: str):
        return self.chain.invoke(question)

retriever=CustomWikipediaRetriever()
query = "who is the founder of quantum physics"
retriever.retrieve_with_embeddings(query)
ls = [0.7,0.3]
for th in ls:
    print(th)
    docs = retriever.retrieve(th)
    qa_chain = QAChain(docs=docs)
    print(qa_chain.answer(query))


  self.embedding_model = embedding_model or OpenAIEmbeddings()
  docs = self.retriever.get_relevant_documents(query)


0.7
Richard Phillips Feynman is credited as one of the founders of quantum physics.
0.3
Lev Artsimovich


In [6]:
th_test = [0, 0.3, 0.5, 0.6, 0.7, 0.75, 0.8, 0.83, 0.85, 0.88, 0.9, 0.92, 0.95, 0.98]
num_sample = 500
num_times = 3
rng = 42

In [7]:
eval = EvaluationMetrics()

In [8]:
from get_dataset import get_nq, get_tqa, get_squad, get_asqa

nq = get_nq()
tqa = get_tqa()
squad = get_squad()
asqa = get_asqa()


In [9]:
name_to_ds = {"NQ": nq, "TriviaQA": tqa, "SQuAD": squad, "ASQA": asqa}

def evaluate(name):
    ds = name_to_ds[name]
    os.makedirs("threshold_results", exist_ok=True)

    
    random.seed(rng)
    selected_idx = random.sample(range(ds.shape[0]), num_sample)
    
    n = len(th_test)
    
    candidates = [[] for _ in range(n)]
    references = []
    results = []
    
    for i in selected_idx:
        q = ds.loc[i, "question"]
        a = ds.loc[i, "answer"]
        retriever = CustomWikipediaRetriever()
        retriever.retrieve_with_embeddings(q)
        for j in range(n):
            th = th_test[j]
            c = []
            for _ in range(num_times):
                docs = retriever.retrieve(th)
                qa_chain = QAChain(docs=docs)
                answer = qa_chain.answer(q)
                c.append(answer)
                time.sleep(0.03)
                results.append({"idx in original dataset":i, "question":q , "similarity threshold": th, "num_times":_, "retrieved docs": docs, "generated answer": answer, "standard answer": a})
            candidates[j].append(c) 
        references.append(a)
    results = pd.DataFrame(results)
    results.to_csv(f'threshold_results/{name}.csv')
    
    rougeL, diversity = eval.cal_scores(candidates, references)
        
    with open(f'threshold_results/{name}.txt', 'w') as file:
        sys.stdout = file
        print("The top-k document k are:", th_test)
        print("rougeL score for different k is:", rougeL)
        print("diversity score for different k is:", diversity)
    sys.stdout = sys.__stdout__

    

In [10]:
datasets = ["NQ", "TriviaQA", "SQuAD", "ASQA"]

for ds in datasets:
    evaluate(ds)