In [1]:
import pandas as pd
import numpy as np
import os
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS

# Notebook-wide pandas display settings: show up to 150 rows and 50 columns,
# and never truncate cell contents (max_colwidth=None).
DISPLAY_OPTIONS = {
    'display.max_rows': 150,
    'display.max_columns': 50,
    'display.max_colwidth': None,
}
for _option_name, _option_value in DISPLAY_OPTIONS.items():
    pd.set_option(_option_name, _option_value)

In [4]:
# Parameter grid constants for the vector-store experiments.
# NOTE(review): the store names listed in the results table look like
# "<FILE_READING>_limit<DOCUMENT_PART_LEN_LIMIT>_chunk<size>_overlap<overlap>";
# presumably these constants drive that naming scheme -- confirm against the
# notebook/script that builds the stores.
FILE_READING = ['Text', 'Chapter', 'Paragraph']
DOCUMENT_PART_LEN_LIMIT = 500
CHUNK_OVERLAPS = [0]
CHUNK_SIZES = [
    50,
    100,
    200,
    500,
    750,
    1000,
]

In [12]:
# Names of the saved vector stores: one sub-directory of ./vectorstore_test/ each.
# - os.listdir() returns entries in arbitrary order, so sort them to keep the
#   row order of downstream result tables reproducible across machines/runs.
# - Keep directories only, so stray files (.DS_Store, .ipynb_checkpoints
#   leftovers, ...) are not later passed to FAISS.load_local.
VECTOR_STORE_DIR = "./vectorstore_test/"
VECTOR_STORES = sorted(
    entry
    for entry in os.listdir(VECTOR_STORE_DIR)
    if os.path.isdir(os.path.join(VECTOR_STORE_DIR, entry))
)

In [14]:
#Read vector stores
# One embedding model instance is shared by every store; it is what each
# FAISS index uses to embed incoming queries.
# NOTE(review): presumably this must be the same model the stores were built
# with -- confirm against the indexing notebook.
embeddings = SentenceTransformerEmbeddings(model_name="bert-large-nli-mean-tokens")

# Map store name -> loaded FAISS index.
databases = {}
for vector_store in VECTOR_STORES:
    store_path = os.path.join("./vectorstore_test/", vector_store)
    databases[vector_store] = FAISS.load_local(store_path, embeddings)

In [15]:
#Read and filter questions
# Only questions about these documents are evaluated.
# NOTE(review): presumably these are the documents present in the test vector
# stores -- confirm.
EVALUATION_DOCUMENTS = [
    'SD.00014', 'CEP-036_D', 'CS-ROUTING', 'SD.00052_-', 'PS.50005-05_E',
    'CEP.00018_no STL HARM', 'MS.90504', 'MS.50010', 'CEP.00018',
    'PS.50010S_RevB-IT', 'PS.50010S', 'MS.90170', 'MS.50002', 'SD-A1333',
]
# `question_id` values kept for the evaluation.
evaluation_questions = [0,1,2,3,4,5,6,14,15]

# Keep the raw frame untouched so the filtering below can be re-run or changed
# without re-reading the file.
qa_raw = pd.read_json(path_or_buf="./gold_questions.jsonl", lines=True)

qa = (
    qa_raw
    # `documents` holds a sequence per question; keep its first element as the
    # gold document. `.str[0]` yields NaN for an empty/missing entry instead of
    # raising, and such rows are then dropped by the `isin` filter below.
    .assign(documents=qa_raw['documents'].str[0])
    .loc[lambda df: df['documents'].isin(EVALUATION_DOCUMENTS)
                    & df['question_id'].isin(evaluation_questions)]
    # Explicit copy: later cells add columns to `qa`; without it pandas treats
    # `qa` as a slice of the raw frame and emits SettingWithCopyWarning.
    .copy()
)

In [18]:
#Calculate accuracies for different k
# accuracy@k = percentage of evaluation questions whose gold document
# (qa['documents']) is the source file of at least one of the top-k chunks
# returned by a vector store.
K_VALUES = [1, 5, 10]
MAX_K = max(K_VALUES)


def retrieve_sources(database, question, k):
    """Return [(filename, chapter), ...] for the top-k chunks matching `question`.

    `database` is one of the loaded vector stores. Chunks whose metadata has no
    "chapter" entry get the placeholder "N/A"; a missing "filename" raises
    KeyError.
    """
    return [
        (hit.metadata["filename"], hit.metadata.get("chapter", "N/A"))
        for hit in database.similarity_search(question, k=k)
    ]


# Search each store ONCE per question at the largest k and evaluate the smaller
# k values on prefixes of that ranking, instead of re-embedding and re-searching
# every question for every k.
# NOTE(review): this assumes the top-k results are the first k of the top-MAX_K
# results (i.e. the store returns a ranked nearest-neighbour list) -- confirm if
# a non-default index or search filter is ever used.
for vector_store in VECTOR_STORES:
    database = databases[vector_store]
    retrieved = qa['question'].apply(
        lambda question: retrieve_sources(database, question, MAX_K)
    )
    # assign() builds a new frame rather than writing into a filtered slice;
    # each store's column ends up holding its top-MAX_K results.
    qa = qa.assign(**{vector_store: retrieved})

# Rows are emitted k-major (all stores for k=1, then k=5, ...).
acc = []
for k in K_VALUES:
    for vector_store in VECTOR_STORES:
        hits = qa.apply(
            lambda row: row['documents'] in [filename for filename, _ in row[vector_store][:k]],
            axis=1,
        )
        acc.append({
            "vector_store": vector_store,
            "k": k,
            "accuracy": round(100 * hits.sum() / qa.shape[0], 1),
        })
pd.DataFrame(acc)

Unnamed: 0,vector_store,k,accuracy
0,Chapter_limit500_chunk1000_overlap0,1,48.4
1,Paragraph_limit500_chunk1000_overlap0,1,41.9
2,Text_limit500_chunk1000_overlap0,1,25.8
3,Chapter_limit500_chunk1000_overlap0,5,74.2
4,Paragraph_limit500_chunk1000_overlap0,5,83.9
5,Text_limit500_chunk1000_overlap0,5,67.7
6,Chapter_limit500_chunk1000_overlap0,10,87.1
7,Paragraph_limit500_chunk1000_overlap0,10,90.3
8,Text_limit500_chunk1000_overlap0,10,83.9


In [None]:
#Manually validate query search
# Show the top-k chunks a single store returns for one question, next to the
# question's gold answer and location, for eyeballing retrieval quality.
q_id = 2  # row POSITION in `qa` (used with .iloc below), not a `question_id` value
vector_store = 'Chapter_limit500_chunk1000_overlap0'
k=20

INSPECT_COLUMNS = ['question','ground_truth','documents','paragraphs_location']
q = qa.iloc[q_id][INSPECT_COLUMNS]
res = databases[vector_store].similarity_search(q['question'], k=k)

# Print the selected question fields as a plain list, then display the hits.
print(list(q))
pd.DataFrame(
    {
        'context': [r.page_content for r in res],
        'document': [r.metadata.get('filename','NA') for r in res],
        'chapter': [r.metadata.get('chapter','NA') for r in res],
    }
)