# Setup

In [42]:
import os, json

from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings.base import Embeddings

import numpy as np
import pandas as pd

In [53]:
# Paths are relative to the notebook's working directory.
MAIN_DIR = ".."
EMB_DIR = os.path.join(MAIN_DIR, "data", "emb_store", "uc", "faiss", "text-embedding-ada-002")

# The OpenAI key is read from a local JSON file and exported through the
# environment, so it never has to be written into the notebook itself.
with open(os.path.join(MAIN_DIR, "auth", "api_keys.json"), "r") as f:
    api_keys = json.load(f)

os.environ["OPENAI_API_KEY"] = api_keys["OPENAI_API_KEY"]

# One question per line. Line terminators are removed so the questions do not
# carry a trailing "\n" into the CSV reports written further down.
with open(os.path.join(MAIN_DIR, "data", "queries", "uc_all.txt"), "r") as f:
    test_cases_txt = [line.rstrip("\r\n") for line in f]

# Pre-computed embeddings; paired with the questions by position below.
with open(os.path.join(MAIN_DIR, "data", "queries", "uc_all_emb.json"), "r") as f:
    test_cases_emb = json.load(f)

# zip() would silently drop the surplus entries of the longer input, which could
# hide a question/embedding misalignment, so a length mismatch is an error.
if len(test_cases_txt) != len(test_cases_emb):
    raise ValueError(
        f"Question/embedding count mismatch: {len(test_cases_txt)} questions "
        f"vs {len(test_cases_emb)} embeddings"
    )

# Each test case is a (question text, embedding vector) tuple.
test_cases = list(zip(test_cases_txt, test_cases_emb))
print("Number of test cases:", len(test_cases))
print("Length of embeddings:", len(test_cases[0][1]))

Number of test cases: 30
Length of embeddings: 1536


# Inspect Single Database

In [37]:
# A single embeddings client is shared by the query helper and the loaded index
# instead of constructing two separate OpenAIEmbeddings objects.
openai_embeddings = OpenAIEmbeddings()
embedding_function = openai_embeddings.embed_query

# Load the persisted FAISS store that the single-database analysis inspects.
docsearch = FAISS.load_local(
    os.path.join(EMB_DIR, "v8-add-tables_2500_500"),
    openai_embeddings,
)

In [52]:
# Number of nearest documents retrieved for every test question.
k = 10

# Column-oriented accumulator: one list per output column, one entry per question.
info = {
    "question": [],
    "average_score": [],
    "min_score": [],
    "max_score": [],
}

# One text/score column pair per retrieved rank. Sized from k (not a literal 10)
# so the columns always match the search depth used below.
for rank in range(1, k + 1):
    info[f"Doc {rank} text"] = []
    info[f"Doc {rank} score"] = []

for question, embedding in test_cases:
    hits = docsearch.similarity_search_with_score_by_vector(embedding, k=k)
    # NOTE(review): scores are whatever the store returns — presumably a distance
    # for a default FAISS index (lower = closer); confirm before interpreting.
    scores = [score for _, score in hits]

    info["question"].append(question)
    # An empty result would make np.min/np.max raise; record NaN instead.
    info["average_score"].append(np.mean(scores) if scores else np.nan)
    info["min_score"].append(np.min(scores) if scores else np.nan)
    info["max_score"].append(np.max(scores) if scores else np.nan)

    # Fill every rank column for this question. Ranks beyond the number of hits
    # are padded so all columns keep the same length and the DataFrame can be built.
    for rank in range(1, k + 1):
        if rank <= len(hits):
            doc, score = hits[rank - 1]
            info[f"Doc {rank} text"].append(doc.page_content)
            info[f"Doc {rank} score"].append(score)
        else:
            info[f"Doc {rank} text"].append(None)
            info[f"Doc {rank} score"].append(np.nan)

df = pd.DataFrame(info)

# exist_ok=True already makes directory creation idempotent; no pre-check needed.
save_folder = os.path.join(MAIN_DIR, "artifacts", "similarity-search-analysis-text-embedding-ada-002")
os.makedirs(save_folder, exist_ok=True)
df.to_csv(os.path.join(save_folder, "summary.csv"),
          header=True)

# Compare Multiple Databases

In [59]:
# Directory names (under EMB_DIR) of the FAISS stores to compare. The
# second-to-last "_"-separated token of each name is used as the column prefix
# below (presumably the chunk size in a <version>_<chunk size>_<overlap>
# naming scheme — confirm).
docstores = [
    "v6-add-tables_750_100",
    "v7-add-tables_1000_200",
    "v8-add-tables_2500_500",
    "v9-add-tables_1500_300"
]

# Column-oriented accumulator; the question column is shared by all stores.
info = {
    "question": [question for question, _ in test_cases],
}

for docstore in docstores:
    # Bound to its own name so the notebook-level `docsearch` used by the
    # single-database analysis is not silently replaced by the last store here.
    store_index = FAISS.load_local(
        os.path.join(EMB_DIR, docstore),
        OpenAIEmbeddings()
    )
    chunk_size = docstore.split("_")[-2]
    avg_col = f"{chunk_size}_average_score"
    min_col = f"{chunk_size}_min_score"
    max_col = f"{chunk_size}_max_score"
    # NOTE(review): two stores sharing the same name token would overwrite each
    # other's columns here.
    info[avg_col] = []
    info[min_col] = []
    info[max_col] = []

    for _, embedding in test_cases:
        # `k` is the retrieval depth defined in the single-database analysis cell.
        hits = store_index.similarity_search_with_score_by_vector(embedding, k=k)
        scores = [score for _, score in hits]
        # An empty result would make np.min/np.max raise; record NaN instead.
        info[avg_col].append(np.mean(scores) if scores else np.nan)
        info[min_col].append(np.min(scores) if scores else np.nan)
        info[max_col].append(np.max(scores) if scores else np.nan)

df_scores = pd.DataFrame(info)

# exist_ok=True already makes directory creation idempotent; no pre-check needed.
save_folder = os.path.join(MAIN_DIR, "artifacts", "similarity-search-analysis-text-embedding-ada-002")
os.makedirs(save_folder, exist_ok=True)

df_scores.to_csv(os.path.join(save_folder, "compare_database.csv"),
                 header=True)