In [1]:
import pandas as pd

# Locations of the precomputed artifacts, given relative to the notebook's working directory.
# NOTE(review): "embedddings" (three d's) looks like a typo, but the string has to match
# the file name on disk -- confirm before renaming either one.
data_with_semantic_vectors_path = "../data/dense_embedddings_with_index.parquet"
data_with_tfidf_vectors_path = "../data/sparse_embeddings_with_index.pkl"

In [29]:
# Load the dense-embedding frame (parquet) and the TF-IDF frame (pickle).
# SECURITY: read_pickle can execute arbitrary code while unpickling; only load files
# that you produced yourself or otherwise trust.
semantic_df = pd.read_parquet(data_with_semantic_vectors_path)
tfidf_df = pd.read_pickle(data_with_tfidf_vectors_path)
# Column-wise concat lines rows up by index label.
# NOTE(review): assumes both frames carry the same index; labels present in only one
# frame would yield NaN-padded rows -- confirm.
df_combined = pd.concat([semantic_df, tfidf_df], axis=1)
# Derive the abbreviation from the index label: keep the text before the first "/",
# then the piece after its last "_" (e.g. "Single_STT/2013/page_54.pdf-4" -> "STT").
df_combined["Abbreviation"] = df_combined.index.str.split("/").str[0].str.split("_").str[-1]


In [3]:
# Preview the merged frame (pandas elides the middle rows of a long frame).
df_combined

Unnamed: 0,Context,Question,Dialogue,embedding_context,embedding_question,Context_TFIDF_Vector
Single_STT/2013/page_54.pdf-4,"Pre-Text:[""shareholder return performance pres...",how much higher are the returns of the s&p 500...,what is the fraction change of the investment ...,"[0.016610916703939438, -0.0038285385817289352,...","[-0.014002669602632523, -0.0030809114687144756...",<Compressed Sparse Row sparse matrix of dtype ...
Single_STT/2011/page_94.pdf-3,Pre-Text:['we maintain an effective universal ...,what was the percent change in the value of co...,what was the value of commercial paper outstan...,"[-0.013737378641963005, -0.01657671481370926, ...","[-0.0014009354636073112, -0.013579033315181732...",<Compressed Sparse Row sparse matrix of dtype ...
Single_STT/2014/page_69.pdf-2,Pre-Text:['management 2019s discussion and ana...,what is the percentage change in the average t...,what was the value of average short term advan...,"[-0.008708413690328598, -0.007725966162979603,...","[-0.028621919453144073, 0.022226709872484207, ...",<Compressed Sparse Row sparse matrix of dtype ...
Single_STT/2009/page_122.pdf-4,"Pre-Text:['note 10 .', 'commitments and contin...",what is the percent change in asset purchase a...,what was the total in asset purchase agreement...,"[-0.007598159369081259, -0.0395529605448246, -...","[-0.020172208547592163, 0.010079036466777325, ...",<Compressed Sparse Row sparse matrix of dtype ...
Single_STT/2013/page_175.pdf-2,Pre-Text:['state street corporation notes to c...,what is the percentage change in the balance o...,what was the total in asset purchase agreement...,"[-0.0009236071491613984, -0.027679165825247765...","[-0.02635018341243267, 0.01037721149623394, -0...",<Compressed Sparse Row sparse matrix of dtype ...
...,...,...,...,...,...,...
Single_AMAT/2012/page_37.pdf-2,Pre-Text:['performance graph the performance g...,for how many common stock shares did the compa...,what was the product of the dividend paid per ...,"[-0.0008334789890795946, -0.01523605827242136,...","[-0.017676647752523422, -0.008252725005149841,...",<Compressed Sparse Row sparse matrix of dtype ...
Single_AMAT/2014/page_37.pdf-2,Pre-Text:['performance graph the performance g...,how many shares received dividends during 2014...,what is the yearly dividend per share in 2014?...,"[-0.0010122329695150256, -0.015447970479726791...","[-0.018255062401294708, 0.018669361248612404, ...",<Compressed Sparse Row sparse matrix of dtype ...
Single_AMAT/2015/page_14.pdf-2,Pre-Text:['backlog applied manufactures system...,what is the growth rate in the segment of disp...,what was the display value in 2015?\n-what was...,"[-0.003818386932834983, 0.015724772587418556, ...","[-0.010050210170447826, 0.015472985804080963, ...",<Compressed Sparse Row sparse matrix of dtype ...
Double_AMAT/2015/page_33.pdf,Pre-Text:['performance graph the performance g...,what is the yearly rate of return of s&p500 if...,what is the net change in value of an investme...,"[-0.005452048033475876, -0.012531572952866554,...","[-0.03288188576698303, -0.02652997523546219, -...",<Compressed Sparse Row sparse matrix of dtype ...


## Recall with only Contextual Embeddings

In [13]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Alias the merged frame; `df` refers to the same object as df_combined (no copy is made).
df = df_combined

# Stack the per-row embedding vectors into 2-D arrays, one row per DataFrame row.
context_embeddings = np.vstack(df["embedding_context"].values)   # Shape: (num_contexts, embedding_dim)
question_embeddings = np.vstack(df["embedding_question"].values) # Shape: (num_questions, embedding_dim)

# Cosine similarity of every question against every context: entry [i, j] scores
# question i against context j, so the diagonal holds each question's own context.
similarity_matrix = cosine_similarity(question_embeddings, context_embeddings)  # Shape: (num_questions, num_contexts)

# Recall@K: share of questions whose own context is ranked among the top K
def compute_recall_at_k(similarity_matrix, k):
    """Return recall@k for a question-by-context similarity matrix.

    Row ``i`` holds the similarity of question ``i`` to every context, and the
    correct context for question ``i`` is taken to be column ``i`` (the same
    DataFrame row).

    Args:
        similarity_matrix: 2-D NumPy array of shape (num_questions, num_contexts).
        k: Number of top-ranked contexts inspected per question.

    Returns:
        Fraction of questions (a float in [0, 1]) whose own context is among
        the ``k`` highest-scoring contexts. Returns 0.0 when the matrix has no
        rows instead of dividing by zero.
    """
    num_questions = similarity_matrix.shape[0]
    if num_questions == 0:
        # Nothing to evaluate; recall is reported as 0.0 rather than raising.
        return 0.0

    # Column indices of the k best contexts for every row, best first.
    top_k_indices = np.argsort(similarity_matrix, axis=1)[:, ::-1][:, :k]

    # A question is a hit when its own row index appears among its top-k columns.
    own_index = np.arange(num_questions)[:, np.newaxis]
    correct_matches = int((top_k_indices == own_index).any(axis=1).sum())

    return correct_matches / num_questions

# Cut-offs at which embedding-only recall is reported
k_values = [1, 3, 5, 10, 20, 40]
# Build the {"Recall@k": score} mapping from (label, score) pairs, in k order
recall_scores = dict(
    (f"Recall@{k}", compute_recall_at_k(similarity_matrix, k))
    for k in k_values
)

# Show the embedding-only scores
print(recall_scores)


{'Recall@1': 0.2128463476070529, 'Recall@3': 0.3677581863979849, 'Recall@5': 0.44584382871536526, 'Recall@10': 0.5516372795969773, 'Recall@20': 0.6492443324937027, 'Recall@40': 0.75}


## Recall with TF-IDF Vectors only

In [19]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Work on the merged frame; this cell reads its 'Context' and 'Question' columns.
df = df_combined

# Step 1: Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Steps 2-3: Learn the vocabulary and IDF weights from the contexts only and vectorize
# them in the same pass. fit_transform is equivalent to fit followed by transform on
# the same corpus, but tokenizes the contexts once instead of twice.
context_tfidf = vectorizer.fit_transform(df["Context"].tolist())
# Questions are projected with the context-fitted vectorizer (never fitted on questions),
# so terms that occur only in questions do not contribute.
question_tfidf = vectorizer.transform(df["Question"].tolist())

# Step 4: Compute cosine similarity (lexical similarity between questions and contexts);
# entry [i, j] scores question i against context j.
lexical_similarity_matrix = cosine_similarity(question_tfidf, context_tfidf)

# Step 5: Function to compute recall@K for lexical search
def compute_recall_at_k_lexical(similarity_matrix, k):
    """Return recall@k for a lexical (TF-IDF) question-by-context similarity matrix.

    The correct context for question ``i`` is taken to be column ``i``.

    Args:
        similarity_matrix: 2-D NumPy array of shape (num_questions, num_contexts).
        k: Number of top-ranked contexts inspected per question.

    Returns:
        Fraction of questions (a float in [0, 1]) whose own context is among
        the ``k`` highest-scoring contexts. Returns 0.0 when the matrix has no
        rows instead of dividing by zero.
    """
    num_questions = similarity_matrix.shape[0]
    if num_questions == 0:
        # Nothing to evaluate; recall is reported as 0.0 rather than raising.
        return 0.0

    # Count the questions whose own index shows up among their k best contexts
    # (argsort is ascending, so reverse it before slicing the top k).
    correct_matches = sum(
        i in np.argsort(similarity_matrix[i])[::-1][:k]
        for i in range(num_questions)
    )

    return correct_matches / num_questions

# Step 6: Score the lexical similarity matrix at each cut-off
k_values = [1, 3, 5, 10, 20, 40]
# Pair every "Recall@k" label with its score, keeping the order of k_values
lexical_recall_scores = dict(
    zip(
        (f"Recall@{k}" for k in k_values),
        (compute_recall_at_k_lexical(lexical_similarity_matrix, k) for k in k_values),
    )
)

# Step 7: Report the lexical scores
print("Lexical Search Recall Scores:", lexical_recall_scores)


Lexical Search Recall Scores: {'Recall@1': 0.24181360201511334, 'Recall@3': 0.3973551637279597, 'Recall@5': 0.47858942065491183, 'Recall@10': 0.5976070528967254, 'Recall@20': 0.7052896725440806, 'Recall@40': 0.8047858942065491}


## Hybrid Search Recall

In [20]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Work on the merged frame; this cell reads its 'Context', 'Question',
# 'embedding_context' and 'embedding_question' columns.
df = df_combined

# Step 1: Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Steps 2-3: Learn the vocabulary and IDF weights from the contexts only and vectorize
# them in the same pass (fit_transform is equivalent to fit followed by transform on the
# same corpus, but tokenizes the contexts once). Questions reuse the fitted vectorizer.
context_tfidf = vectorizer.fit_transform(df["Context"].tolist())
question_tfidf = vectorizer.transform(df["Question"].tolist())

# Step 4: Stack the precomputed embedding vectors into 2-D arrays, one row per DataFrame row
context_embeddings = np.vstack(df["embedding_context"].values)  # Context embeddings
question_embeddings = np.vstack(df["embedding_question"].values)  # Question embeddings

# Step 5: Concatenate the TF-IDF features and the embeddings side by side.
# .toarray() turns the sparse TF-IDF matrix into a dense (rows x vocabulary) array,
# which can be memory-hungry for a large vocabulary.
# NOTE(review): in the concatenated vector each part contributes in proportion to its
# norm; TF-IDF rows are L2-normalised by TfidfVectorizer's default, and the embeddings
# are presumably unit-norm as well -- confirm if the weighting matters.
context_combined = np.hstack([context_tfidf.toarray(), context_embeddings])
question_combined = np.hstack([question_tfidf.toarray(), question_embeddings])

# Step 6: Compute cosine similarity on the concatenated (lexical + embedding) vectors
combined_similarity_matrix = cosine_similarity(question_combined, context_combined)

# Step 7: Function to compute recall@K for combined lexical and embedding search
def compute_recall_at_k_combined(similarity_matrix, k):
    """Return recall@k for a hybrid (TF-IDF + embedding) similarity matrix.

    The correct context for question ``i`` is taken to be column ``i``.

    Args:
        similarity_matrix: 2-D NumPy array of shape (num_questions, num_contexts).
        k: Number of top-ranked contexts inspected per question.

    Returns:
        Fraction of questions (a float in [0, 1]) whose own context is among
        the ``k`` highest-scoring contexts. Returns 0.0 when the matrix has no
        rows instead of dividing by zero.
    """
    num_questions = similarity_matrix.shape[0]
    if num_questions == 0:
        # Nothing to evaluate; recall is reported as 0.0 rather than raising.
        return 0.0

    # Rank all rows at once: ascending argsort, reversed, truncated to the k best columns.
    top_k_indices = np.argsort(similarity_matrix, axis=1)[:, ::-1][:, :k]

    # Compare each row's top-k columns against that row's own index.
    own_index = np.arange(num_questions).reshape(-1, 1)
    is_hit = (top_k_indices == own_index).any(axis=1)
    correct_matches = int(is_hit.sum())

    return correct_matches / num_questions

# Step 8: Score the hybrid similarity matrix at each cut-off
k_values = [1, 3, 5, 10, 20, 40]
# Start empty and fill from (label, score) pairs so the keys follow k_values order
combined_recall_scores = {}
combined_recall_scores.update(
    (f"Recall@{k}", compute_recall_at_k_combined(combined_similarity_matrix, k))
    for k in k_values
)

# Step 9: Report the hybrid scores
print("Combined Lexical + Embedding Search Recall Scores:", combined_recall_scores)


Combined Lexical + Embedding Search Recall Scores: {'Recall@1': 0.28463476070528965, 'Recall@3': 0.464735516372796, 'Recall@5': 0.5434508816120907, 'Recall@10': 0.6511335012594458, 'Recall@20': 0.7651133501259446, 'Recall@40': 0.8589420654911839}


In [21]:
# Embedding-only recall scores, shown again for side-by-side comparison.
recall_scores

{'Recall@1': 0.2128463476070529,
 'Recall@3': 0.3677581863979849,
 'Recall@5': 0.44584382871536526,
 'Recall@10': 0.5516372795969773,
 'Recall@20': 0.6492443324937027,
 'Recall@40': 0.75}

In [22]:
# TF-IDF-only recall scores, shown again for side-by-side comparison.
lexical_recall_scores

{'Recall@1': 0.24181360201511334,
 'Recall@3': 0.3973551637279597,
 'Recall@5': 0.47858942065491183,
 'Recall@10': 0.5976070528967254,
 'Recall@20': 0.7052896725440806,
 'Recall@40': 0.8047858942065491}

In [23]:
# Hybrid (TF-IDF + embedding) recall scores, shown again for side-by-side comparison.
combined_recall_scores

{'Recall@1': 0.28463476070528965,
 'Recall@3': 0.464735516372796,
 'Recall@5': 0.5434508816120907,
 'Recall@10': 0.6511335012594458,
 'Recall@20': 0.7651133501259446,
 'Recall@40': 0.8589420654911839}

In [26]:
# One row per retrieval method, one column per Recall@k cut-off.
final_resutls = pd.DataFrame(
    data=[recall_scores, lexical_recall_scores, combined_recall_scores],
    index=["OpenAI Large Embeddings", "TF-IDF", "Hybrid"],
)
final_resutls

Unnamed: 0,Recall@1,Recall@3,Recall@5,Recall@10,Recall@20,Recall@40
OpenAI Large Embeddings,0.212846,0.367758,0.445844,0.551637,0.649244,0.75
TF-IDF,0.241814,0.397355,0.478589,0.597607,0.70529,0.804786
Hybrid,0.284635,0.464736,0.543451,0.651134,0.765113,0.858942


In [None]:
# final_resutls.to_excel("answers.xlsx")

## Recall grouped by organization, then averaged across organizations

In [39]:
# Embedding Only Search
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Recall@K: share of questions whose own context is ranked among the top K
def compute_recall_at_k(similarity_matrix, k):
    """Return recall@k for a question-by-context similarity matrix.

    Row ``i`` scores question ``i`` against every context; the correct context
    for question ``i`` is taken to be column ``i``. When ``k`` is at least the
    number of contexts, every question is a hit and the result is 1.0.

    Args:
        similarity_matrix: 2-D NumPy array of shape (num_questions, num_contexts).
        k: Number of top-ranked contexts inspected per question.

    Returns:
        Fraction of questions (a float in [0, 1]) whose own context is among
        the ``k`` highest-scoring contexts. Returns 0.0 when the matrix has no
        rows instead of dividing by zero.
    """
    num_questions = similarity_matrix.shape[0]
    if num_questions == 0:
        # Nothing to evaluate; recall is reported as 0.0 rather than raising.
        return 0.0

    # Column indices of the k best contexts for every row, best first.
    top_k_indices = np.argsort(similarity_matrix, axis=1)[:, ::-1][:, :k]

    # A question is a hit when its own row index appears among its top-k columns.
    own_index = np.arange(num_questions)[:, np.newaxis]
    correct_matches = int((top_k_indices == own_index).any(axis=1).sum())

    return correct_matches / num_questions

# Define k values
# NOTE: a group with k or fewer rows scores Recall@k = 1.0 trivially, because every
# context of that group falls inside the top k; small groups inflate the averages.
k_values = [1, 3, 5, 10, 20, 40]

# List collecting one dict of recall scores per abbreviation
recall_scores_list = []

# Group by "Abbreviation" and compute recall scores separately, so each question only
# competes against contexts from its own group
for abbreviation, group in df_combined.groupby("Abbreviation"):
    # Convert embedding columns to NumPy arrays
    context_embeddings = np.vstack(group["embedding_context"].values)   # Shape: (num_contexts, embedding_dim)
    question_embeddings = np.vstack(group["embedding_question"].values) # Shape: (num_questions, embedding_dim)

    # Compute cosine similarity
    similarity_matrix = cosine_similarity(question_embeddings, context_embeddings)

    # Compute recall@K for the group. A loop-specific name is used so the notebook-level
    # `recall_scores` (read by the overall results table) is not overwritten.
    group_recall_scores = {f"Recall@{k}": compute_recall_at_k(similarity_matrix, k) for k in k_values}
    group_recall_scores["Abbreviation"] = abbreviation  # Store abbreviation

    # Append to list
    recall_scores_list.append(group_recall_scores)

# Convert list of scores to DataFrame (one row per abbreviation)
recall_scores_df = pd.DataFrame(recall_scores_list)

# Unweighted mean over abbreviations: every group counts equally, whatever its size
average_recall_scores_embedding = recall_scores_df.drop(columns=["Abbreviation"]).mean().to_dict()



In [40]:
# TF-IDF only. 

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Function to compute recall@K for lexical search
def compute_recall_at_k_lexical(similarity_matrix, k):
    """Return recall@k for a lexical (TF-IDF) question-by-context similarity matrix.

    The correct context for question ``i`` is taken to be column ``i``. When
    ``k`` is at least the number of contexts, every question is a hit and the
    result is 1.0.

    Args:
        similarity_matrix: 2-D NumPy array of shape (num_questions, num_contexts).
        k: Number of top-ranked contexts inspected per question.

    Returns:
        Fraction of questions (a float in [0, 1]) whose own context is among
        the ``k`` highest-scoring contexts. Returns 0.0 when the matrix has no
        rows instead of dividing by zero.
    """
    num_questions = similarity_matrix.shape[0]
    if num_questions == 0:
        # Nothing to evaluate; recall is reported as 0.0 rather than raising.
        return 0.0

    # Count the questions whose own index shows up among their k best contexts
    # (argsort is ascending, so reverse it before slicing the top k).
    correct_matches = sum(
        i in np.argsort(similarity_matrix[i])[::-1][:k]
        for i in range(num_questions)
    )

    return correct_matches / num_questions

# Fit a single TF-IDF vocabulary on every context in the dataset, so that all groups
# share the same feature space and IDF weights.
vectorizer = TfidfVectorizer()
vectorizer.fit(df_combined["Context"].tolist())

# Define k values
# NOTE: a group with k or fewer rows scores Recall@k = 1.0 trivially, because every
# context of that group falls inside the top k; small groups inflate the averages.
k_values = [1, 3, 5, 10, 20, 40]

# List collecting one dict of recall scores per abbreviation
recall_scores_list = []

# Group by "Abbreviation" and compute recall scores separately
for abbreviation, group in df_combined.groupby("Abbreviation"):
    # The globally fitted vectorizer is reused here; it is NOT refitted per group.
    # Transform contexts and questions within the group
    context_tfidf = vectorizer.transform(group["Context"].tolist())
    question_tfidf = vectorizer.transform(group["Question"].tolist())

    # Compute cosine similarity (Lexical similarity between questions and contexts)
    lexical_similarity_matrix = cosine_similarity(question_tfidf, context_tfidf)

    # Compute recall@K for lexical search in this group. A loop-specific name is used so
    # the notebook-level `recall_scores` (read by the overall results table) is not overwritten.
    group_recall_scores = {f"Recall@{k}": compute_recall_at_k_lexical(lexical_similarity_matrix, k) for k in k_values}
    group_recall_scores["Abbreviation"] = abbreviation  # Store abbreviation

    # Append to list
    recall_scores_list.append(group_recall_scores)

# Convert list of scores to DataFrame (one row per abbreviation)
recall_scores_df = pd.DataFrame(recall_scores_list)

# Unweighted mean over abbreviations: every group counts equally, whatever its size
average_recall_scores_tfidf = recall_scores_df.drop(columns=["Abbreviation"]).mean().to_dict()



In [41]:
# Hybrid Search
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Function to compute recall@K
def compute_recall_at_k_combined(similarity_matrix, k):
    """Return recall@k for a hybrid (TF-IDF + embedding) similarity matrix.

    The correct context for question ``i`` is taken to be column ``i``. When
    ``k`` is at least the number of contexts, every question is a hit and the
    result is 1.0.

    Args:
        similarity_matrix: 2-D NumPy array of shape (num_questions, num_contexts).
        k: Number of top-ranked contexts inspected per question.

    Returns:
        Fraction of questions (a float in [0, 1]) whose own context is among
        the ``k`` highest-scoring contexts. Returns 0.0 when the matrix has no
        rows instead of dividing by zero.
    """
    num_questions = similarity_matrix.shape[0]
    if num_questions == 0:
        # Nothing to evaluate; recall is reported as 0.0 rather than raising.
        return 0.0

    # Rank all rows at once: ascending argsort, reversed, truncated to the k best columns.
    top_k_indices = np.argsort(similarity_matrix, axis=1)[:, ::-1][:, :k]

    # Compare each row's top-k columns against that row's own index.
    own_index = np.arange(num_questions).reshape(-1, 1)
    correct_matches = int((top_k_indices == own_index).any(axis=1).sum())

    return correct_matches / num_questions

# Fit a single TF-IDF vocabulary on every context in the dataset, so that all groups
# share the same feature space and IDF weights.
vectorizer = TfidfVectorizer()
vectorizer.fit(df_combined["Context"].tolist())

# Define k values
# NOTE: a group with k or fewer rows scores Recall@k = 1.0 trivially, because every
# context of that group falls inside the top k; small groups inflate the averages.
k_values = [1, 3, 5, 10, 20, 40]

# List collecting one dict of recall scores per abbreviation
recall_scores_list = []

# Group by "Abbreviation" and compute recall scores separately
for abbreviation, group in df_combined.groupby("Abbreviation"):
    # The globally fitted vectorizer is reused here; it is NOT refitted per group.
    # Transform contexts and questions within the group
    context_tfidf = vectorizer.transform(group["Context"].tolist())
    question_tfidf = vectorizer.transform(group["Question"].tolist())

    # Retrieve embeddings
    context_embeddings = np.vstack(group["embedding_context"].values)
    question_embeddings = np.vstack(group["embedding_question"].values)

    # Combine TF-IDF and embeddings (.toarray() densifies the sparse TF-IDF rows)
    context_combined = np.hstack([context_tfidf.toarray(), context_embeddings])
    question_combined = np.hstack([question_tfidf.toarray(), question_embeddings])

    # Compute similarity matrix
    similarity_matrix = cosine_similarity(question_combined, context_combined)

    # Compute recall@K for the group. A loop-specific name is used so the notebook-level
    # `recall_scores` (read by the overall results table) is not overwritten.
    group_recall_scores = {f"Recall@{k}": compute_recall_at_k_combined(similarity_matrix, k) for k in k_values}
    group_recall_scores["Abbreviation"] = abbreviation  # Store abbreviation

    # Append to list
    recall_scores_list.append(group_recall_scores)

# Convert list of scores to DataFrame (one row per abbreviation)
recall_scores_df = pd.DataFrame(recall_scores_list)

# Unweighted mean over abbreviations: every group counts equally, whatever its size
average_recall_scores_hybrid = recall_scores_df.drop(columns=["Abbreviation"]).mean().to_dict()


In [None]:
# One row per retrieval method holding its per-abbreviation average Recall@k values.
average_resutls = pd.DataFrame(
    data=[
        average_recall_scores_embedding,
        average_recall_scores_tfidf,
        average_recall_scores_hybrid,
    ],
    index=["OpenAI Large Embeddings", "TF-IDF", "Hybrid"],
)
average_resutls