In [2]:
import json

file_path = "../train.json"
with open(file_path, "r") as file:
    data = json.load(file)

In [7]:
# ONLY CHECKING FOR ONE ORGANIZATION, AND SEEING ONLY ONE QUESTION

# Select only the required parts from the json, and using id as key as it is unique

import tiktoken
import pandas as pd

reformatted_data = {}
for item in data:

    if item['filename'].split('/')[0] == "ETR":
            
        # Define empty list to add the data
        selected_data = []

        # First add the text and tables
        selected_data.append(f"""Pre-Text:{str(item['pre_text'])} \n
                                    Table: {str(item['table'])} \n
                                    Post-Text: {str(item['post_text'])}""")
        
        # Also adding questions for reference
        if item.get('qa'):
            selected_data.append(item['qa']['question'])
        else:
            # Different from other file
            selected_data.append(item['qa_0']['question'])

        reformatted_data [item['id']] = selected_data



# Choose the correct encoding based on the OpenAI model
encoding = tiktoken.get_encoding("cl100k_base") 

total_tokens = 0
for datakey in reformatted_data:
    current_tokens = len(encoding.encode(reformatted_data[datakey][0]))
    if current_tokens>8000:
        print("EXCEEDING:", current_tokens)
    total_tokens += current_tokens

print("TOTAL TOKENS: ", total_tokens)
print(f"Price for Embedding V3 Small -> {total_tokens*0.02/1000000}")
print(f"Price for Embedding V3 Large -> {total_tokens*0.13/1000000}")
print(f"")

dataframe_with_relevant_data = pd.DataFrame.from_dict(reformatted_data, orient="index", columns=["Context", "QuestionsList"])
dataframe_with_relevant_data.head()


TOTAL TOKENS:  148980
Price for Embedding V3 Small -> 0.0029796
Price for Embedding V3 Large -> 0.0193674



Unnamed: 0,Context,QuestionsList
Single_ETR/2008/page_336.pdf-3,"Pre-Text:['entergy mississippi , inc .', ""mana...",what is the percent change in net revenue betw...
Single_ETR/2011/page_22.pdf-3,"Pre-Text:[""entergy corporation and subsidiarie...",what was the percentage change of the net reve...
Single_ETR/2011/page_435.pdf-2,Pre-Text:['the target awards for the other nam...,what is actual operating cash flow reported fo...
Single_ETR/2004/page_20.pdf-2,"Pre-Text:[""entergy corporation and subsidiarie...",what is the growth rate in net revenue in 2003...
Single_ETR/2004/page_258.pdf-4,"Pre-Text:['entergy new orleans , inc .', ""mana...",what is the percent change in net revenue from...


In [9]:
print(dataframe_with_relevant_data.shape)
df_unique = dataframe_with_relevant_data.drop_duplicates(subset=['Context','QuestionsList'])
df_unique.shape

(198, 2)


(170, 2)

In [10]:
dataframe_with_relevant_data = df_unique

In [None]:
from openai import OpenAI
from dotenv import load_dotenv
import os

# Setting the API Key
dotenv_path = os.path.abspath(os.path.join(os.path.dirname("__file__"), "../..", "OPENAI_KEY.env"))
load_dotenv(dotenv_path)


api_key = os.getenv("OPENAI_API_KEY")

client = OpenAI(
    api_key=api_key,
)


def get_embedding(text, model="text-embedding-3-small"):
    text = text.replace("\n", " ")
    return client.embeddings.create(input = [text], model=model).data[0].embedding


dataframe_with_relevant_data['embedding_small_context'] = dataframe_with_relevant_data.Context.apply(lambda x: get_embedding(x, model='text-embedding-3-small'))
dataframe_with_relevant_data['embedding_small_question'] = dataframe_with_relevant_data.QuestionsList.apply(lambda x: get_embedding(x, model='text-embedding-3-small'))


In [None]:
# LARGE EMBEDDINGS


def get_embedding(text, model="text-embedding-3-large"):
    text = text.replace("\n", " ")
    return client.embeddings.create(input = [text], model=model).data[0].embedding


dataframe_with_relevant_data['embedding_large_context'] = dataframe_with_relevant_data.Context.apply(lambda x: get_embedding(x, model='text-embedding-3-large'))
dataframe_with_relevant_data['embedding_large_question'] = dataframe_with_relevant_data.QuestionsList.apply(lambda x: get_embedding(x, model='text-embedding-3-large'))



In [None]:
dataframe_with_relevant_data.to_parquet("ETR_DATA.parquet")

Recall with Small Embeddings 

In [19]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Sample DataFrame (use your actual DataFrame)
df = dataframe_with_relevant_data  # Your DataFrame

# Convert embedding columns to NumPy arrays
context_embeddings = np.vstack(df["embedding_small_context"].values)   # Shape: (num_contexts, embedding_dim)
question_embeddings = np.vstack(df["embedding_small_question"].values) # Shape: (num_questions, embedding_dim)

# Compute cosine similarity (each row in question_embeddings compared to all context embeddings)
similarity_matrix = cosine_similarity(question_embeddings, context_embeddings)  # Shape: (num_questions, num_contexts)

# Function to compute recall@K
def compute_recall_at_k(similarity_matrix, k):
    num_questions = similarity_matrix.shape[0]
    correct_matches = 0
    
    for i in range(num_questions):
        # Get indices of top K most similar context embeddings for question i
        top_k_indices = np.argsort(similarity_matrix[i])[::-1][:k]  # Sort in descending order
        
        # Check if the correct context (same row in DataFrame) is in the top K
        if i in top_k_indices:
            correct_matches += 1

    recall_at_k = correct_matches / num_questions
    return recall_at_k

# Evaluate recall for different values of K
k_values = [1, 3, 5, 10]
recall_scores = {f"Recall@{k}": compute_recall_at_k(similarity_matrix, k) for k in k_values}

# Print recall scores
print(recall_scores)


{'Recall@1': 0.11176470588235295, 'Recall@3': 0.2823529411764706, 'Recall@5': 0.3941176470588235, 'Recall@10': 0.5705882352941176}


Recall with Large Contextual Embeddings

In [20]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Sample DataFrame (use your actual DataFrame)
df = dataframe_with_relevant_data  # Your DataFrame

# Convert embedding columns to NumPy arrays
context_embeddings = np.vstack(df["embedding_large_context"].values)   # Shape: (num_contexts, embedding_dim)
question_embeddings = np.vstack(df["embedding_large_question"].values) # Shape: (num_questions, embedding_dim)

# Compute cosine similarity (each row in question_embeddings compared to all context embeddings)
similarity_matrix = cosine_similarity(question_embeddings, context_embeddings)  # Shape: (num_questions, num_contexts)

# Function to compute recall@K
def compute_recall_at_k(similarity_matrix, k):
    num_questions = similarity_matrix.shape[0]
    correct_matches = 0
    
    for i in range(num_questions):
        # Get indices of top K most similar context embeddings for question i
        top_k_indices = np.argsort(similarity_matrix[i])[::-1][:k]  # Sort in descending order
        
        # Check if the correct context (same row in DataFrame) is in the top K
        if i in top_k_indices:
            correct_matches += 1

    recall_at_k = correct_matches / num_questions
    return recall_at_k

# Evaluate recall for different values of K
k_values = [1, 3, 5, 10, 20]
recall_scores = {f"Recall@{k}": compute_recall_at_k(similarity_matrix, k) for k in k_values}

# Print recall scores
print(recall_scores)


{'Recall@1': 0.1411764705882353, 'Recall@3': 0.34705882352941175, 'Recall@5': 0.4294117647058823, 'Recall@10': 0.5470588235294118, 'Recall@20': 0.6823529411764706}


Recall with TF-IDF

In [21]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Sample DataFrame (Use your actual DataFrame)
df = dataframe_with_relevant_data  # Your dataset with 'Context' and 'QuestionsList'

# Step 1: Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Step 2: Fit the TF-IDF vectorizer on the Contexts only (Do not train on questions)
vectorizer.fit(df["Context"].tolist())

# Step 3: Transform the contexts and questions (note that questions are transformed using the pre-fitted vectorizer)
context_tfidf = vectorizer.transform(df["Context"].tolist())  # Transform the contexts
question_tfidf = vectorizer.transform(df["QuestionsList"].tolist())  # Transform the questions

# Step 4: Compute cosine similarity (Lexical similarity between questions and contexts)
lexical_similarity_matrix = cosine_similarity(question_tfidf, context_tfidf)

# Step 5: Function to compute recall@K for lexical search
def compute_recall_at_k_lexical(similarity_matrix, k):
    num_questions = similarity_matrix.shape[0]
    correct_matches = 0

    for i in range(num_questions):
        # Get indices of top K most similar contexts for question i
        top_k_indices = np.argsort(similarity_matrix[i])[::-1][:k]

        # Check if the correct context is in the top K
        if i in top_k_indices:
            correct_matches += 1

    recall_at_k = correct_matches / num_questions
    return recall_at_k

# Step 6: Evaluate Recall@K for lexical search (for different values of k)
k_values = [1, 3, 5, 10]
lexical_recall_scores = {f"Lexical Recall@{k}": compute_recall_at_k_lexical(lexical_similarity_matrix, k) for k in k_values}

# Step 7: Print lexical recall scores
print("Lexical Search Recall Scores:", lexical_recall_scores)


Lexical Search Recall Scores: {'Lexical Recall@1': 0.1588235294117647, 'Lexical Recall@3': 0.4, 'Lexical Recall@5': 0.5294117647058824, 'Lexical Recall@10': 0.7235294117647059}


Hybrid Search Large Embeddings

In [23]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Sample DataFrame (Use your actual DataFrame)
df = dataframe_with_relevant_data  # Your dataset with 'Context', 'QuestionsList', 'embedding_small_context', 'embedding_small_question'

# Step 1: Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Step 2: Fit the TF-IDF vectorizer on the Contexts only (Do not train on questions)
vectorizer.fit(df["Context"].tolist())

# Step 3: Transform the contexts and questions (note that questions are transformed using the pre-fitted vectorizer)
context_tfidf = vectorizer.transform(df["Context"].tolist())  # Transform the contexts
question_tfidf = vectorizer.transform(df["QuestionsList"].tolist())  # Transform the questions

# Step 4: Retrieve the embedding vectors for context and questions
# Assuming `embedding_small_context` and `embedding_small_question` are columns containing the precomputed embeddings
context_embeddings = np.vstack(df["embedding_large_context"].values)  # Context embeddings
question_embeddings = np.vstack(df["embedding_large_question"].values)  # Question embeddings

# Step 5: Combine TF-IDF and embeddings for context and questions
# Directly concatenate the TF-IDF vectors and embeddings
context_combined = np.hstack([context_tfidf.toarray(), context_embeddings])
question_combined = np.hstack([question_tfidf.toarray(), question_embeddings])

# Step 6: Compute cosine similarity (Lexical + Embedding similarity)
combined_similarity_matrix = cosine_similarity(question_combined, context_combined)

# Step 7: Function to compute recall@K for combined lexical and embedding search
def compute_recall_at_k_combined(similarity_matrix, k):
    num_questions = similarity_matrix.shape[0]
    correct_matches = 0

    for i in range(num_questions):
        # Get indices of top K most similar contexts for question i
        top_k_indices = np.argsort(similarity_matrix[i])[::-1][:k]

        # Check if the correct context is in the top K
        if i in top_k_indices:
            correct_matches += 1

    recall_at_k = correct_matches / num_questions
    return recall_at_k

# Step 8: Evaluate Recall@K for combined lexical and embedding search (for different values of k)
k_values = [1, 3, 5, 10, 20]
combined_recall_scores = {f"Combined Recall@{k}": compute_recall_at_k_combined(combined_similarity_matrix, k) for k in k_values}

# Step 9: Print combined recall scores
print("Combined Lexical + Embedding Search Recall Scores:", combined_recall_scores)


Combined Lexical + Embedding Search Recall Scores: {'Combined Recall@1': 0.22941176470588234, 'Combined Recall@3': 0.5, 'Combined Recall@5': 0.5882352941176471, 'Combined Recall@10': 0.7823529411764706, 'Combined Recall@20': 0.8764705882352941}
