In [1]:
import json

file_path = "../../train.json"
with open(file_path, "r") as file:
    data = json.load(file)

In [2]:
all_orgs = list(set([item['filename'].split('/')[0] for item in data]))
len(all_orgs)

133

In [3]:
# making "id" the key as it is unique for all the documents
reformatted_data = {}
for org in all_orgs:
    for item in data:
        if item['filename'].split('/')[0] == org:
                
            # Define empty list to add the data
            selected_data = []

            # First add the text and tables
            selected_data.append(f"""Pre-Text:{str(item['pre_text'])} \n
                                        Table: {str(item['table'])} \n
                                        Post-Text: {str(item['post_text'])}""")
            
            # Also adding questions for reference
            if item.get('qa'):
                selected_data.append(item['qa']['question'])
            else:
                # If multiple questions present, just adding one for analysis
                # In answering user question, only context will be used, not questions
                # Question is being added just for analysis
                selected_data.append(item['qa_0']['question'])

            # Also adding the dialogue break for in-context learning
            selected_data.append(f"""{'\n-'.join(item['annotation']['dialogue_break'])}""")

            reformatted_data [item['id']] = selected_data



In [4]:
import pandas as pd

all_docs_dataframe = pd.DataFrame.from_dict(reformatted_data, orient='index', columns=["Context", "Question", "Dialogue"])
all_docs_dataframe

Unnamed: 0,Context,Question,Dialogue
Single_STT/2013/page_54.pdf-4,"Pre-Text:[""shareholder return performance pres...",how much higher are the returns of the s&p 500...,what is the fraction change of the investment ...
Single_STT/2011/page_94.pdf-3,Pre-Text:['we maintain an effective universal ...,what was the percent change in the value of co...,what was the value of commercial paper outstan...
Single_STT/2014/page_69.pdf-2,Pre-Text:['management 2019s discussion and ana...,what is the percentage change in the average t...,what was the value of average short term advan...
Single_STT/2009/page_122.pdf-4,"Pre-Text:['note 10 .', 'commitments and contin...",what is the percent change in asset purchase a...,what was the total in asset purchase agreement...
Single_STT/2013/page_175.pdf-2,Pre-Text:['state street corporation notes to c...,what is the percentage change in the balance o...,what was the total in asset purchase agreement...
...,...,...,...
Double_AMAT/2018/page_31.pdf,Pre-Text:['item 2 : properties information con...,what portion of total company used area is com...,what portion of the total area the company use...
Single_AMAT/2012/page_37.pdf-1,Pre-Text:['performance graph the performance g...,what is the roi of s&p500 if the investment ta...,what was the change in the value of s&p500 con...
Single_AMAT/2014/page_37.pdf-3,Pre-Text:['performance graph the performance g...,how much more return was given for investing i...,what is the change in value of an investment i...
Single_AMAT/2015/page_14.pdf-1,Pre-Text:['backlog applied manufactures system...,how much percentage has backlog increased from...,what was the change in the backlog from 2014 t...


In [5]:
# If there are duplicate contexts present in the docs, remove them

all_docs_dataframe_unique = all_docs_dataframe.drop_duplicates(subset=['Context'])
all_docs_dataframe_unique

Unnamed: 0,Context,Question,Dialogue
Single_STT/2013/page_54.pdf-4,"Pre-Text:[""shareholder return performance pres...",how much higher are the returns of the s&p 500...,what is the fraction change of the investment ...
Single_STT/2011/page_94.pdf-3,Pre-Text:['we maintain an effective universal ...,what was the percent change in the value of co...,what was the value of commercial paper outstan...
Single_STT/2014/page_69.pdf-2,Pre-Text:['management 2019s discussion and ana...,what is the percentage change in the average t...,what was the value of average short term advan...
Single_STT/2009/page_122.pdf-4,"Pre-Text:['note 10 .', 'commitments and contin...",what is the percent change in asset purchase a...,what was the total in asset purchase agreement...
Single_STT/2013/page_175.pdf-2,Pre-Text:['state street corporation notes to c...,what is the percentage change in the balance o...,what was the total in asset purchase agreement...
...,...,...,...
Single_AMAT/2012/page_37.pdf-2,Pre-Text:['performance graph the performance g...,for how many common stock shares did the compa...,what was the product of the dividend paid per ...
Single_AMAT/2014/page_37.pdf-2,Pre-Text:['performance graph the performance g...,how many shares received dividends during 2014...,what is the yearly dividend per share in 2014?...
Single_AMAT/2015/page_14.pdf-2,Pre-Text:['backlog applied manufactures system...,what is the growth rate in the segment of disp...,what was the display value in 2015?\n-what was...
Double_AMAT/2015/page_33.pdf,Pre-Text:['performance graph the performance g...,what is the yearly rate of return of s&p500 if...,what is the net change in value of an investme...


In [6]:
# Create TF-IDF vectors of the context and question (Question is for analysis only)
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
vectorizer.fit(all_docs_dataframe_unique["Context"].tolist())
context_tfidf = vectorizer.transform(all_docs_dataframe_unique["Context"].tolist())  # Transform the contexts

all_docs_dataframe_unique["Context_TFIDF_Vector"] = list(context_tfidf)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_docs_dataframe_unique["Context_TFIDF_Vector"] = list(context_tfidf)


In [7]:
all_docs_dataframe_unique

Unnamed: 0,Context,Question,Dialogue,Context_TFIDF_Vector
Single_STT/2013/page_54.pdf-4,"Pre-Text:[""shareholder return performance pres...",how much higher are the returns of the s&p 500...,what is the fraction change of the investment ...,<Compressed Sparse Row sparse matrix of dtype ...
Single_STT/2011/page_94.pdf-3,Pre-Text:['we maintain an effective universal ...,what was the percent change in the value of co...,what was the value of commercial paper outstan...,<Compressed Sparse Row sparse matrix of dtype ...
Single_STT/2014/page_69.pdf-2,Pre-Text:['management 2019s discussion and ana...,what is the percentage change in the average t...,what was the value of average short term advan...,<Compressed Sparse Row sparse matrix of dtype ...
Single_STT/2009/page_122.pdf-4,"Pre-Text:['note 10 .', 'commitments and contin...",what is the percent change in asset purchase a...,what was the total in asset purchase agreement...,<Compressed Sparse Row sparse matrix of dtype ...
Single_STT/2013/page_175.pdf-2,Pre-Text:['state street corporation notes to c...,what is the percentage change in the balance o...,what was the total in asset purchase agreement...,<Compressed Sparse Row sparse matrix of dtype ...
...,...,...,...,...
Single_AMAT/2012/page_37.pdf-2,Pre-Text:['performance graph the performance g...,for how many common stock shares did the compa...,what was the product of the dividend paid per ...,<Compressed Sparse Row sparse matrix of dtype ...
Single_AMAT/2014/page_37.pdf-2,Pre-Text:['performance graph the performance g...,how many shares received dividends during 2014...,what is the yearly dividend per share in 2014?...,<Compressed Sparse Row sparse matrix of dtype ...
Single_AMAT/2015/page_14.pdf-2,Pre-Text:['backlog applied manufactures system...,what is the growth rate in the segment of disp...,what was the display value in 2015?\n-what was...,<Compressed Sparse Row sparse matrix of dtype ...
Double_AMAT/2015/page_33.pdf,Pre-Text:['performance graph the performance g...,what is the yearly rate of return of s&p500 if...,what is the net change in value of an investme...,<Compressed Sparse Row sparse matrix of dtype ...


In [8]:
# Create contextual embeddings for which will be used in Hybrid Search
# Questions embeddings are only for analysis

from openai import OpenAI
from dotenv import load_dotenv
import os

# Setting the API Key
dotenv_path = os.path.abspath(os.path.join(os.path.dirname("__file__"), "../../..", "OPENAI_KEY.env"))
load_dotenv(dotenv_path)


api_key = os.getenv("OPENAI_API_KEY")

client = OpenAI(
    api_key=api_key,
)



def get_embedding(text, model="text-embedding-3-large"):
    text = text.replace("\n", " ")
    return client.embeddings.create(input = [text], model=model).data[0].embedding


all_docs_dataframe_unique['embedding_context'] = all_docs_dataframe_unique.Context.apply(lambda x: get_embedding(x, model='text-embedding-3-large'))
all_docs_dataframe_unique['embedding_question'] = all_docs_dataframe_unique.Question.apply(lambda x: get_embedding(x, model='text-embedding-3-large'))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_docs_dataframe_unique['embedding_context'] = all_docs_dataframe_unique.Context.apply(lambda x: get_embedding(x, model='text-embedding-3-large'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_docs_dataframe_unique['embedding_question'] = all_docs_dataframe_unique.Question.apply(lambda x: get_embedding(x, model='text-embedding-3-large'))


In [17]:
df_for_saving = all_docs_dataframe_unique.copy()

In [18]:
df_for_saving

Unnamed: 0,Context,Question,Dialogue,Context_TFIDF_Vector,embedding_context,embedding_question
Single_STT/2013/page_54.pdf-4,"Pre-Text:[""shareholder return performance pres...",how much higher are the returns of the s&p 500...,what is the fraction change of the investment ...,<Compressed Sparse Row sparse matrix of dtype ...,"[0.016610916703939438, -0.0038285385817289352,...","[-0.014002669602632523, -0.0030809114687144756..."
Single_STT/2011/page_94.pdf-3,Pre-Text:['we maintain an effective universal ...,what was the percent change in the value of co...,what was the value of commercial paper outstan...,<Compressed Sparse Row sparse matrix of dtype ...,"[-0.013737378641963005, -0.01657671481370926, ...","[-0.0014009354636073112, -0.013579033315181732..."
Single_STT/2014/page_69.pdf-2,Pre-Text:['management 2019s discussion and ana...,what is the percentage change in the average t...,what was the value of average short term advan...,<Compressed Sparse Row sparse matrix of dtype ...,"[-0.008708413690328598, -0.007725966162979603,...","[-0.028621919453144073, 0.022226709872484207, ..."
Single_STT/2009/page_122.pdf-4,"Pre-Text:['note 10 .', 'commitments and contin...",what is the percent change in asset purchase a...,what was the total in asset purchase agreement...,<Compressed Sparse Row sparse matrix of dtype ...,"[-0.007598159369081259, -0.0395529605448246, -...","[-0.020172208547592163, 0.010079036466777325, ..."
Single_STT/2013/page_175.pdf-2,Pre-Text:['state street corporation notes to c...,what is the percentage change in the balance o...,what was the total in asset purchase agreement...,<Compressed Sparse Row sparse matrix of dtype ...,"[-0.0009236071491613984, -0.027679165825247765...","[-0.02635018341243267, 0.01037721149623394, -0..."
...,...,...,...,...,...,...
Single_AMAT/2012/page_37.pdf-2,Pre-Text:['performance graph the performance g...,for how many common stock shares did the compa...,what was the product of the dividend paid per ...,<Compressed Sparse Row sparse matrix of dtype ...,"[-0.0008334789890795946, -0.01523605827242136,...","[-0.017676647752523422, -0.008252725005149841,..."
Single_AMAT/2014/page_37.pdf-2,Pre-Text:['performance graph the performance g...,how many shares received dividends during 2014...,what is the yearly dividend per share in 2014?...,<Compressed Sparse Row sparse matrix of dtype ...,"[-0.0010122329695150256, -0.015447970479726791...","[-0.018255062401294708, 0.018669361248612404, ..."
Single_AMAT/2015/page_14.pdf-2,Pre-Text:['backlog applied manufactures system...,what is the growth rate in the segment of disp...,what was the display value in 2015?\n-what was...,<Compressed Sparse Row sparse matrix of dtype ...,"[-0.003818386932834983, 0.015724772587418556, ...","[-0.010050210170447826, 0.015472985804080963, ..."
Double_AMAT/2015/page_33.pdf,Pre-Text:['performance graph the performance g...,what is the yearly rate of return of s&p500 if...,what is the net change in value of an investme...,<Compressed Sparse Row sparse matrix of dtype ...,"[-0.005452048033475876, -0.012531572952866554,...","[-0.03288188576698303, -0.02652997523546219, -..."


In [19]:
df_for_saving_dense_embeddings = df_for_saving[['Context', "Question","Dialogue","embedding_context", "embedding_question"]]
df_for_saving_sparse_embeddings = df_for_saving[["Context_TFIDF_Vector"]]

In [32]:
df_for_saving_dense_embeddings.to_parquet('dense_embedddings_with_index.parquet')

In [21]:
import pickle

with open('sparse_embeddings_with_index.pkl', 'wb') as f:
    pickle.dump(df_for_saving_sparse_embeddings, f)