# Similarity Scores Calculation

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import pandas as pd
import logging
import sys
from sentence_transformers import SentenceTransformer, util
import torch

# Configure logging: send INFO-and-above records from the root logger to stdout
# so that library log messages show up in the cell output.
root_logger = logging.getLogger()
# Set the level explicitly: logging.basicConfig() does nothing when the root logger
# already has handlers, which would silently leave the previous level in place.
root_logger.setLevel(logging.INFO)
# Detach whatever handlers are installed (iterate over a copy, removeHandler mutates
# the list) and attach a single plain stdout handler, so re-running this cell never
# produces duplicated log lines.
for existing_handler in list(root_logger.handlers):
    root_logger.removeHandler(existing_handler)
root_logger.addHandler(logging.StreamHandler(stream=sys.stdout))


In [2]:
# Read and process query, document, and relevance data.
# Only the columns used further down are kept from each CSV.
df_queries = pd.read_csv('antique_query_test.csv')[['query_id','text']]

df_docs = pd.read_csv('antique_sample_404k.csv')[['doc_id','text']]

df_qrel = pd.read_csv('antique_qurel_test.csv')[['query_id','doc_id','relevance']]

# Attach the document text to every relevance judgement. The left join keeps every
# judgement row, so a doc_id that is absent from df_docs ends up with a missing text.
merged_df = df_qrel.merge(df_docs, on='doc_id', how='left')

# Extract text data from merged DataFrame
df_text = merged_df[['doc_id','text']]

# Build the passage list in df_text row order. str() is applied to each value, so a
# missing text becomes the literal string 'nan', exactly as the previous row-by-row
# loop did; iterating the column directly avoids creating a Series per row via iterrows().
# NOTE(review): search() looks up `passages` and `df_text` by corpus position, so the
# pre-computed embeddings are presumably stored in this same row order -- confirm.
passages = [str(passage_text) for passage_text in df_text['text']]

In [3]:
# Load the pre-computed document embeddings from disk
import pickle

# NOTE(review): unpickling can execute arbitrary code; only load this file when it
# comes from a trusted source.
with open('corpus_embeddings_text_768.pickle', 'rb') as embedding_file:
    doc_embedding = pickle.load(embedding_file)

In [4]:
# Retrieval settings
top_k = 10  # number of passages we want to retrieve with the bi-encoder

# Bi-encoder used to embed the queries
bi_encoder = SentenceTransformer('intfloat/e5-base-v2')
# The 512-token max_seq_length override is not applied here, so the model's own
# setting is used.

Load pretrained SentenceTransformer: intfloat/e5-base-v2
Created a temporary directory at /var/folders/75/0dtb1gc52pdfr40bnpp97qj00000gn/T/tmp4tw5kn7b
Writing /var/folders/75/0dtb1gc52pdfr40bnpp97qj00000gn/T/tmp4tw5kn7b/_remote_module_non_scriptable.py
Use pytorch device: cpu


In [5]:
def search(query_id,input_query):
    """Retrieve the best-matching passages for a single query.

    The query text is embedded with the module-level ``bi_encoder`` and compared
    against the pre-computed ``doc_embedding`` corpus through
    ``util.semantic_search``, keeping ``top_k`` hits.

    Args:
        query_id: Identifier of the query; copied unchanged into every result.
        input_query: Query text to embed.

    Returns:
        A list of dicts, one per hit in descending score order, each with the
        keys ``'query_id'``, ``'doc_id'``, ``'score'`` and ``'text'``.
    """
    # NOTE(review): the e5 model card asks for a "query: " prefix on query text;
    # confirm how the corpus embeddings were built before adding one here.
    query_embedding = bi_encoder.encode(input_query, convert_to_tensor=True)

    # A single query goes in, so only the first result list is of interest.
    ranked_hits = util.semantic_search(query_embedding, doc_embedding, top_k=top_k)[0]
    ranked_hits.sort(key=lambda hit: hit['score'], reverse=True)

    results = []
    for hit in ranked_hits:
        corpus_pos = hit['corpus_id']
        hit_score = hit['score']
        # Newlines are flattened so that each passage stays on a single line.
        passage_text = passages[corpus_pos].replace("\n", " ")
        # df_text shares its row order with `passages`, so the same position
        # gives the matching doc_id.
        hit_doc_id = df_text.iloc[corpus_pos]['doc_id']
        results.append({
            'query_id': query_id,
            'doc_id': hit_doc_id,
            'score': hit_score,
            'text': passage_text,
        })

    return results

In [None]:
# Run the retrieval for every query; each cell of the 'text' column holds the
# list of hit dicts that search() returned for that query.
hits_per_query = df_queries.apply(
    lambda query_row: search(query_row['query_id'], query_row['text']),
    axis=1,
)
df_answers = pd.DataFrame({'text': hits_per_query})


In [7]:
# Preview the first rows: each 'text' cell is the list of hit dicts for one query.
df_answers.head()

Unnamed: 0,text
0,"[{'query_id': 3990512, 'doc_id': '3265991_12',..."
1,"[{'query_id': 714612, 'doc_id': '714612_7', 's..."
2,"[{'query_id': 2528767, 'doc_id': '2528767_3', ..."
3,"[{'query_id': 821387, 'doc_id': '1082990_5', '..."
4,"[{'query_id': 1880028, 'doc_id': '1880028_0', ..."


In [8]:
def flatten_dataframe(df):
    """Expand a frame whose cells hold lists of dicts into one row per dict.

    Args:
        df: DataFrame in which every cell is an iterable of dicts (for example
            the per-query hit lists stored in a single column).

    Returns:
        A new DataFrame built from all the dicts, taken row by row and, within
        a row, cell by cell; the dict keys become the columns.
    """
    # df.values.tolist() yields one list per row, each entry being a cell's content.
    records = [
        record
        for row_cells in df.values.tolist()
        for cell in row_cells
        for record in cell
    ]
    return pd.DataFrame(records)


In [9]:
# Turn the per-query hit lists into one row per (query, retrieved passage).
df_answers_1 = flatten_dataframe(df_answers)

In [10]:
# Preview the flattened results.
df_answers_1.head()

Unnamed: 0,query_id,doc_id,score,text
0,3990512,3265991_12,0.874376,concentration
1,3990512,2036065_1,0.864214,just by concentration
2,3990512,1900286_7,0.853578,It might be harder for concentration..you will...
3,3990512,248974_2,0.851697,"With concentration, you would do something lik..."
4,3990512,311770_3,0.846545,We have to put our mind into whatever we do 'c...


In [None]:
# Store the scores in a csv file; the passage text is left out.
# NOTE: to_csv writes the row index as an unnamed first column by default.
score_columns = ['query_id','doc_id','score']
df_answers_1 = df_answers_1[score_columns]
df_answers_1.to_csv('e5_base_v2_scores.csv')