# Retrieval_and_Rerank

## Reading the Documents for Embedding

In [1]:
import pandas as pd
import numpy as np

In [2]:
# "ANTIQUE is a non-factoid quesiton answering dataset based on the questions and answers of Yahoo! Webscope L6." 
# on "https://ir-datasets.com/antique.html"
# Documents: Short answer passages (from Yahoo Answers)
# Queries: Natural language questions (from Yahoo Answers)

#Read the csv file
df = pd.read_csv('antique_sample_404k.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 403666 entries, 0 to 403665
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  403666 non-null  int64 
 1   doc_id      403666 non-null  object
 2   text        403662 non-null  object
dtypes: int64(1), object(2)
memory usage: 9.2+ MB


In [4]:
#Combining the document id and the document text
df['comb_text'] = df['doc_id'] + ', ' + df['text']

In [5]:
#Converting all the rows to string datatype
df1 = df[['comb_text']].astype(str) #.head(5000)

In [6]:
#Appending each document into a list
passages = []
for index, row in df.iterrows():
    passages.append(str(row['comb_text']))

In [7]:
print(type(passages))

<class 'list'>


In [8]:
passages[0:10]

["2020338_0, A small group of politicians believed strongly that the fact that Saddam Hussien remained in power after the first Gulf War was a signal of weakness to the rest of the world, one that invited attacks and terrorism. Shortly after taking power with George Bush in 2000 and after the attack on 9/11, they were able to use the terrorist attacks to justify war with Iraq on this basis and exaggerated threats of the development of weapons of mass destruction. The military strength of the U.S. and the brutality of Saddam's regime led them to imagine that the military and political victory would be relatively easy.",
 '2020338_1, Because there is a lot of oil in Iraq.',
 '2020338_2, It is tempting to say that the US invaded Iraq because it has lots of oil, but the US is not a country in a deep economic problem that capturing other country’s oil is an actual need for survival. It is more likely that the Iraq invading Kuwait scenario would fall under that assumption.. I think that the 

## Embedding the Documents

In [9]:
import json
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import gzip
import os
import torch

bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
bi_encoder.max_seq_length = 256     #Truncate long passages to 256 tokens
top_k = 32                          #Number of passages we want to retrieve with the bi-encoder

#The bi-encoder will retrieve 32 documents. We use a cross-encoder, to re-rank the results list to improve the quality
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')


In [13]:
# Embedding the documents using Bi-Encoder
corpus_embeddings = bi_encoder.encode(passages, convert_to_tensor=True, show_progress_bar=True)

Batches:   0%|          | 0/12615 [00:00<?, ?it/s]

In [17]:
#Store the corpus_embeddings as pickle file for backup
import pickle
with open('corpus_embeddings.pickle', 'wb') as pkl:
    pickle.dump(corpus_embeddings, pkl)

In [10]:
#Loading the corpus_embeddings.pickle file
import pickle
with open('corpus_embeddings.pickle', 'rb') as pkl:
    doc_embedding = pickle.load(pkl)

In [11]:
# We also compare the results to lexical search (keyword search). Here, we use 
# the BM25 algorithm which is implemented in the rank_bm25 package.

from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string
from tqdm.autonotebook import tqdm
import numpy as np


# We lower case our text and remove stop-words from indexing
def bm25_tokenizer(text):
    tokenized_doc = []
    for token in text.lower().split():
        token = token.strip(string.punctuation)

        if len(token) > 0 and token not in _stop_words.ENGLISH_STOP_WORDS:
            tokenized_doc.append(token)
    return tokenized_doc


tokenized_corpus = []
for passage in tqdm(passages):
    tokenized_corpus.append(bm25_tokenizer(passage))

bm25 = BM25Okapi(tokenized_corpus)

  0%|          | 0/403666 [00:00<?, ?it/s]

In [12]:
# This function will search all the articles for passages that
# answer the query
def search(query):
    print("Input question:", query)

    ##### BM25 search (lexical search) #####
    bm25_scores = bm25.get_scores(bm25_tokenizer(query))
    top_n = np.argpartition(bm25_scores, -5)[-5:]
    bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
    bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
    
    print("Top-3 lexical search (BM25) hits")
    for hit in bm25_hits[0:3]:
        print("\t{:.3f}\t{}".format(hit['score'], passages[hit['corpus_id']].replace("\n", " ")))
        

    ##### Sematic Search #####
    # Encode the query using the bi-encoder and find potentially relevant passages
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    # question_embedding = question_embedding.cuda()
    hits = util.semantic_search(question_embedding, doc_embedding, top_k=top_k)
    hits = hits[0]  # Get the hits for the first query

    ##### Re-Ranking #####
    # Now, score all retrieved passages with the cross_encoder
    cross_inp = [[query, passages[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)

    # Sort results by the cross-encoder scores
    for idx in range(len(cross_scores)):
        hits[idx]['cross-score'] = cross_scores[idx]

    # Output of top-5 hits from bi-encoder
    print("\n-------------------------\n")
    print("Top-3 Bi-Encoder Retrieval hits")
    hits = sorted(hits, key=lambda x: x['score'], reverse=True)
    for hit in hits[0:3]:
        print("\t{:.3f}\t{}".format(hit['score'], passages[hit['corpus_id']].replace("\n", " ")))

    # Output of top-5 hits from re-ranker
    print("\n-------------------------\n")
    print("Top-3 Cross-Encoder Re-ranker hits")
    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
    for hit in hits[0:3]:
        print("\t{:.3f}\t{}".format(hit['cross-score'], passages[hit['corpus_id']].replace("\n", " ")))


In [15]:
search(query = "What causes severe swelling and pain in the knees?")

Input question: What causes severe swelling and pain in the knees?
Top-3 lexical search (BM25) hits
	22.685	3241109_2, Indomethacin is an anaglesic used for moderate to severe arthritis.. It aids in the reduction of swelling and pain.
	19.330	2606613_0, In a mild wrist sprain maybe slightly swollen and tender. And probably feel mild pain when you move it.. .      Severe pain swelling can change the shape of your wrist and you can may have some bruising (a black and blue discoloration)
	19.323	2818197_2, Tweety bird is right on.  I had one yesterday and aside from tenderness, I am fine.  Just make sure there is no severe pain, swelling or redness at the sight where the biopsy was done.

-------------------------

Top-3 Bi-Encoder Retrieval hits
	0.756	3097310_4, I'm not sure about your other problems, but the only time my knees swell and hurt is when I pull and stretch the tendons.
	0.709	1658637_2, ahhh i had same problem once. I went to the dr's and i had lots of fluid around my knee.

## Calculating Recall

In [13]:
#Loading the Testing Queries
df_query_test = pd.read_csv('antique_query_test.csv')

In [14]:
df_query_test = df_query_test[['query_id','text']]

In [15]:
def search_cross_encoder(input_query):
    ##### Sematic Search #####
    # Encode the query using the bi-encoder and find potentially relevant passages
    question_embedding = bi_encoder.encode(input_query, convert_to_tensor=True)
    # question_embedding = question_embedding.cuda()
    hits = util.semantic_search(question_embedding, doc_embedding, top_k=top_k)
    hits = hits[0]  # Get the hits for the first query

    ##### Re-Ranking #####
    # Now, score all retrieved passages with the cross_encoder
    cross_inp = [[input_query, passages[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)

    # Sort results by the cross-encoder scores
    for idx in range(len(cross_scores)):
        hits[idx]['cross-score'] = cross_scores[idx]

    # Output of top hit from re-ranker
    print("\n-------------------------\n")
    print("Top-3 Cross-Encoder Re-ranker hits")
    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
    for hit in hits[0:1]:
        output_answer = passages[hit['corpus_id']].replace("\n", " ")
        # print("\t{:.3f}\t{}".format(hit['cross-score'], passages[hit['corpus_id']].replace("\n", " ")))
        return output_answer

In [16]:
search_cross_encoder(input_query = "Why doesn't the water fall off  earth if it's round?")


-------------------------

Top-3 Cross-Encoder Re-ranker hits


"714612_0, This goes along with the question of why don't we fall off the earth if it is round. The answer is because gravity is holding us (and the water) down."

In [None]:
#Getting answers for all the test queries
df_query_test['result'] = df_query_test['text'].apply(search_cross_encoder)


In [33]:
df_query_test.to_csv('test_answers.csv')

In [34]:
#Seperating the doc_id from the retrieved answers
import re
df_query_test['doc_id'] = df_query_test['result'].apply(lambda x: re.search(r'(\d+_?\d*)', x).group(1))


In [42]:
#Reading the Query Relevance File
df_qrel_test = pd.read_csv('antique_qurel_test.csv')

In [43]:
df_qrel_test = df_qrel_test[['query_id','doc_id', 'relevance']]

In [148]:
#Only considering the Query and Answer pair with highest relevance
relevant_instances = df_qrel_test[df_qrel_test['relevance'].isin([4])]


In [149]:
relevant_instances 

Unnamed: 0,query_id,doc_id,relevance
0,1964316,1964316_5,4
8,1964316,1964316_3,4
9,1964316,1964316_2,4
11,1964316,1964316_0,4
29,1964316,369616_4,4
...,...,...,...
6565,1262692,1262692_3,4
6567,1262692,1262692_5,4
6568,1262692,1262692_6,4
6572,1262692,986052_4,4


In [150]:
eval_ground_truth_df = relevant_instances[['query_id','doc_id']]

In [151]:
eval_predictions_df = df_query_test[['query_id','doc_id']]

In [152]:
import numpy as np
from sklearn.metrics import recall_score
import seaborn as sns
import matplotlib.pyplot as plt

In [153]:
eval_predictions_df = eval_predictions_df.drop_duplicates(subset='query_id')
eval_ground_truth_df = eval_ground_truth_df.drop_duplicates(subset='query_id')

In [154]:
# Merge the two dataframes on 'query_id' to align ground truth and predictions
merged_df = eval_predictions_df.merge(eval_ground_truth_df, on='query_id', how='left', suffixes=('_pred', '_truth'))

Recall: 0.125


In [155]:
merged_df

Unnamed: 0,query_id,doc_id_pred,doc_id_truth
0,3990512,3931664_2,3990512_1
1,714612,714612_0,714612_0
2,2528767,2528767_3,2528767_2
3,821387,821387_3,821387_0
4,1880028,1880028_0,1880028_3
...,...,...,...
195,2192891,2192891_1,2192891_7
196,4406669,209084_8,4406669_2
197,1582877,1582877_14,1582877_14
198,1340574,1340574_2,1340574_0


In [156]:
y_test = merged_df['doc_id_truth'].tolist()
y_pred = merged_df['doc_id_pred'].tolist()


In [None]:
import evaluate
recall_metric = evaluate.load("recall")

In [158]:
#Recall score
recall_score = recall_metric.compute(references=y_test, predictions=y_pred, average='micro')
print(recall_score)

{'recall': 0.125}
