In [2]:
import pandas as pd
import numpy as np
import os
import json
import re

In [3]:
PATH_TO_EMBEDS = '/home/aayush/Documents/pclub_secy/task_4/compressed_array.npz'
PATH_TO_DF = '/home/aayush/Documents/pclub_secy/task_4/compressed_dataframe.csv.gz'

In [4]:
def run_faiss_search(query_text, top_k):
    query = [query_text]
    query_embedding = model.encode(query)
    scores, index_vals = faiss_index.search(query_embedding, top_k)
    index_vals_list = index_vals[0]
    
    return index_vals_list
    

def run_rerank(index_vals_list, query_text):  
    chunk_list = list(df_data['paragraph'])
    pred_strings_list = [chunk_list[item] for item in index_vals_list]
    cross_input_list = []
    for item in pred_strings_list:
        new_list = [query_text, item]
        cross_input_list.append(new_list)
    df = pd.DataFrame(cross_input_list, columns=['query_text', 'pred_text'])
    df['original_index'] = index_vals_list
    cross_scores = cross_encoder.predict(cross_input_list)
    df['cross_scores'] = cross_scores
    df_sorted = df.sort_values(by='cross_scores', ascending=False)
    df_sorted = df_sorted.reset_index(drop=True)

    pred_list = []

    for i in range(0,len(df_sorted)):
        text = df_sorted.loc[i, 'pred_text']
        original_index = df_sorted.loc[i, 'original_index']
        item = {
            'abstract': text
        }
        pred_list.append(item)
    return pred_list


def print_search_results(pred_list, num_results_to_print):
    for i in range(0,num_results_to_print):      
        pred_dict = pred_list[i]
        para = pred_dict['abstract']
        print('Abstract:',para)
        print()
    
   
def run_search(query_text, num_results_to_print, top_k=5):
    pred_index_list = run_faiss_search(query_text, top_k)
    pred_list = run_rerank(pred_index_list, query_text)
    return pred_list

In [5]:
embeddings = np.load(PATH_TO_EMBEDS)
embeddings = embeddings['array_data']
embeddings.shape

(14756, 384)

In [6]:
df_data = pd.read_csv(PATH_TO_DF, compression='gzip')
print(df_data.shape)

(14756, 3)


In [7]:
import faiss
embed_length = embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(embed_length)
faiss_index.add(embeddings)
faiss_index.is_trained

True

In [8]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-MiniLM-L6-v2")

In [9]:
from sentence_transformers import CrossEncoder
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [34]:
query_csv = pd.read_csv('queries.csv')

In [35]:
query_csv.head()

Unnamed: 0,Question,Answer_possible,Answer_text
0,What is the parts of the trophy?,True,"['-the cup itself, plus a lid and a base']"
1,"If you need a date of four days less, what do ...",True,['-4']
2,What was Hanover's attitude toward Austria?,True,['-belligerent Hanover']
3,What is an example of a suffix used by Old Eng...,True,['-de']
4,The Modern English ending -'s is derived from ...,True,['-es']


In [36]:
query_csv["Question"] = query_csv["Question"].str.replace("'", "\\'", regex=False)
query_csv["Question"] = query_csv["Question"].str.replace('"', '\\"', regex=False)

In [38]:
ques_list = list(query_csv["Question"])

In [39]:
query_csv["Answer_text"] = query_csv["Answer_text"].str.replace("'", "\\'", regex=False)
query_csv["Answer_text"] = query_csv["Answer_text"].str.replace('"', '\\"', regex=False)
ans_list = []
ans_list = list(query_csv["Answer_text"])

In [40]:
def stripper(s):
    pattern = re.compile(r'^[^a-zA-Z0-9?!]+|[^a-zA-Z0-9?!]+$')
    while pattern.match(s):
        s = pattern.sub('', s)
    
    return s

In [41]:
ct = 0
for i in range(len(ans_list)):
    ans_list[i] = stripper(ans_list[i])

In [42]:
ans_list

['the cup itself, plus a lid and a base',
 '4',
 'belligerent Hanover',
 'de',
 'es',
 'FA Cup matches are shown live by ITV across England and Wales, with UTV broadcasting to Northern Ireland',
 'generally that of the culture in which they were raised',
 'it becomes law without his signature',
 'it may be no more than fortuitous that more witnesses have survived that present a particular reading',
 'New Mexico Spanish, and in particular the Spanish of northern New Mexico and Colorado has retained many elements of 16th- and 17th-century Spanish',
 'researchers have adapted and developed a number of measures for assessing both infants’ recognition memory and their recall memory',
 'the British',
 'the continual influx of Spanish-speaking immigrants increased the import of Spanish in Texas',
 'the Cubs',
 'the presentation is made at the Royal Box',
 'the system remains extremely underdeveloped and poor, with severe shortages of medical supplies',
 'their arousal patterns being in line w

In [18]:
top_5_list = []
top_list = []

In [43]:
ans_list[39]

'you end up getting us stuck in a war in Iraq. Just ask President Bush'

In [25]:
query = "What was Kerry supposed to say when he \'botched a joke\'?"
resp = run_search(query,5)

In [26]:
resp

[{'abstract': 'Kerry said that he had intended the remark as a jab at President Bush, and described the remarks as a \\"botched joke\\", having inadvertently left out the key word \\"us\\" (which would have been, \\"If you don\\\'t, you get us stuck in Iraq\\"), as well as leaving the phrase \\"just ask President Bush\\" off of the end of the sentence. In Kerry\\\'s prepared remarks, which he released during the ensuing media frenzy, the corresponding line was \\"... you end up getting us stuck in a war in Iraq. Just ask President Bush.\\" He also said that from the context of the speech which, prior to the \\"stuck in Iraq\\" line, made several specific references to Bush and elements of his biography, that Kerry was referring to President Bush and not American troops in general.'},
 {'abstract': 'In the lead up to the Iraq War, Kerry said on October 9, 2002; \\"I will be voting to give the President of the United States the authority to use force, if necessary, to disarm Saddam Husse

In [32]:
s = 'John Kerry selected John Edwards'
for c in resp:
    if s in c['abstract']:
        print(c)

{'abstract': 'On July 6, John Kerry selected John Edwards as his running mate, shortly before the 2004 Democratic National Convention in Boston, held later that month. Days before Kerry announced Edwards as his running mate, Kerry gave a short list of three candidates: Sen John Edwards, Rep Dick Gephardt, and Gov Tom Vilsack. Heading into the convention, the Kerry/Edwards ticket unveiled their new slogan—a promise to make America \\"stronger at home and more respected in the world.\\" Kerry made his Vietnam War experience the prominent theme of the convention. In accepting the nomination, he began his speech with, \\"I\\\'m John Kerry and I\\\'m reporting for duty.\\" He later delivered what may have been the speech\\\'s most memorable line when he said, \\"the future doesn\\\'t belong to fear, it belongs to freedom\\", a quote that later appeared in a Kerry/Edwards television advertisement.'}


In [None]:
ctt = 0
for i in ques_list:
    query_text = i
    num_results_to_print = 5
    response = run_search(str(query_text), num_results_to_print)
    # print(response)
    top_5_list.append(response)
    ans = ans_list[ctt]
    ctt+=1
    # print(response)
    flag = 0
    for c in response:
        if ans in c['abstract']:
            top_list.append(c)
            flag = 1
            break
    if(flag==0):
        top_list.append(response[0]['abstract'])       
    

# RUN THE SEARCH


In [45]:
print(len(top_5_list))

30045


In [47]:
print(len(top_list))

13858
