In [None]:
# %pip install pandas scikit-learn matplotlib seaborn jsonlines tqdm

In [1]:
import pandas as pd
import json

DATA_DIR = 'data'

# Load the data from files
with open(f'{DATA_DIR}/corpus.jsonl', 'r') as f:
    corpus_data = {int(item['_id']): item['text'] for item in (json.loads(line) for line in f)}

with open(f'{DATA_DIR}/queries.jsonl', 'r') as f:
    queries_data = {int(item['_id']): item['text'] for item in (json.loads(line) for line in f)}

train_data = pd.read_csv(f'{DATA_DIR}/task1_train.tsv', delimiter='\t')
test_data = pd.read_csv(f'{DATA_DIR}/task1_test.tsv', delimiter='\t')

# Rename corpus-id to document_id and query-id to query_id in both train and test data
train_data = train_data.rename(columns={'corpus-id': 'document_id', 'query-id': 'query_id'})
test_data = test_data.rename(columns={'corpus-id': 'document_id', 'query-id': 'query_id'})
# Make sure that the document_id and query_id are int64
train_data['document_id'] = train_data['document_id'].astype('int64')
train_data['query_id'] = train_data['query_id'].astype('int64')

In [2]:
# Create a df from the corpus data
corpus_df = pd.DataFrame.from_dict(corpus_data, orient='index', columns=['text'])
# Create a df from the queries data
queries_df = pd.DataFrame.from_dict(queries_data, orient='index', columns=['text'])

In [3]:
# Apply vectorizer to the corpus
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english')
corpus_vectors = vectorizer.fit_transform(corpus_df['text'])

In [4]:
# Below 2 cells are just for visualization and double-checking purposes
corpus_vectors_df = pd.DataFrame.sparse.from_spmatrix(corpus_vectors, index=corpus_df.index, columns=vectorizer.get_feature_names_out())

In [5]:
# print the columns that are larger than 0 for the first row
display(corpus_vectors_df.loc[0, corpus_vectors_df.loc[0] > 0])
# Also print the text for the first row
display(corpus_df.loc[0].text)

# Do all the non-zero values of the first row exist in the text?
print(all([word in corpus_df.loc[0].text.lower() for word in corpus_vectors_df.loc[0, corpus_vectors_df.loc[0] > 0].index]))

achievement      0.196173
amid             0.219827
atomic           0.176855
cloud            0.171244
communication    0.161206
engineers        0.183345
equally          0.185823
hanging          0.192908
hundreds         0.169618
important        0.113235
impressive       0.192228
innocent         0.215891
intellect        0.238027
lives            0.150731
manhattan        0.189711
meant            0.170209
minds            0.204465
obliterated      0.279503
presence         0.155866
project          0.141383
researchers      0.169291
scientific       0.319888
success          0.317699
thousands        0.156374
truly            0.176543
Name: 0, dtype: Sparse[float64, 0]

'The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.'

True


In [28]:
display(corpus_vectors_df.loc[5117689, corpus_vectors_df.loc[5117689] > 0])

additional      0.137694
berkeley        0.220726
chicago         0.163381
conducted       0.179115
district        0.152207
engineer        0.181889
laboratories    0.207140
locations       0.155175
major           0.124650
manhattan       0.579110
mexico          0.154124
new             0.091710
operations      0.323544
place           0.112342
project         0.287723
remote          0.177976
research        0.133311
site            0.133766
states          0.099844
tennessee       0.171234
took            0.147568
university      0.133413
washington      0.144723
Name: 5117689, dtype: Sparse[float64, 0]

In [10]:
from sklearn.metrics.pairwise import linear_kernel

# Now for a given query, we can compute the cosine similarity between the query and all the documents
# Then return the top 10 documents that are most similar to the query

# For example, let's say we want to find the most similar documents to the first query in train data
query_id = train_data['query_id'][0]
# Get the query text
query = queries_df['text'][query_id]
print(f'Predicting for query: "{queries_data[query_id]}"')

# Vectorize the query
query_vector = vectorizer.transform([query])

# Compute the cosine similarity between the query and all the documents
cosine_similarities = linear_kernel(query_vector, corpus_vectors).flatten()

# Get the top 10 most similar documents
related_docs_indices = cosine_similarities.argsort()[:-10:-1]
print(related_docs_indices)
for index in related_docs_indices:
    print(f'Document ID: {corpus_df.index.values[index]}')
    print(f'Text: "{corpus_df.iloc[index].text}"')
    print(f'Similarity: {cosine_similarities[index]}')
    print()

Predicting for query: ")what was the immediate impact of the success of the manhattan project?" )what was the immediate impact of the success of the manhattan project? 1185869
[1214391  590437 1116515 1231205  428535  146155 1021116 1248393  513144]
Document ID: 3607205
Text: "Manhattan Project. 1  The Manhattan Project was a secret military project created in 1942 to produce the first US nuclear weapon. Fears that Nazi Germany would build and use a nuclear weapon during World War II triggered the start of the Manhattan Project, which was originally based in Manhattan, New York."
Similarity: 0.4853277771970198

Document ID: 7243450
Text: "The project was given its name due to the fact that at least 10 of the sites used for the research were located in Manhattan. Following is a timeline of the key events related to the development of the atomic bomb and the Manhattan Project. Manhattan Project Timeline"
Similarity: 0.4780633057447168

Document ID: 2036644
Text: "Manhattan Project. The M

In [7]:
grouped_df = train_data.groupby('query_id').agg({'document_id': list, 'score': list}).reset_index()

In [8]:
# Only return rows that have multiple corpus-ids
grouped_df[grouped_df['document_id'].map(len) > 1]

Unnamed: 0,query_id,document_id,score
73,202,"[889051, 153578]","[1, 1]"
126,374,"[4086051, 4086052, 4086057]","[1, 1, 1]"
204,623,"[2890972, 2890975]","[1, 1]"
247,797,"[6906973, 6906977]","[1, 1]"
265,857,"[5529882, 5529883]","[1, 1]"
...,...,...,...
502810,1185631,"[137014, 409549]","[1, 1]"
502822,1185661,"[8338479, 8338480]","[1, 1]"
502844,1185698,"[5044388, 5044391]","[1, 1]"
502847,1185702,"[4620566, 4620568]","[1, 1]"
