In [1]:
import pandas as pd
import json
import helpers
from tqdm.notebook import tqdm
import numpy as np
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import multiprocessing as mp
import os 
import math
from functools import partial
from sentence_transformers import SentenceTransformer

DATA_DIR = 'data'

stemmer = PorterStemmer()


def _load_id_to_text(path):
    """Parse a JSON Lines file into a dict mapping int(`_id`) -> `text`.

    Every line is decoded with json.loads and must provide `_id` and `text`.
    The file is opened as UTF-8 explicitly so the decoded text does not depend
    on the platform's default encoding.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return {int(item['_id']): item['text'] for item in map(json.loads, f)}


# Load the data from files
corpus_data = _load_id_to_text(f'{DATA_DIR}/corpus.jsonl')
queries_data = _load_id_to_text(f'{DATA_DIR}/queries.jsonl')

train_data = pd.read_csv(f'{DATA_DIR}/task1_train.tsv', delimiter='\t')
test_data = pd.read_csv(f'{DATA_DIR}/task1_test.tsv', delimiter='\t')

# Rename corpus-id to document_id and query-id to query_id in both train and test data
id_column_names = {'corpus-id': 'document_id', 'query-id': 'query_id'}
train_data = train_data.rename(columns=id_column_names)
test_data = test_data.rename(columns=id_column_names)
# Make sure that the document_id and query_id are int64 (only the train split is cast here)
train_data['document_id'] = train_data['document_id'].astype('int64')
train_data['query_id'] = train_data['query_id'].astype('int64')

# Create a df from the corpus data (index = document id, single column 'text')
corpus_df = pd.DataFrame.from_dict(corpus_data, orient='index', columns=['text'])
# Create a df from the queries data (index = query id, single column 'text')
queries_df = pd.DataFrame.from_dict(queries_data, orient='index', columns=['text'])

In [2]:
# Plain Python lists of the passage and query texts, in the row order of their DataFrames
documents = corpus_df['text'].to_list()
queries = queries_df['text'].to_list()

In [3]:
# Load the pretrained sentence-embedding model onto the GPU
MODEL_NAME = 'all-mpnet-base-v2'
model = SentenceTransformer(MODEL_NAME, device='cuda')

Downloading (…)a8e1d/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)0bca8e1d/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e1d/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)a8e1d/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)8e1d/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)bca8e1d/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [4]:
# check if bert_document_embeddings.pkl file exists
import pickle

embeddings_cache = f'{DATA_DIR}/bert_document_embeddings.pkl'
if os.path.isfile(embeddings_cache):
    # pickle.load expects an open binary file object, not a path string
    # NOTE(review): unpickling can execute arbitrary code; only load a cache this notebook wrote
    with open(embeddings_cache, 'rb') as f:
        document_embeddings = pickle.load(f)
else:
    # No cache yet: embed every document, then persist the result for later runs
    document_embeddings = model.encode(documents, show_progress_bar=True)
    # write document embeddings to file
    with open(embeddings_cache, 'wb') as f:
        pickle.dump(document_embeddings, f)

Batches:   0%|          | 0/45982 [00:00<?, ?it/s]

TypeError: 'str' object cannot be interpreted as an integer

In [6]:
# Embed the first query on its own
first_query = queries[0]
query_embedding = model.encode(first_query, show_progress_bar=True)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
# Display the shape of the embedding computed for the first query
query_embedding.shape

(768,)

In [29]:
from sentence_transformers import util
from sklearn.metrics.pairwise import cosine_similarity
import torch
# compute cosine similarity
# Put the query embedding on the GPU as a tensor. torch.as_tensor accepts an ndarray
# or an existing tensor, so re-running this cell does not trigger torch.tensor's
# copy-construct warning once query_embedding already holds a tensor.
query_embedding = torch.as_tensor(query_embedding, device='cuda')
# Reshape to a single-row matrix so the result holds one row of scores for this query
# NOTE(review): assumes document_embeddings is already a CUDA tensor at this point — confirm execution order
cos_scores = util.cos_sim(query_embedding.reshape(1, -1), document_embeddings)

In [30]:
# Display the shape of the document embedding matrix
document_embeddings.shape

torch.Size([1471406, 768])

In [31]:
import torch
# Keep the ten highest similarity scores together with their positions
TOP_K = 10
top_results = cos_scores.topk(TOP_K)

In [32]:
# Print every top-ranked passage with its corpus id and its similarity to the query.
# top_results[1] holds the positions returned by topk; row 0 belongs to the single query.
for top_k_id in top_results[1][0].tolist():
    # Translate the row position back into the corpus id stored in the DataFrame index
    document_id = corpus_df.iloc[top_k_id].name
    passage = corpus_df.loc[document_id].text
    score = cos_scores[0][top_k_id]
    print(
        f'Document id {document_id}',
        f'{passage}',
        f'Similarity score: {score}',
        '________________________',
        '',
        sep='\n',
    )

Document id 0
The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.
Similarity score: 0.6887286901473999
________________________

Document id 2422197
President Harry Truman learns the full details of the Manhattan Project, in which scientists are attempting to create the first atomic bomb, on this day in 1945. The information thrust upon Truman a momentous decision: whether or not to use the worldâs first weapon of mass destruction.
Similarity score: 0.6863462924957275
________________________

Document id 2681219
President Harry Truman learns the full details of the Manhattan Project, in which scientists are attempting to create the first atomic bomb, on this day in 1945.The information thrust upon Truman a momento

In [25]:
# One row per test query, one column per embedding dimension
task1_matrix = np.zeros((len(test_data), document_embeddings.shape[1]))

# Enumerate row positions explicitly: iterrows() yields index *labels*, which only
# coincide with positions in task1_matrix when test_data has a default 0..n-1 index.
for position, query_id in enumerate(tqdm(test_data['query_id'], total=len(test_data))):
    query = queries_df.loc[query_id, 'text']
    query_vector = model.encode(query, show_progress_bar=False, device='cuda', convert_to_tensor=True)
    # NumPy cannot read GPU memory, so copy the embedding to the host before storing it
    task1_matrix[position] = query_vector.cpu().numpy()

  0%|          | 0/7437 [00:00<?, ?it/s]

In [26]:
# convert document_embeddings & task1 matrix to torch tensors
# torch.as_tensor wraps a NumPy array without copying it (avoiding a second copy of the
# full embedding matrix) and returns an existing tensor unchanged, so this cell can be re-run.
document_embeddings = torch.as_tensor(document_embeddings)
task1_matrix = torch.as_tensor(task1_matrix)

In [33]:
# Move both matrices to the GPU, then cast them to float
document_embeddings = document_embeddings.to('cuda').float()
task1_matrix = task1_matrix.to('cuda').float()

# For every query row, retrieve its 10 best-matching corpus rows
task1_results = util.semantic_search(task1_matrix, document_embeddings, top_k=10)

In [54]:
# Translate each hit's row position ('corpus_id') into the document id kept in corpus_df's index,
# producing one list of document ids per query
task1_results_final = [
    [corpus_df.iloc[hit['corpus_id']].name for hit in query_hits]
    for query_hits in tqdm(task1_results)
]

  0%|          | 0/7437 [00:00<?, ?it/s]

In [55]:
# Show the document ids retrieved for the first test query
task1_results_final[0]

[7067032,
 4107182,
 2495755,
 3289525,
 4381656,
 7067034,
 3305011,
 793633,
 689657,
 3557087]

In [60]:
# load task2 test data
import ast

test_data2 = pd.read_csv(f'{DATA_DIR}/task2_test.tsv', delimiter='\t')
# Each 'corpus-id' cell is parsed from its text form into a Python object.
# NOTE(review): this used eval(), which would execute any expression found in the file;
# ast.literal_eval only accepts Python literals. Assumes the cells are plain literals
# such as "[1, 2, 3]" — confirm against the data.
test_data2['corpus-id'] = test_data2['corpus-id'].apply(ast.literal_eval)

In [57]:
from sentence_transformers import CrossEncoder

# Cross-encoder used to re-rank the candidate passages of a query
CROSS_ENCODER_NAME = 'cross-encoder/ms-marco-MiniLM-L-6-v2'
cross_encoder = CrossEncoder(CROSS_ENCODER_NAME)

Downloading (…)lve/main/config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [85]:
# Score every candidate passage of each task-2 query with the cross-encoder
task2_results = []
for _, test_row in tqdm(test_data2.iterrows(), total=len(test_data2)):
    query = queries_df.loc[test_row['query-id']]['text']
    # NOTE(review): this bi-encoder embedding is not read again anywhere in this cell
    question_embedding = model.encode(
        query, show_progress_bar=False, device='cuda', convert_to_tensor=True
    ).cuda()

    # Candidate document ids supplied with this query
    hits = test_row['corpus-id']

    # One [query, passage] pair per candidate
    cross_inp = [[query, corpus_df.loc[candidate_id].text] for candidate_id in hits]
    raw_scores = cross_encoder.predict(cross_inp)
    # exp() turns every raw score into a strictly positive number
    cross_scores = list(map(math.exp, raw_scores))

    task2_results.append(cross_scores)

  0%|          | 0/33 [00:00<?, ?it/s]

In [87]:
# create a csv file for submission
# HEADER: id,corpus-id,score
# task1 rows: id,"[corpus-id1, corpus-id2, ...]",-1   (the retrieved corpus ids of one query)
# task2 rows: id,-1,"[score1, score2, ...]"           (the re-ranking scores of one query)
# The id column is a running counter that continues from the task1 rows into the task2 rows.


def _quoted_list(values):
    """Render values as a double-quoted, bracketed, ', '-separated list, e.g. "[1, 2, 3]"."""
    return '"[' + ', '.join(str(value) for value in values) + ']"'


with open(f'{DATA_DIR}/sentence-transformers_submission.csv', 'w') as f:
    f.write('id,corpus-id,score\n')
    # Named row_id rather than id so the builtin id() is not shadowed in later cells
    row_id = 0
    for corpus_ids in task1_results_final:
        f.write(f'{row_id},{_quoted_list(corpus_ids)},-1\n')
        row_id += 1

    for scores in task2_results:
        f.write(f'{row_id},-1,{_quoted_list(scores)}\n')
        row_id += 1