In [1]:
import pandas as pd
import json
import helpers
from tqdm.notebook import tqdm
import numpy as np
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import multiprocessing as mp
import os 
import math
from functools import partial
from sentence_transformers import SentenceTransformer

DATA_DIR = 'data'

stemmer = PorterStemmer()
# Load the data from files
with open(f'{DATA_DIR}/corpus.jsonl', 'r') as f:
    corpus_data = {int(item['_id']): item['text'] for item in (json.loads(line) for line in f)}

with open(f'{DATA_DIR}/queries.jsonl', 'r') as f:
    queries_data = {int(item['_id']): item['text'] for item in (json.loads(line) for line in f)}

train_data = pd.read_csv(f'{DATA_DIR}/task1_train.tsv', delimiter='\t')
test_data = pd.read_csv(f'{DATA_DIR}/task1_test.tsv', delimiter='\t')

# Rename corpus-id to document_id and query-id to query_id in both train and test data
train_data = train_data.rename(columns={'corpus-id': 'document_id', 'query-id': 'query_id'})
test_data = test_data.rename(columns={'corpus-id': 'document_id', 'query-id': 'query_id'})
# Make sure that the document_id and query_id are int64
train_data['document_id'] = train_data['document_id'].astype('int64')
train_data['query_id'] = train_data['query_id'].astype('int64')

# Create a df from the corpus data
corpus_df = pd.DataFrame.from_dict(corpus_data, orient='index', columns=['text'])
# Create a df from the queries data
queries_df = pd.DataFrame.from_dict(queries_data, orient='index', columns=['text'])

In [2]:
documents = corpus_df['text'].tolist()
queries = queries_df['text'].tolist()

In [3]:
model = SentenceTransformer('all-mpnet-base-v2', device='cuda')

In [4]:
# check if bert_document_embeddings.pkl file exists
import torch
if os.path.isfile(f'{DATA_DIR}/bert_document_embeddings.pt'):
    document_embeddings = torch.load(f'{DATA_DIR}/bert_document_embeddings.pt')
else:
    document_embeddings = model.encode(documents, show_progress_bar=True)
    # write document embeddings to file
    torch.save(document_embeddings, f'{DATA_DIR}/bert_document_embeddings.pt')

In [5]:
document_embeddings.to('cuda')

tensor([[-0.0343,  0.0368,  0.0100,  ..., -0.0328, -0.0123, -0.0185],
        [-0.0350, -0.0376, -0.0377,  ..., -0.0182, -0.0011,  0.0298],
        [ 0.0097, -0.0118, -0.0038,  ...,  0.0081,  0.0623,  0.0534],
        ...,
        [-0.0496, -0.0663, -0.0232,  ...,  0.0189, -0.0258, -0.0463],
        [-0.0100, -0.0372, -0.0377,  ...,  0.0322,  0.0201, -0.0065],
        [ 0.0279, -0.0921,  0.0161,  ..., -0.0238,  0.0662,  0.0137]],
       device='cuda:0')

In [6]:
# task1_matrix = np.zeros((len(test_data), document_embeddings.shape[1]))
# create tensor of shape (len(test_data), document_embeddings.shape[1])
task1_matrix = torch.zeros((len(test_data), document_embeddings.shape[1])).to('cuda')
for index, row in tqdm(test_data.iterrows(), total=len(test_data)):
    query = queries_df.loc[row['query_id']]['text']
    query_vector = model.encode(query, show_progress_bar=False, device='cuda', convert_to_tensor=True)
    task1_matrix[index] = query_vector

  0%|          | 0/7437 [00:00<?, ?it/s]

In [7]:
from sentence_transformers import util

# document_embeddings = document_embeddings.to('cuda')
# task1_matrix = task1_matrix.to('cuda')

# # convert both matrices to float
# document_embeddings = document_embeddings.float()
# task1_matrix = task1_matrix.float()

task1_results = util.semantic_search(query_embeddings=task1_matrix, corpus_embeddings=document_embeddings, top_k=10)

In [8]:
task1_results_final = []
for results in tqdm(task1_results):
    temp = []
    for result in results:
        document_id = corpus_df.iloc[result['corpus_id']].name
        temp.append(document_id)
    task1_results_final.append(temp)

  0%|          | 0/7437 [00:00<?, ?it/s]

In [9]:
task1_results_final[0]

[7067032,
 4107182,
 2495755,
 3289525,
 4381656,
 7067034,
 3305011,
 793633,
 689657,
 3557087]

In [10]:
# load task2 test data
test_data2 = pd.read_csv(f'{DATA_DIR}/task2_test.tsv', delimiter='\t')
test_data2['corpus-id'] = test_data2['corpus-id'].apply(lambda x: eval(x))

In [11]:
# load my training data to train a custom CrossEncoder
my_train_data = pd.read_csv(f'{DATA_DIR}/my_custom_train_data2.csv')

In [12]:
# join train_data2 with corpus_df to get the text of the documents
my_train_data = my_train_data.join(corpus_df, on='corpus-id', how='left', rsuffix='_corpus')
# rename text column to corpus_text
my_train_data = my_train_data.rename(columns={'text': 'corpus_text'})
# join with queries_df to get the text of the queries
my_train_data = my_train_data.join(queries_df, on='query-id', how='left', rsuffix='_query')
# rename text column to query_text
my_train_data = my_train_data.rename(columns={'text': 'query_text'})
my_train_data['score'] = my_train_data['score'].astype('int64')

In [13]:
my_train_data.score.value_counts()

score
0    980
1    306
2    185
3     72
Name: count, dtype: int64

In [14]:
from sentence_transformers import InputExample
# Create trainining examples
training_examples = [
    InputExample(
        texts=[row['query_text'], row['corpus_text']], label=row['score']
    ) for index, row in tqdm(my_train_data.iterrows())
]

0it [00:00, ?it/s]

In [15]:
from sentence_transformers import CrossEncoder
cross_encoder = CrossEncoder('sentence-transformers/all-mpnet-base-v2', num_labels=1, device='cuda')

Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-mpnet-base-v2 and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
from torch.utils.data import DataLoader
from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator
# Create a dataloader
train_dataloader = DataLoader(training_examples, shuffle=True, batch_size=32)
num_epochs = 20
# We add an evaluator, which evaluates the performance during training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)
cross_encoder.fit(
    train_dataloader=train_dataloader,
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    output_path=f'{DATA_DIR}/cross_encoder_model3',
    show_progress_bar=True,
)

Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/49 [00:00<?, ?it/s]

Iteration:   0%|          | 0/49 [00:00<?, ?it/s]

Iteration:   0%|          | 0/49 [00:00<?, ?it/s]

Iteration:   0%|          | 0/49 [00:00<?, ?it/s]

Iteration:   0%|          | 0/49 [00:00<?, ?it/s]

Iteration:   0%|          | 0/49 [00:00<?, ?it/s]

Iteration:   0%|          | 0/49 [00:00<?, ?it/s]

Iteration:   0%|          | 0/49 [00:00<?, ?it/s]

Iteration:   0%|          | 0/49 [00:00<?, ?it/s]

Iteration:   0%|          | 0/49 [00:00<?, ?it/s]

Iteration:   0%|          | 0/49 [00:00<?, ?it/s]

Iteration:   0%|          | 0/49 [00:00<?, ?it/s]

Iteration:   0%|          | 0/49 [00:00<?, ?it/s]

Iteration:   0%|          | 0/49 [00:00<?, ?it/s]

Iteration:   0%|          | 0/49 [00:00<?, ?it/s]

Iteration:   0%|          | 0/49 [00:00<?, ?it/s]

Iteration:   0%|          | 0/49 [00:00<?, ?it/s]

Iteration:   0%|          | 0/49 [00:00<?, ?it/s]

Iteration:   0%|          | 0/49 [00:00<?, ?it/s]

Iteration:   0%|          | 0/49 [00:00<?, ?it/s]

In [17]:
cross_encoder.save(f'{DATA_DIR}/cross_encoder_model3')

In [23]:
evaluator = CECorrelationEvaluator.from_input_examples(examples=training_examples, name='my_custom_test_data2')
evaluator(cross_encoder)

0.8304611841498001

In [18]:
# from sentence_transformers import util

# # iterate row by row
# task2_results = []
# for index, row in tqdm(test_data2.iterrows(), total=len(test_data2)):
#     query = queries_df.loc[row['query-id']]['text']
#     question_embedding = model.encode(query, show_progress_bar=False, device='cuda', convert_to_tensor=True)
#     doc_list = row['corpus-id']
#     # convert doc_list from doc ids to doc indexes
#     doc_list = [corpus_df.index.get_loc(doc_id) for doc_id in doc_list]
#     doc_embeds = document_embeddings[doc_list]
#     # find cosine similarity between question_embedding and doc_embeddings
#     cos_scores = util.cos_sim(question_embedding, doc_embeds)
#     # make all the scores positive and append to task2_results
#     task2_results.append([math.exp(score) for score in cos_scores[0]])

#     # hits = row['corpus-id']

#     # cross_inp = [[query, corpus_df.loc[hit].text] for hit in hits]
#     # cross_scores = cross_encoder.predict(cross_inp)
#     # # make all the scores positive
#     # cross_scores = [math.exp(score) for score in cross_scores]

#     # task2_results.append(cross_scores)

In [19]:
from sentence_transformers import util

# iterate row by row
task2_results = []
for index, row in tqdm(test_data2.iterrows(), total=len(test_data2)):
    query = queries_df.loc[row['query-id']]['text']


    hits = row['corpus-id']

    cross_inp = [[query, corpus_df.loc[hit].text] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)
    # cross_scores is currently of shape len(hits)x4
    # we need to get the label with the highest score
    # cross_scores = np.argmax(cross_scores, axis=1)
    # make all the scores positive
    # cross_scores = [math.exp(score) for score in cross_scores]

    task2_results.append(cross_scores)

  0%|          | 0/33 [00:00<?, ?it/s]

In [20]:
# save task2 results to a pickle file
import pickle
with open(f'{DATA_DIR}/task2_results_custom.pkl', 'wb') as f:
    pickle.dump(task2_results, f)

In [21]:
# create a csv file for submission
# HEADER: id,corpus-id,score
# task1 results will be: query-id,[corpus-id1, corpus-id2, ...] (top 10 corpus-ids), -1
# task2 results will be query-id, -1, [score1, score2...] 
# create the file

with open(f'{DATA_DIR}/sentence-transformers_submission7.csv', 'w') as f:
    id = 0
    f.writelines('id,corpus-id,score\n')
    for i, row in enumerate(task1_results_final):
        to_write = "\"["
        for j, corpus_id in enumerate(row):
            if j != len(row)-1:
                to_write += str(corpus_id) + ", "
            else:
                to_write += str(corpus_id)
        to_write += "]\""
        f.write(str(id) + "," + to_write + ",-1\n")
        id += 1

    for i,row in enumerate(task2_results):
        query_id_to_write = test_data2.iloc[i]['query-id']
        to_write = "\"["
        for j, score in enumerate(row):
            if j != len(row)-1:
                to_write += str(score) + ", "
            else:
                to_write += str(score) 
        to_write += "]\""
        f.write(str(id) + ",-1," + to_write + "\n")
        id += 1