In [5]:
import json
import math
import multiprocessing as mp
import os
from functools import partial

import numpy as np
import pandas as pd
import torch
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from tqdm.notebook import tqdm

import helpers

DATA_DIR = 'data'

stemmer = PorterStemmer()


def load_id_text_map(path):
    """Read a JSON Lines file into a ``{int(_id): text}`` dictionary.

    Every non-blank line must hold a JSON object with ``_id`` and ``text``
    keys. The file is decoded explicitly as UTF-8 so the result does not
    depend on the platform's default encoding, and blank lines (e.g. a
    trailing newline at the end of the file) are skipped instead of
    crashing ``json.loads``.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return {
            int(item['_id']): item['text']
            for item in (json.loads(line) for line in f if line.strip())
        }


# Load the data from files
corpus_data = load_id_text_map(f'{DATA_DIR}/corpus.jsonl')
queries_data = load_id_text_map(f'{DATA_DIR}/queries.jsonl')

# Rename corpus-id to document_id and query-id to query_id in both train and test data
id_column_names = {'corpus-id': 'document_id', 'query-id': 'query_id'}

# Make sure that the document_id and query_id are int64 in the training data
train_data = (
    pd.read_csv(f'{DATA_DIR}/task1_train.tsv', delimiter='\t')
    .rename(columns=id_column_names)
    .astype({'document_id': 'int64', 'query_id': 'int64'})
)

# NOTE(review): the test ids keep whatever dtype read_csv inferred; only the
# training ids are cast explicitly. Confirm whether the test ids need the cast too.
test_data = (
    pd.read_csv(f'{DATA_DIR}/task1_test.tsv', delimiter='\t')
    .rename(columns=id_column_names)
)

In [6]:
# Both id -> text dictionaries become one-column frames: the dictionary keys
# form the index and the values fill the single 'text' column.
frame_options = {'orient': 'index', 'columns': ['text']}

# Corpus documents
corpus_df = pd.DataFrame.from_dict(corpus_data, **frame_options)
# Queries
queries_df = pd.DataFrame.from_dict(queries_data, **frame_options)

In [7]:
# Plain Python lists of the raw texts, in the same row order as the frames
# (position i in `documents` is row i of `corpus_df`).
documents = corpus_df.loc[:, 'text'].to_list()
queries = queries_df.loc[:, 'text'].to_list()

In [8]:
from transformers import AutoModel, AutoTokenizer

# Checkpoint to load; swap the name here to try a different BERT variant.
model_name = "bert-base-uncased"

# The tokenizer and the encoder are loaded from the same checkpoint name so
# that the vocabulary matches the model weights.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)


In [10]:
# Tokenize every document on its own (one tokenizer call per text), returning
# PyTorch tensors; over-long inputs are truncated by the tokenizer.
tokenize_document = partial(tokenizer, padding=True, truncation=True, return_tensors="pt")
document_tokens = list(map(tokenize_document, tqdm(documents)))

  0%|          | 0/1471406 [00:00<?, ?it/s]

In [18]:
# Embed every document by mean-pooling the encoder's last hidden state over
# the token axis (dim=1), giving one tensor per document.
# This is inference only, so run it under torch.no_grad(): otherwise autograd
# records a graph for every forward pass and each stored embedding keeps that
# graph alive, which wastes memory and time across the whole corpus.
with torch.no_grad():
    document_embeddings = [
        model(**doc).last_hidden_state.mean(dim=1)
        for doc in tqdm(document_tokens)
    ]

  0%|          | 0/1471406 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Persist the document embeddings to disk as a pickle file so they can be
# reloaded later without re-running the encoder.
import pickle

embeddings_path = f'{DATA_DIR}/document_bert_embeddings.pkl'
with open(embeddings_path, 'wb') as pickle_file:
    pickle.dump(document_embeddings, pickle_file)

In [None]:
# Take the first query text and tokenize it with the same options that were
# used for the documents.
query = queries[0]
query_tokens = tokenizer(
    query,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

In [None]:
# Embed the query the same way as the documents: mean-pool the last hidden
# state over the token axis (dim=1).
# Inference only, so disable autograd to avoid building and retaining a
# computation graph for the query embedding.
with torch.no_grad():
    query_embedding = model(**query_tokens).last_hidden_state.mean(dim=1)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Number of best-matching documents to display.
TOP_K = 10

# cosine_similarity needs 2-D array-likes. `document_embeddings` is a list of
# per-document tensors produced by mean(dim=1), so concatenate them along the
# first axis into a single matrix and hand plain NumPy arrays to scikit-learn.
# detach() makes the conversion work even if the tensors still track gradients.
query_matrix = query_embedding.detach().cpu().numpy()
document_matrix = torch.cat(list(document_embeddings), dim=0).detach().cpu().numpy()

similarities = cosine_similarity(query_matrix, document_matrix).flatten()
# Indices of the TOP_K highest similarities, best match first.
top_indices = similarities.argsort()[::-1][:TOP_K]

for index in top_indices:
    # Row `index` of corpus_df corresponds to embedding `index`; the row's
    # name is the frame's index label for that row.
    row = corpus_df.iloc[index]
    print(f'Document id: {row.name}')
    print(f'Document text: {row.text}')
    print(f'Similarity: {similarities[index]}')
    print('______________________________')
    print()