In [1]:
#%pip install -r requirements.txt

In [2]:
import pandas as pd
import json
import helpers
from tqdm.notebook import tqdm
import numpy as np
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import multiprocessing as mp
import os 
import math

DATA_DIR = 'data'


def _load_id_to_text(path):
    """Read a JSON Lines file and return a dict mapping int(`_id`) to `text`.

    The file is opened explicitly as UTF-8 so the result does not depend on
    the platform's default text encoding. Blank lines are skipped.
    """
    with open(path, 'r', encoding='utf-8') as f:
        records = (json.loads(line) for line in f if line.strip())
        return {int(item['_id']): item['text'] for item in records}


stemmer = PorterStemmer()

# Load the corpus and the queries as {id: text} dictionaries
corpus_data = _load_id_to_text(f'{DATA_DIR}/corpus.jsonl')
queries_data = _load_id_to_text(f'{DATA_DIR}/queries.jsonl')

train_data = pd.read_csv(f'{DATA_DIR}/task1_train.tsv', delimiter='\t')
test_data = pd.read_csv(f'{DATA_DIR}/task1_test.tsv', delimiter='\t')

# Rename corpus-id to document_id and query-id to query_id in both train and test data
ID_RENAMES = {'corpus-id': 'document_id', 'query-id': 'query_id'}
train_data = train_data.rename(columns=ID_RENAMES)
test_data = test_data.rename(columns=ID_RENAMES)

# Make sure that the ids are int64 so they match the integer keys of the corpus/queries
train_data['document_id'] = train_data['document_id'].astype('int64')
train_data['query_id'] = train_data['query_id'].astype('int64')
# query_id is the column later cells use to look test queries up by id,
# so give it the same treatment (guarded in case the test file has no such column).
if 'query_id' in test_data.columns:
    test_data['query_id'] = test_data['query_id'].astype('int64')

In [3]:
# Turn each {id: text} mapping into a one-column frame ('text') indexed by id:
# first the corpus, then the queries.
corpus_df, queries_df = (
    pd.DataFrame.from_dict(id_to_text, orient='index', columns=['text'])
    for id_to_text in (corpus_data, queries_data)
)

In [4]:
import pickle
from functools import partial

# Cache file holding the tokenized corpus, so tokenization only has to run once
DOCUMENTS_CACHE = f'{DATA_DIR}/documents.pkl'
# Number of documents handed to a worker at a time; the default of 1 is slow for long iterables
TOKENIZE_CHUNKSIZE = 1024

if os.path.isfile(DOCUMENTS_CACHE):
    print('Loading tokenized documents from pickle file...')
    # NOTE: unpickling can execute arbitrary code; only load a cache file this notebook wrote itself.
    with open(DOCUMENTS_CACHE, 'rb') as f:
        documents = pickle.load(f)
else:
    print('File not found. Tokenizing documents...')
    raw_texts = [text.strip() for text in corpus_df['text'].tolist()]
    # bind the stemmer so the workers only need to receive the document text
    fn = partial(helpers.tokenize, stemmer=stemmer)
    # The context manager shuts the worker processes down once the results are collected;
    # imap keeps the input order, so documents[i] still corresponds to corpus_df row i.
    with mp.Pool(mp.cpu_count()) as pool:
        documents = list(tqdm(pool.imap(fn, raw_texts, chunksize=TOKENIZE_CHUNKSIZE), total=len(raw_texts)))
    # save the tokenized documents as pickle file
    with open(DOCUMENTS_CACHE, 'wb') as f:
        pickle.dump(documents, f)

Loading tokenized documents from pickle file...


In [5]:
# Sorted list of the distinct tokens that occur anywhere in the tokenized documents
vocabulary = sorted({token for doc_tokens in documents for token in doc_tokens})

In [6]:
# Document frequency: for every term, the number of documents it appears in.
# Each document is reduced to a set first so a term counts once per document.
doc_freqs = {}
for doc_tokens in tqdm(documents):
    for term in set(doc_tokens):
        if term in doc_freqs:
            doc_freqs[term] += 1
        else:
            doc_freqs[term] = 1

  0%|          | 0/1471406 [00:00<?, ?it/s]

In [7]:
# (Disabled) Earlier IDF variant that adds 1 to the document frequency in the denominator;
# the active IDF computation is in the next cell.
# num_docs = len(documents)
# idf = {}
# for word in tqdm(vocabulary):
#     doc_freq = doc_freqs.get(word, 0)
#     idf[word] = math.log(num_docs / (doc_freq + 1))

In [8]:
# Inverse document frequency per term: log(total number of documents / documents containing the term)
num_docs = len(documents)
idfs = dict(
    (term, math.log(num_docs / doc_count))
    for term, doc_count in doc_freqs.items()
)

In [9]:
# Map each vocabulary term to its position in `vocabulary` (used as the matrix column index).
# enumerate() has no length, so pass `total` explicitly to give the progress bar a known end.
vocabulary_dict = {word: i for i, word in tqdm(enumerate(vocabulary), total=len(vocabulary))}

0it [00:00, ?it/s]

In [10]:
from scipy.sparse import lil_matrix

# Build the document-term matrix: one row per document, one column per vocabulary term.
# Despite its name, each stored entry is a TF-IDF weight:
#   (term count / highest term count in the document) * idf of the term.
# lil_matrix is used because the entries are assigned one at a time.
n_rows, n_cols = len(documents), len(vocabulary)
term_freq_matrix = lil_matrix((n_rows, n_cols))
progress = tqdm(enumerate(documents), total=n_rows, desc='Computing term frequency matrix')
for i, doc in progress:
    counts, max_count = helpers.count_terms(doc)
    for term, count in counts.items():
        term_id = vocabulary_dict.get(term)
        if term_id is None:
            # term has no column in the vocabulary
            continue
        term_freq_matrix[i, term_id] = (count / max_count) * idfs[term]

Computing term frequency matrix:   0%|          | 0/1471406 [00:00<?, ?it/s]

In [11]:
# lil_matrix is inefficient for row slicing and matrix multiplication, so convert the
# finished matrix to CSR before it is used in the similarity computations below.
term_freq_matrix_csr = term_freq_matrix.tocsr()
# Bare expression: displays the matrix summary (shape, dtype, number of stored elements)
term_freq_matrix_csr

<1471406x1130369 sparse matrix of type '<class 'numpy.float64'>'
	with 40408661 stored elements in Compressed Sparse Row format>

In [12]:
# Take the text of the first query in queries_df; the following cells use it as a worked example
query = queries_df['text'].iloc[0]

In [13]:
# Tokenize the first query with the same tokenizer/stemmer used for the documents.
# The text is read from queries_df again (instead of re-assigning `query` from itself)
# so that re-running this cell does not feed an already tokenized query back into the tokenizer.
query = helpers.tokenize(queries_df['text'].iloc[0], stemmer)

In [14]:
# Build the query's vector over the vocabulary using the project helper.
# NOTE(review): presumably TF-IDF weighted like the document rows; the helper's source is not visible here — confirm.
query_vec = helpers.compute_query_vector(query, vocabulary_dict, idfs)

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

# Score every document row against the single query vector (reshaped to one row)
query_row = query_vec.reshape(1, -1)
cosine_similarities = cosine_similarity(term_freq_matrix_csr, query_row)

# Sort the row indices by ascending similarity, reverse to get descending order,
# keep the first 10 and flatten them into a 1-D array of row positions
ascending_order = np.argsort(cosine_similarities, axis=0)
top_docs = ascending_order[::-1][:10].flatten()

In [16]:
# Sanity check: number of rows in the similarity result (one per document)
cosine_similarities.shape[0]

1471406

In [17]:
# Show id, text and score of each top-ranked document.
# cosine_similarities holds a single column here (one query), so element [index, 0] is read
# to print the score as a plain number rather than as a one-element array.
for index in top_docs:
    print(f'Document ID: {corpus_df.index.values[index]}')
    print(f'Text: "{corpus_df.iloc[index].text}"')
    print(f'Similarity: {cosine_similarities[index, 0]:.8f}')
    print()

Document ID: 3607205
Text: "Manhattan Project. 1  The Manhattan Project was a secret military project created in 1942 to produce the first US nuclear weapon. Fears that Nazi Germany would build and use a nuclear weapon during World War II triggered the start of the Manhattan Project, which was originally based in Manhattan, New York."
Similarity: [0.52951184]

Document ID: 7243450
Text: "The project was given its name due to the fact that at least 10 of the sites used for the research were located in Manhattan. Following is a timeline of the key events related to the development of the atomic bomb and the Manhattan Project. Manhattan Project Timeline"
Similarity: [0.51425655]

Document ID: 2036644
Text: "Manhattan Project. The Manhattan Project was a secret military project created in 1942 to produce the first US nuclear weapon. Fears that Nazi Germany would build and use a nuclear weapon during World War II triggered the start of the Manhattan Project, which was originally based in Ma

In [18]:
# Retrieve the top 10 documents for every query in the test data.
# `results` collects one array of document ids per test row, in test_data order.
results = []

# Iterate over the query_id column directly: iterrows() builds a Series per row and does not
# preserve dtypes, so integer ids can be upcast when the frame has mixed column types.
# Every row is processed (no early exit); each iteration scores the query against the
# whole document matrix.
for query_id in tqdm(test_data['query_id'], total=len(test_data), desc='Processing test data'):
    query = queries_df.loc[query_id]['text']
    query = helpers.tokenize(query, stemmer)
    query_vec = helpers.compute_query_vector(query, vocabulary_dict, idfs)
    cosine_similarities = cosine_similarity(term_freq_matrix_csr, query_vec.reshape(1,-1))
    # row positions of the 10 highest similarities, best first
    top_docs = np.argsort(cosine_similarities, axis=0)[::-1][:10]
    top_docs = top_docs.flatten()

    # translate row positions into document ids
    document_ids = corpus_df.index.values[top_docs]
    results.append(document_ids)



Processing test data:   0%|          | 0/7437 [00:00<?, ?it/s]

In [24]:
# Build a sparse matrix with one row per test query and one column per vocabulary term.
# Each entry uses the same weighting as the document matrix:
#   (term count / highest term count in the query) * idf of the term.
query_matrix = lil_matrix((len(test_data), len(vocabulary)))

# Enumerate the query ids so the row position (not the DataFrame index label) addresses the matrix
for row_pos, query_id in tqdm(enumerate(test_data['query_id']), desc="Computing query matrix", total=len(test_data)):
    # look the query text up by id and tokenize it like the documents
    query_text = queries_df.loc[query_id, 'text']
    query_terms = helpers.tokenize(query_text, stemmer)
    if len(query_terms) == 0:
        # nothing to weight; the row stays all zeros
        continue
    # per-query counts, so the weights do not depend on values left over from other cells
    counts, max_count = helpers.count_terms(query_terms)
    for term, count in counts.items():
        term_id = vocabulary_dict.get(term)
        if term_id is not None:
            # write into this query's row of the matrix
            query_matrix[row_pos, term_id] = count / max_count * idfs[term]


Computing query matrix:   0%|          | 0/7437 [00:00<?, ?it/s]

In [27]:
from sklearn.metrics.pairwise import cosine_similarity

# Score every test query against every document in a single call.
# NOTE(review): the result has one row per query and one column per document
# (7437 x 1471406 according to the outputs above); if it comes back dense that is
# roughly 11 billion values — confirm it fits in memory before running.
query_matrix_csr = query_matrix.tocsr()
cosine_similarities = cosine_similarity(query_matrix_csr, term_freq_matrix_csr)

In [29]:
# for each query, get the top 10 documents
# top_docs = np.argsort(cosine_similarities, axis=1)[:,::-1][:,:10]
# Doing it this way uses too much memory.