In [None]:
# Uncomment to install the dependencies into this kernel's environment:
# %pip install -r requirements.txt

In [10]:
import pandas as pd
import json
import helpers
from tqdm.notebook import tqdm
import numpy as np
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import multiprocessing as mp
import os 

DATA_DIR = 'data'

# Single stemmer instance; later cells pass it to helpers.tokenize.
stemmer = PorterStemmer()


def _load_id_to_text(path):
    """Read a JSON Lines file and return a dict mapping int(_id) -> text.

    Each line must be a JSON object with an '_id' field convertible to int
    and a 'text' field.
    """
    # Be explicit about UTF-8: the platform default encoding (e.g. cp1252 on
    # Windows) would otherwise mis-decode or reject non-ASCII text.
    with open(path, 'r', encoding='utf-8') as f:
        return {int(item['_id']): item['text'] for item in map(json.loads, f)}


# Load the data from files
corpus_data = _load_id_to_text(f'{DATA_DIR}/corpus.jsonl')
queries_data = _load_id_to_text(f'{DATA_DIR}/queries.jsonl')

train_data = pd.read_csv(f'{DATA_DIR}/task1_train.tsv', delimiter='\t')
test_data = pd.read_csv(f'{DATA_DIR}/task1_test.tsv', delimiter='\t')

# Rename corpus-id to document_id and query-id to query_id in both train and test data
id_column_names = {'corpus-id': 'document_id', 'query-id': 'query_id'}
train_data = train_data.rename(columns=id_column_names)
test_data = test_data.rename(columns=id_column_names)

# Make sure that the document_id and query_id are int64
# NOTE(review): only train_data is cast here; test_data keeps whatever dtypes
# read_csv inferred. Confirm whether the test ids need the same cast.
train_data['document_id'] = train_data['document_id'].astype('int64')
train_data['query_id'] = train_data['query_id'].astype('int64')

In [11]:
# Wrap both id -> text mappings in single-column ('text') DataFrames whose
# index holds the ids: first the corpus, then the queries.
corpus_df, queries_df = (
    pd.DataFrame.from_dict(id_to_text, orient='index', columns=['text'])
    for id_to_text in (corpus_data, queries_data)
)

In [12]:
# Tokenize the corpus once and cache the result in documents.pkl; later runs
# load the cache instead of re-tokenizing.
import pickle
from itertools import repeat

documents_path = f'{DATA_DIR}/documents.pkl'

if os.path.isfile(documents_path):
    print('Loading tokenized documents from pickle file...')
    # NOTE: pickle.load can execute arbitrary code. Only load a cache file
    # that this notebook wrote itself.
    with open(documents_path, 'rb') as f:
        documents = pickle.load(f)
else:
    print('File not found. Tokenizing documents...')
    texts = [text.strip() for text in corpus_df['text'].tolist()]

    # Pool.imap's third positional argument is `chunksize`, so the stemmer
    # cannot be passed that way. starmap unpacks each (text, stemmer) pair
    # into helpers.tokenize(text, stemmer), matching how it is called
    # elsewhere in this notebook. Work is submitted in batches so the
    # progress bar still advances while results keep their input order.
    batch_size = 10_000
    documents = []
    # The context manager shuts the worker processes down when done.
    with mp.Pool(mp.cpu_count()) as pool, tqdm(total=len(texts)) as progress:
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            documents.extend(pool.starmap(helpers.tokenize, zip(batch, repeat(stemmer))))
            progress.update(len(batch))

    # save the tokenized documents as pickle file
    with open(documents_path, 'wb') as f:
        pickle.dump(documents, f)

Loading tokenized documents from pickle file...


In [13]:
# Every distinct token across all documents, as a sorted list.
vocabulary = sorted({token for tokens in documents for token in tokens})

In [14]:
import math

# Document frequency: for each word, how many documents contain it at least
# once (set(doc) collapses repeats within a single document).
doc_freqs = {}
for doc in tqdm(documents):
    for word in set(doc):
        if word in doc_freqs:
            doc_freqs[word] += 1
        else:
            doc_freqs[word] = 1

  0%|          | 0/1471406 [00:00<?, ?it/s]

In [15]:
# Smoothed IDF per vocabulary word: log(N / (df + 1)).
# NOTE(review): the later cells visible in this notebook read `idfs`
# (unsmoothed), not this `idf` dict.
num_docs = len(documents)
idf = {
    word: math.log(num_docs / (doc_freqs.get(word, 0) + 1))
    for word in tqdm(vocabulary)
}

  0%|          | 0/1130369 [00:00<?, ?it/s]

In [16]:
# Unsmoothed IDF, log(N / df), for every word that has a document frequency.
num_docs = len(documents)
idfs = dict(
    (word, math.log(num_docs / freq))
    for word, freq in doc_freqs.items()
)

In [17]:
# Map each vocabulary word to its position in `vocabulary`; that position is
# used as the word's column index in the matrix cells below.
# tqdm wraps the list itself (not the enumerate object) so it can read the
# length and show a real progress bar instead of a bare iteration counter.
vocabulary_dict = {word: i for i, word in enumerate(tqdm(vocabulary))}

0it [00:00, ?it/s]

In [19]:
from scipy.sparse import lil_matrix

# Fill a sparse documents x vocabulary matrix. Despite the variable name, each
# stored value is a TF-IDF weight: (count / max count in the document) * idf.
term_freq_matrix = lil_matrix((len(documents), len(vocabulary)))
for i, doc in tqdm(enumerate(documents), total=len(documents), desc='Computing term frequency matrix'):
    counts, max_count = helpers.count_terms(doc)
    for term, count in counts.items():
        # Skip anything that has no column in the vocabulary.
        if term not in vocabulary_dict:
            continue
        term_id = vocabulary_dict[term]
        term_freq_matrix[i, term_id] = count/max_count * idfs[term]

Computing term frequency matrix:   0%|          | 0/1471406 [00:00<?, ?it/s]

In [20]:
# Convert the LIL matrix built above to CSR; the similarity cell below
# consumes this CSR copy.
term_freq_matrix_csr = term_freq_matrix.tocsr()

In [22]:
# Take the text of the first query in queries_df as the example query.
query = queries_df['text'].iloc[0]

In [26]:
# Tokenize the query with the same helper and stemmer used for the documents.
# NOTE: this rebinds `query` from the raw text to the output of
# helpers.tokenize, so re-running this cell without first re-running the cell
# that selects the query feeds already-tokenized input back into the helper.
query = helpers.tokenize(query, stemmer)

In [35]:
# compute the tf-idf of the query
def compute_query_vector(query,vocabulary_dict,idfs):
    """Build a dense TF-IDF vector for a tokenized query.

    Parameters
    ----------
    query : tokenized query, in whatever form helpers.count_terms accepts.
    vocabulary_dict : mapping of term -> column index. The vector has
        len(vocabulary_dict) entries, so indices are assumed to lie in
        0..len(vocabulary_dict)-1.
    idfs : mapping of term -> IDF weight; must contain every query term that
        is present in vocabulary_dict.

    Returns
    -------
    1-D numpy array where the entry for each known query term is
    (count / max_count) * idfs[term] and every other entry is 0.
    Terms missing from vocabulary_dict are ignored.
    """
    # Size the vector from the mapping that was passed in, rather than from a
    # notebook-level `vocabulary` global, so the function only depends on its
    # own arguments.
    query_vec = np.zeros(len(vocabulary_dict))
    # count_terms is unpacked as (per-term counts mapping, largest count).
    counts, max_count = helpers.count_terms(query)
    for term, count in counts.items():
        if term in vocabulary_dict:
            term_id = vocabulary_dict[term]
            query_vec[term_id] = count/max_count * idfs[term]
    return query_vec

# Build the TF-IDF vector for the tokenized example query.
query_vec = compute_query_vector(query, vocabulary_dict, idfs)

In [39]:
from sklearn.metrics.pairwise import cosine_similarity

# Score every document row against the query; the query is reshaped into a
# single-row 2-D array because cosine_similarity expects 2-D inputs.
cosine_similarities = cosine_similarity(term_freq_matrix_csr, query_vec.reshape(1, -1))

# argsort is ascending, so read its last ten entries back to front to get the
# row positions of the ten highest-scoring documents, best first.
top_docs = np.argsort(cosine_similarities, axis=0)[:-11:-1].flatten()

In [50]:
# Show id, text and score for each top-ranked row, followed by a blank line.
for index in top_docs:
    print(
        f'Document ID: {corpus_df.index.values[index]}',
        f'Text: "{corpus_df.iloc[index].text}"',
        f'Similarity: {cosine_similarities[index]}',
        '',
        sep='\n',
    )

Document ID: 3607205
Text: "Manhattan Project. 1  The Manhattan Project was a secret military project created in 1942 to produce the first US nuclear weapon. Fears that Nazi Germany would build and use a nuclear weapon during World War II triggered the start of the Manhattan Project, which was originally based in Manhattan, New York."
Similarity: [0.52951184]

Document ID: 7243450
Text: "The project was given its name due to the fact that at least 10 of the sites used for the research were located in Manhattan. Following is a timeline of the key events related to the development of the atomic bomb and the Manhattan Project. Manhattan Project Timeline"
Similarity: [0.51425655]

Document ID: 2036644
Text: "Manhattan Project. The Manhattan Project was a secret military project created in 1942 to produce the first US nuclear weapon. Fears that Nazi Germany would build and use a nuclear weapon during World War II triggered the start of the Manhattan Project, which was originally based in Ma

In [48]:
# Spot-check: display the corpus row at positional index 1214391.
corpus_df.iloc[1214391]

text    Manhattan Project. 1  The Manhattan Project wa...
Name: 3607205, dtype: object

In [49]:
# Spot-check: display the corpus row at positional index 872801.
corpus_df.iloc[872801]

text    This article is about the atomic bomb project....
Name: 2148554, dtype: object

In [None]:
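# Scratch note: array printout pasted from an earlier run (these appear to be
# row positions of top-ranked documents). Kept as a comment because the raw
# printout is not valid Python and would raise a SyntaxError on Run All.
# [1214391  590437 1116515 1231205  428535  146155 1021116 1248393  513144]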
