In [1]:
#%pip install -r requirements.txt

In [2]:
import pandas as pd
import json
import pickle
import helpers
from tqdm.notebook import tqdm
import numpy as np
from nltk.stem import PorterStemmer
import multiprocessing as mp
import os 
import math
from functools import partial

DATA_DIR = 'data'


stemmer = PorterStemmer()
# Load the data from files
with open(f'{DATA_DIR}/corpus.jsonl', 'r') as f:
    corpus_data = {int(item['_id']): item['text'] for item in (json.loads(line) for line in f)}

with open(f'{DATA_DIR}/queries.jsonl', 'r') as f:
    queries_data = {int(item['_id']): item['text'] for item in (json.loads(line) for line in f)}

train_data = pd.read_csv(f'{DATA_DIR}/task1_train.tsv', delimiter='\t')
test_data = pd.read_csv(f'{DATA_DIR}/task1_test.tsv', delimiter='\t')

# Rename corpus-id to document_id and query-id to query_id in both train and test data
train_data = train_data.rename(columns={'corpus-id': 'document_id', 'query-id': 'query_id'})
test_data = test_data.rename(columns={'corpus-id': 'document_id', 'query-id': 'query_id'})
# Make sure that the document_id and query_id are int64
train_data['document_id'] = train_data['document_id'].astype('int64')
train_data['query_id'] = train_data['query_id'].astype('int64')

# Create a df from the corpus data
corpus_df = pd.DataFrame.from_dict(corpus_data, orient='index', columns=['text'])
# Create a df from the queries data
queries_df = pd.DataFrame.from_dict(queries_data, orient='index', columns=['text'])

# Check if documents.pkl exists:
if os.path.isfile(f'{DATA_DIR}/documents.pkl'):
    print('Loading tokenized documents from pickle file...')
    # load the tokenized documents from pickle file
    with open(f'{DATA_DIR}/documents.pkl', 'rb') as f:
        documents = pickle.load(f)
else:
    print('File not found. Tokenizing documents...')
    documents = corpus_df['text'].tolist()
    documents = [x.strip() for x in documents]
    # use multiprocessing to speed up the process
    pool = mp.Pool(mp.cpu_count())
    # pass both documents and stemmer as arguments to the tokenize function
    fn = partial(helpers.tokenize, stemmer=stemmer)   
    documents = list(tqdm(pool.imap(fn, documents), total=len(documents))) 
    # save the tokenized documents as pickle file
    with open(f'{DATA_DIR}/documents.pkl', 'wb') as f:
        pickle.dump(documents, f)

Loading tokenized documents from pickle file...


In [3]:
vocabulary = list(set([item for sublist in documents for item in sublist]))
vocabulary.sort()

In [4]:
# Compute the number of documents that contain each word
doc_freqs = {}
for doc in tqdm(documents):
    for word in set(doc):
        doc_freqs[word] = doc_freqs.get(word, 0) + 1

  0%|          | 0/1471406 [00:00<?, ?it/s]

In [5]:
# Compute the IDF for each word in the vocabulary
num_docs = len(documents)
#idfs = {word: math.log(num_docs / freq) for word, freq in doc_freqs.items()}
idfs = {word: math.log((num_docs - freq + 0.5)/(freq + 0.5)+1) for word, freq in doc_freqs.items()}

In [6]:
# create a vocabulary dictionary with the index of each word in the vocabulary
vocabulary_dict = {word: i for i, word in tqdm(enumerate(vocabulary))}

0it [00:00, ?it/s]

In [7]:
# find average document length
avg_doc_length = sum([len(doc) for doc in documents]) / len(documents)

In [8]:
# now get the first row of the query matrix
query = queries_df['text'].iloc[0]

In [9]:
# tokenize the query
query = helpers.tokenize(query, stemmer)

In [10]:
scores = []
for doc in tqdm(documents, desc="Scoring documents..."):
    score = helpers.bm25(doc, query, idfs, avg_doc_length, k1=1, b=0.75)
    scores.append(score)

Scoring documents...:   0%|          | 0/1471406 [00:00<?, ?it/s]

In [11]:
# get the documents with the highest scores (top 10)
top_10 = np.argsort(scores)[::-1][:10]

In [12]:
for index in top_10:
    print(f'Document ID: {corpus_df.index.values[index]}')
    print(f'Text: "{corpus_df.iloc[index].text}"')
    print(f'Score: {scores[index]}')
    print()

Document ID: 7306165
Text: "He is honest-but smart as hell.. Meanwhile, President Truman was told of the successful test of the Manhattan Project (atomic bomb) in Alamogordo, New Mexico on Jul 16, 1945. Diary of President Truman of Jul 18, 1945 shows Discussed Manhattan (it is a success).e is honest-but smart as hell.. Meanwhile, President Truman was told of the successful test of the Manhattan Project (atomic bomb) in Alamogordo, New Mexico on Jul 16, 1945. Diary of President Truman of Jul 18, 1945 shows Discussed Manhattan (it is a success)."
Score: 22.80581471839291

Document ID: 0
Text: "The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated."
Score: 19.472276735865158

Document ID: 2036644
Text: "Manhattan Project. 

In [13]:
# Since finding all the scores for each query takes a long time,
# we will use matrix multiplication to find the scores for all the queries at once

k1 = 1
b = 0.75

# first create the document matrix where each row is a document
# and each column is a word
# the value of the cell is idfs[term] * (counts[term] * (k1 + 1) / (counts[term] + k1 * (1 - b + b * len(doc) / avg_doc_length)))
from scipy.sparse import lil_matrix

# check if doc_matrix.pkl exists
if os.path.isfile(f'{DATA_DIR}/doc_matrix.pkl'):
    print('Loading document matrix from pickle file...')
    # load the document matrix from pickle file
    with open(f'{DATA_DIR}/doc_matrix.pkl', 'rb') as f:
        doc_matrix = pickle.load(f)
else:
    # Compute the doc matrix
    doc_matrix = lil_matrix((len(documents), len(vocabulary))) # We use lil_matrix since it is efficient in incremental assignments
    for doc_id, doc in tqdm(enumerate(documents), desc="Computing document matrix"):
        counts, max_count = helpers.count_terms(doc)
        for term, count in counts.items():
            if term in vocabulary_dict:
                term_id = vocabulary_dict[term]
                doc_matrix[doc_id, term_id] = idfs[term] * (counts[term] * (k1 + 1) / (counts[term] + k1 * (1 - b + b * len(doc) / avg_doc_length)))

    # Save the doc matrix as a pickle file
    with open(f'{DATA_DIR}/doc_matrix.pkl', 'wb') as f:
        pickle.dump(doc_matrix, f)

Loading document matrix from pickle file...


In [14]:
doc_matrix

<1471406x1130369 sparse matrix of type '<class 'numpy.float64'>'
	with 40408661 stored elements in List of Lists format>

In [15]:
doc_matrix = doc_matrix.tocsr()

In [16]:
test_data

Unnamed: 0,id,query_id
0,0,300674
1,1,125705
2,2,94798
3,3,9083
4,4,174249
...,...,...
7432,7432,147073
7433,7433,243761
7434,7434,162662
7435,7435,247194


In [17]:
# Now create the query matrix where each row is a query
# and each column is a word
# the value of the cell is 1 if the word is in the query, 0 otherwise
query_matrix = lil_matrix((len(test_data), len(vocabulary)))
# iterate row by row
for index, row in tqdm(test_data.iterrows(), desc="Computing query matrix"):
    # get the query from query_id
    query_id = row['query_id']
    query = queries_df.loc[query_id]
    query = helpers.tokenize(query['text'], stemmer)
    for term in query:
        if term in vocabulary_dict:
            term_id = vocabulary_dict[term]
            query_matrix[index, term_id] = 1

query_matrix = query_matrix.tocsr()

Computing query matrix: 0it [00:00, ?it/s]

In [18]:
# Now we can compute the scores for all the queries at once
scores = query_matrix.dot(doc_matrix.T)
# This gives us a matrix of shape (num_queries, num_documents)
# Each cell contains the score of a document for a query

In [19]:
from scipy.sparse import csr_matrix

# Find the indices of the top 3 elements along each row
k = 10
top_k_indices = np.zeros((scores.shape[0], k), dtype=int)
for i in range(scores.shape[0]):
    row = scores.getrow(i).toarray()[0]
    top_k_indices[i] = np.argpartition(row, -k)[-k:]

In [41]:
# Get the document ids of the top k documents for each query
top_k_doc_ids = []
top_k_scores = []
for index, row in enumerate(top_k_indices):
    top_k_doc_ids.append([corpus_df.index.values[i] for i in row])
    top_k_scores.append([scores[index, i] for i in row])

In [43]:
top_k_scores[0]

[30.41874710329342,
 31.99697828271954,
 31.726511480146954,
 36.30561337354201,
 50.13671280047856,
 52.375477845441964,
 36.905421177995265,
 43.60453799353136,
 50.11667425529357,
 46.69823514899414]

In [40]:
for index, row in test_data.iterrows():
    print(f'QueryID: {row["query_id"]}:')
    print(f'Query: "{queries_df.loc[row["query_id"]]["text"]}"')
    print(f'Top {k} documents:')
    for doc_id in top_k_doc_ids[index]:
        print(f'\t Document ID: {doc_id}')
        print(f'\t Text: "{corpus_df.loc[doc_id]["text"]}"')
        # find row index of doc_id
        row_index = corpus_df.index.get_loc(doc_id)
        print(f'\t Score: {scores[index, row_index]}')
        print()
    break

QueryID: 300674:
Query: "how many years did william bradford serve as governor of plymouth colony?"
Top 10 documents:
	 Document ID: 4501973
	 Text: "(The Mayflower in Plymouth harbour by William Halsall, 1882). On this day in 1620, William Bradford and the Mayflower Pilgirms landed at Plymouth Rock in Plymouth, Massachusetts. The Mayflower transported the first English Pilgrims to America, with 102 passengers."
	 Score: 30.41874710329342

	 Document ID: 3872942
	 Text: "Plymouth Colony was founded by a group of people English separatists who later came to be known as the Pilgrims. The core group (roughly 40% of the adults and 56% of the family groupings) was part of a Congregationalist congregation led by William Bradford."
	 Score: 31.99697828271954

	 Document ID: 3305011
	 Text: "In addition, they settled as families for the most part, unique in Atlantic coast settlement at this point. Here we read from the journal of the colony's longtime governor, William Bradford, of the colonis