In [1]:
import pandas as pd
import json
import helpers
from tqdm.notebook import tqdm
import numpy as np
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import multiprocessing as mp
import os 
import math
from functools import partial

DATA_DIR = 'data'


stemmer = PorterStemmer()
# Load the data from files
with open(f'{DATA_DIR}/corpus.jsonl', 'r') as f:
    corpus_data = {int(item['_id']): item['text'] for item in (json.loads(line) for line in f)}

with open(f'{DATA_DIR}/queries.jsonl', 'r') as f:
    queries_data = {int(item['_id']): item['text'] for item in (json.loads(line) for line in f)}

train_data = pd.read_csv(f'{DATA_DIR}/task1_train.tsv', delimiter='\t')
test_data = pd.read_csv(f'{DATA_DIR}/task1_test.tsv', delimiter='\t')

# Rename corpus-id to document_id and query-id to query_id in both train and test data
train_data = train_data.rename(columns={'corpus-id': 'document_id', 'query-id': 'query_id'})
test_data = test_data.rename(columns={'corpus-id': 'document_id', 'query-id': 'query_id'})
# Make sure that the document_id and query_id are int64
train_data['document_id'] = train_data['document_id'].astype('int64')
train_data['query_id'] = train_data['query_id'].astype('int64')

In [2]:
# Create a df from the corpus data
corpus_df = pd.DataFrame.from_dict(corpus_data, orient='index', columns=['text'])
# Create a df from the queries data
queries_df = pd.DataFrame.from_dict(queries_data, orient='index', columns=['text'])

In [3]:
original_documents = corpus_df['text'].tolist()

In [4]:
# Check if a file for preprocessed documents exists
if os.path.isfile(f'{DATA_DIR}/preprocessed_documents.txt'):
    # if it exists, read the preprocessed documents from the file
    with open(f'{DATA_DIR}/preprocessed_documents.txt', 'r') as f:
        preprocessed_documents = f.readlines()
else:
    documents = corpus_df['text'].tolist()
    documents = [x.strip() for x in documents]
    # use multiprocessing to speed up the process
    pool = mp.Pool(mp.cpu_count())
    # pass both documents and stemmer as arguments to the tokenize function
    fn = partial(helpers.tokenize, stemmer=stemmer)   
    preprocessed_documents = list(tqdm(pool.imap(fn, documents), total=len(documents))) 

    # write preprocessed documents to a txt file
    with open(f'{DATA_DIR}/preprocessed_documents.txt', 'w') as f:
        for item in preprocessed_documents:
            f.write("%s\n" % item)

In [5]:
# Check if a file for preprocessed queries already exists
if os.path.exists(f'{DATA_DIR}/preprocessed_queries.txt'):
    # If it exists, load the preprocessed queries from the file
    with open(f'{DATA_DIR}/preprocessed_queries.txt', 'r') as f:
        preprocessed_queries = f.readlines()
else:
    # preprocess queries in the same way as documents
    queries = queries_df['text'].tolist()
    queries = [x.strip() for x in queries]
    # use multiprocessing to speed up the process
    pool = mp.Pool(mp.cpu_count())
    # pass both queries and stemmer as arguments to the tokenize function
    fn = partial(helpers.tokenize, stemmer=stemmer)
    preprocessed_queries = list(tqdm(pool.imap(fn, queries), total=len(queries)))

    # write preprocessed queries to a txt file
    with open(f'{DATA_DIR}/preprocessed_queries.txt', 'w') as f:
        for item in preprocessed_queries:
            f.write("%s\n" % item)

In [6]:
import fasttext
model = fasttext.train_unsupervised(f'{DATA_DIR}/preprocessed_documents.txt', model = 'skipgram')

Read 54M words
Number of words:  150831
Number of labels: 0
Progress: 100.0% words/sec/thread:   75585 lr:  0.000000 avg.loss:  0.759930 ETA:   0h 0m 0s 45.2% words/sec/thread:   74589 lr:  0.027423 avg.loss:  1.287198 ETA:   0h 3m42s


In [None]:
# save model
model.save_model(f'{DATA_DIR}/fasttext_model.bin')

In [7]:
vocabulary = model.words
word_embeddings = np.array([model[word] for word in vocabulary])

In [8]:
# Create a dictionary of vectors for easier search
vector_dict = dict(zip(vocabulary, word_embeddings))

def aggregate_vector_list(vlist, aggfunc):
    if aggfunc == 'max':
        return np.array(vlist).max(axis=0)
    elif aggfunc == 'min':
        return np.array(vlist).min(axis=0)
    elif aggfunc == 'mean':
        return np.array(vlist).mean(axis=0)
    else:
        return np.zeros(np.array(vlist).shape[1])

possible_aggfuncs = ["max", "min", "mean"]

aggregated_doc_vectors = {}

# Aggregate vectors of documents beforehand
for aggfunc in tqdm(possible_aggfuncs):
    aggregated_doc_vectors[aggfunc] = np.zeros((len(preprocessed_documents), word_embeddings.shape[1]))
    for index, doc in tqdm(enumerate(preprocessed_documents), total=len(preprocessed_documents)):
        vlist = [vector_dict[token] for token in fasttext.tokenize(doc) if token in vector_dict]
        if(len(vlist) < 1):
            continue 
        else:
            aggregated_doc_vectors[aggfunc][index] = aggregate_vector_list(vlist, aggfunc) 

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1471406 [00:00<?, ?it/s]

  0%|          | 0/1471406 [00:00<?, ?it/s]

  0%|          | 0/1471406 [00:00<?, ?it/s]

In [9]:
aggregated_doc_vectors['mean'].shape

(1471406, 100)

In [20]:
from sklearn.metrics.pairwise import cosine_similarity
def aggregate_query(query, aggfunc):
    tokens = fasttext.tokenize(query)
    if(len(tokens) == 1):
        if(tokens[0] in vocabulary):
            return vector_dict[tokens[0]]
    elif(len(tokens) > 1):
        vlist = []
        print('tokens are ', tokens)
        for token in tokens:
            if (token in vocabulary):
                vlist.append(vector_dict[token])
        return aggregate_vector_list(vlist, aggfunc)
    else:
        print("%s is not in the vocabulary." % (query))
    
def get_most_similar_documents(query_vector, aggfunc, k = 5):
    query_vector = query_vector.reshape(1, -1)
    # Calculate the similarity with each vector. 
    # Hint: Cosine similarity function takes a matrix as input so you do not need to loop through each document vector.
    sim = cosine_similarity(query_vector, aggregated_doc_vectors[aggfunc])
    print(sim.shape)
    # Rank the document vectors according to their cosine similarity with 
    indexes = np.argsort(sim, axis=-1, kind='quicksort', order=None) # This is sorted in ascending order
    indexes = indexes[0]
    indexes = indexes[::-1] # Convert to descending
    return indexes

def search_vec_embeddings(query, topk = 10, aggfunc = 'mean'):
    query_vector = aggregate_query(query, aggfunc)
    print("Query vector shape: ", query_vector.shape)
    indexes = get_most_similar_documents(query_vector, aggfunc)
    # Print the top k documents
    indexes = indexes[0:topk]
    for index in indexes:
        print(f'Document id: {corpus_df.iloc[index].name}')
        print(original_documents[index])
        print()

In [21]:
# get the first query
query = queries_df.iloc[0]['text']
query

')what was the immediate impact of the success of the manhattan project?'

In [22]:
query = helpers.tokenize(query, stemmer=stemmer)
query

'immedi impact success manhattan project'

In [24]:
search_vec_embeddings(query, aggfunc = 'max')

tokens are  ['immedi', 'impact', 'success', 'manhattan', 'project']
Query vector shape:  (100,)
(1, 1471406)
Document id: 33625
Maryann Johnson, a real-estate agent in Manhattan, starts her days with a motivational Instagram post. â

Document id: 3027617
Upper Manhattan is a large and fascinating place where the identity and characteristics of the neighborhoods change almost every few blocks. Harlem itself consists of several neighborhoods each with its own distinct culture and history.

Document id: 7397017
Over on Reddit, a user named movielover278 posted a picture he created that shows a size comparison between a Super Star Destroyer from the Star Wars franchise Over on Reddit, a user named movielover278 posted a picture he created that shows a size comparison between a Super Star Destroyer from the Star Wars franchise and Manhattan, New York.

Document id: 5079447
One of New York Cityâs most famous landmarks is the Wall Street Bull, located at Bowling Green Park in Lower Manhat