In [20]:
import pandas as pd
import json
import helpers
from tqdm.notebook import tqdm
import numpy as np
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import multiprocessing as mp
import os 
import math
from functools import partial
from nltk import download as nltk_download
from gensim.parsing.preprocessing import preprocess_string
import pickle

nltk_download('punkt')
nltk_download('stopwords')

DATA_DIR = 'data'


stemmer = PorterStemmer()
# Load the data from files
with open(f'{DATA_DIR}/corpus.jsonl', 'r') as f:
    corpus_data = {int(item['_id']): item['text'] for item in (json.loads(line) for line in f)}

with open(f'{DATA_DIR}/queries.jsonl', 'r') as f:
    queries_data = {int(item['_id']): item['text'] for item in (json.loads(line) for line in f)}

train_data = pd.read_csv(f'{DATA_DIR}/task1_train.tsv', delimiter='\t')
test_data = pd.read_csv(f'{DATA_DIR}/task1_test.tsv', delimiter='\t')

# Rename corpus-id to document_id and query-id to query_id in both train and test data
train_data = train_data.rename(columns={'corpus-id': 'document_id', 'query-id': 'query_id'})
test_data = test_data.rename(columns={'corpus-id': 'document_id', 'query-id': 'query_id'})
# Make sure that the document_id and query_id are int64
train_data['document_id'] = train_data['document_id'].astype('int64')
train_data['query_id'] = train_data['query_id'].astype('int64')

[nltk_data] Downloading package punkt to /home/aybars/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/aybars/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
# Create a df from the corpus data
corpus_df = pd.DataFrame.from_dict(corpus_data, orient='index', columns=['text'])
# Create a df from the queries data
queries_df = pd.DataFrame.from_dict(queries_data, orient='index', columns=['text'])

In [3]:
original_documents = corpus_df['text'].tolist()

In [4]:
# Check if a file for preprocessed documents exists
if os.path.isfile(f'{DATA_DIR}/preprocessed_documents.txt'):
    # if it exists, read the preprocessed documents from the file
    with open(f'{DATA_DIR}/preprocessed_documents.txt', 'r') as f:
        preprocessed_documents = f.readlines()
else:
    documents = corpus_df.text.values.tolist()
    print(documents[:5])
    preprocessed_documents = [preprocess_string(document) for document in tqdm(documents, desc='Preprocessing documents', total=len(documents))]
    # use multiprocessing to speed up the process
    # pool = mp.Pool(mp.cpu_count())
    # pass both documents and stemmer as arguments to the tokenize function
    # fn = partial(helpers.tokenize, stemmer=stemmer)   
    # preprocessed_documents = list(tqdm(pool.imap(fn, documents), total=len(documents))) 

    # write preprocessed documents to a txt file
    with open(f'{DATA_DIR}/preprocessed_documents.txt', 'w') as f:
        # join the array of tokens to a string and write it to the file
        f.writelines([' '.join(document) + '\n' for document in preprocessed_documents])

In [5]:
print(preprocessed_documents[0].strip())

invent cotton gin cotton americaâ lead crop cotton king america produc pound cotton product increas pound


In [6]:
import fasttext
# epoch parameter is by default set to 5
print(f'Using {mp.cpu_count()} cores')
model = fasttext.train_unsupervised(f'{DATA_DIR}/preprocessed_documents.txt', model = 'skipgram', thread=mp.cpu_count(), verbose=2, ws=5, dim=450, epoch=10, minCount=1)

Using 18 cores


In [7]:
# save model
model.save_model(f'{DATA_DIR}/fasttext_model_skipgram_ws5.bin')

In [8]:
# load the model
# model = fasttext.load_model(f'{DATA_DIR}/fasttext_model_cbow_ws10.bin')

In [9]:
vocabulary = model.words
word_embeddings = np.array([model[word] for word in vocabulary])
# Create a dictionary of vectors for easier search
vector_dict = dict(zip(vocabulary, word_embeddings))

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Compute the number of documents that contain each word
doc_freqs = {}
for doc in tqdm(preprocessed_documents):
    for word in set(fasttext.tokenize(doc.strip())):
        doc_freqs[word] = doc_freqs.get(word, 0) + 1

  0%|          | 0/1471406 [00:00<?, ?it/s]

In [11]:
num_docs = len(preprocessed_documents)
# idfs = {word: math.log(num_docs / freq) for word, freq in doc_freqs.items()}
idfs = {word: math.log((num_docs - freq + 0.5)/(freq + 0.5)+1) for word, freq in doc_freqs.items()}

In [14]:
from collections import Counter , defaultdict

def aggregate_vector_list(vlist, aggfunc, weights=None, log = False):
    if log: 
        print(weights)
    if aggfunc == 'idf':
        return np.average(vlist, axis=0, weights=weights)
    elif aggfunc == 'mean':
        return np.array(vlist).mean(axis=0)
    else:
        return np.zeros(np.array(vlist).shape[1])

In [15]:
possible_aggfuncs = ["idf", "mean", "inverse_count"]

aggregated_doc_vectors = {}

# Aggregate vectors of documents beforehand
for aggfunc in tqdm(possible_aggfuncs):
    aggregated_doc_vectors[aggfunc] = np.zeros((len(preprocessed_documents), word_embeddings.shape[1]))
    for index, doc in tqdm(enumerate(preprocessed_documents), total=len(preprocessed_documents)):
        tokenized_doc = fasttext.tokenize(doc.strip())
        vlist = [vector_dict[token] for token in tokenized_doc if token in vector_dict]
        if aggfunc == 'idf':
            weights = [idfs.get(word, 0) for word in tokenized_doc if word in vector_dict]
        elif aggfunc == 'inverse_count':
            counts = Counter(tokenized_doc)
            weights = [1 / counts[word] for word in tokenized_doc if word in vector_dict]
        if(len(vlist) < 1):
            continue 
        else:
            aggregated_doc_vectors[aggfunc][index] = aggregate_vector_list(vlist, aggfunc, weights) 

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1471406 [00:00<?, ?it/s]

  0%|          | 0/1471406 [00:00<?, ?it/s]

  0%|          | 0/1471406 [00:00<?, ?it/s]

In [16]:
from sklearn.metrics.pairwise import cosine_similarity
def aggregate_query(query, aggfunc, tokenized = False):
    if not tokenized:
        tokens = fasttext.tokenize(query.strip())
    else:
        tokens = query
    if aggfunc == 'idf':
        weights = [idfs.get(word, 0) for word in tokens if word in vector_dict]
        if len(tokens) == 1:
            print('got len 1')
            for word in tokens:
                print(word, idfs.get(word, 0), word in vector_dict)
        print('in aggregate', tokens, weights)
        return aggregate_vector_list([vector_dict[token] for token in tokens if token in vector_dict], aggfunc, weights)
    elif aggfunc == 'inverse_count':
        counts = Counter(tokens)
        weights = [1 / counts[word] for word in tokens if word in vector_dict]
        return aggregate_vector_list([vector_dict[token] for token in tokens if token in vector_dict], aggfunc, weights)
    else:
        return aggregate_vector_list([vector_dict[token] for token in tokens if token in vector_dict], aggfunc)
    
def get_most_similar_documents(query_vector, aggfunc, k = 5):
    query_vector = query_vector.reshape(1, -1)
    # Calculate the similarity with each vector. 
    # Hint: Cosine similarity function takes a matrix as input so you do not need to loop through each document vector.
    sim = cosine_similarity(query_vector, aggregated_doc_vectors[aggfunc])
    # Rank the document vectors according to their cosine similarity with 
    indexes = np.argsort(sim, axis=-1, kind='quicksort', order=None) # This is sorted in ascending order
    indexes = indexes[0]
    indexes = indexes[::-1] # Convert to descending
    return indexes

def search_vec_embeddings(query, topk = 10, aggfunc = 'mean', log=True):
    query_vector = aggregate_query(query, aggfunc)
    #print("Query vector shape: ", query_vector.shape)
    indexes = get_most_similar_documents(query_vector, aggfunc)
    # Print the top k documents
    indexes = indexes[0:topk]
    if log:
        for index in indexes:
            print(f'Document id: {corpus_df.iloc[index].name}')
            print(original_documents[index])
            print()
    return indexes

In [23]:
# preprocess all queries in the queries_df
queries_df['text'] = queries_df['text'].apply(preprocess_string)

# write preprocessed queries to a pickle file
with open(f'{DATA_DIR}/preprocessed_queries.pickle', 'wb') as f:
    pickle.dump(queries_df, f)

In [24]:
queries_df.head()

Unnamed: 0,text
1185869,"[immedi, impact, success, manhattan, project]"
1185868,"[justic, design, repair, harm, victim, commun,..."
597651,"[color, amber, urin]"
403613,"[autoimmun, hepat, bile, acid, synthesi, disord]"
1183785,"[elegxo, mean]"


In [27]:
# iterate over all the querys in test data

task1_matrix = np.zeros((len(test_data), word_embeddings.shape[1]))

for index, row in tqdm(test_data.iterrows(), total=len(test_data)):
    query = queries_df.loc[row['query_id']]['text']
    print('got query', query, index)
    query_vector = aggregate_query(query, 'idf', True)
    print(query, len(query_vector))
    task1_matrix[index] = query_vector

  0%|          | 0/7437 [00:00<?, ?it/s]

got query ['year', 'william', 'bradford', 'serv', 'governor', 'plymouth', 'coloni'] 0
in aggregate ['year', 'william', 'bradford', 'serv', 'governor', 'plymouth', 'coloni'] [2.429540871252117, 5.494991050543403, 8.915991059501739, 4.110663543402782, 6.446176629431923, 8.229193106620452, 5.9551647703126305]
['year', 'william', 'bradford', 'serv', 'governor', 'plymouth', 'coloni'] 300
got query ['defin', 'prevent'] 1
in aggregate ['defin', 'prevent'] [4.330068211973239, 4.366066003435887]
['defin', 'prevent'] 300
got query ['color', 'overlai', 'photoshop'] 2
in aggregate ['color', 'overlai', 'photoshop'] [4.197153185999882, 7.972232932926972, 8.438107168830696]
['color', 'overlai', 'photoshop'] 300
got query ['consid', 'father', 'modern', 'medicin'] 3
in aggregate ['consid', 'father', 'modern', 'medicin'] [3.9722712301931065, 5.292561934501012, 5.051723310906431, 4.790532230131762]
['consid', 'father', 'modern', 'medicin'] 300
got query ['xpress', 'bet', 'charg', 'deposit', 'monei', 'acc

ZeroDivisionError: Weights sum to zero, can't be normalized