In [1]:
import ast
import json
import math
import multiprocessing as mp
import os
import pickle
from functools import partial

import numpy as np
import pandas as pd
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, remove_stopwords, strip_short, stem_text
from nltk import download as nltk_download
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from tqdm.notebook import tqdm

import helpers

# Fetch the NLTK resources this notebook imports helpers for.
for nltk_package in ('punkt', 'stopwords'):
    nltk_download(nltk_package)

# Folder that holds every input and output file of this notebook.
DATA_DIR = 'data'


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aybarsyazici/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aybarsyazici/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Only the commented-out multiprocessing path further down refers to this stemmer.
stemmer = PorterStemmer()
# Load the corpus: one JSON object per line, mapped as int(_id) -> text.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's locale default used by open().
with open(f'{DATA_DIR}/corpus.jsonl', 'r', encoding='utf-8') as f:
    corpus_data = {int(item['_id']): item['text'] for item in (json.loads(line) for line in f)}


#train_data = pd.read_csv(f'{DATA_DIR}/task1_train.tsv', delimiter='\t')

# Rename corpus-id to document_id and query-id to query_id in both train and test data
#train_data = train_data.rename(columns={'corpus-id': 'document_id', 'query-id': 'query_id'})
# Make sure that the document_id and query_id are int64
#train_data['document_id'] = train_data['document_id'].astype('int64')
#train_data['query_id'] = train_data['query_id'].astype('int64')

In [3]:
# Load the queries: one JSON object per line, mapped as int(_id) -> text.
# UTF-8 is requested explicitly instead of relying on the locale default.
with open(f'{DATA_DIR}/queries.jsonl', 'r', encoding='utf-8') as f:
    queries_data = {int(item['_id']): item['text'] for item in (json.loads(line) for line in f)}

# Task-1 test set; the dashed column names are renamed to identifier-friendly ones.
test_data = pd.read_csv(f'{DATA_DIR}/task1_test.tsv', delimiter='\t')
test_data = test_data.rename(columns={'corpus-id': 'document_id', 'query-id': 'query_id'})

# Filter chain applied, in order, by gensim's preprocess_string.
CUSTOM_FILTERS = [lambda x: x.lower(), strip_tags, strip_punctuation, lambda x: strip_short(s=x,minsize=1), strip_multiple_whitespaces, stem_text]

In [4]:
# Preview the task-1 test frame (pandas renders it truncated).
test_data

Unnamed: 0,id,query_id
0,0,300674
1,1,125705
2,2,94798
3,3,9083
4,4,174249
...,...,...
7432,7432,147073
7433,7433,243761
7434,7434,162662
7435,7435,247194


In [5]:
# Text-normalisation chain handed to gensim's preprocess_string; the filters
# run in the order listed.
CUSTOM_FILTERS = [
    lambda x: x.lower(),
    strip_tags,
    strip_punctuation,
    lambda x: strip_short(s=x,minsize=1),
    strip_multiple_whitespaces,
    stem_text,
]

In [6]:
# Turn both id -> text dictionaries into single-column ('text') frames whose
# index is the id.
corpus_df, queries_df = (
    pd.DataFrame.from_dict(id_to_text, orient='index', columns=['text'])
    for id_to_text in (corpus_data, queries_data)
)

In [7]:
# Load the preprocessed corpus from its on-disk cache, or build it and write
# the cache. In BOTH branches `preprocessed_documents` ends up as a list of
# newline-terminated strings of space-separated tokens, one per document.
# Previously the cache-miss branch left a list of token lists, which made the
# later `doc.strip()` calls fail on the very first run of the notebook.
PREPROCESSED_DOCS_PATH = f'{DATA_DIR}/preprocessed_documents_3.txt'

if os.path.isfile(PREPROCESSED_DOCS_PATH):
    # Cache hit: one document per line. UTF-8 is pinned so the text does not
    # depend on the platform's locale default.
    with open(PREPROCESSED_DOCS_PATH, 'r', encoding='utf-8') as f:
        preprocessed_documents = f.readlines()
else:
    documents = corpus_df.text.values.tolist()
    print(documents[:5])
    # Join each token list right away so the in-memory form matches what
    # readlines() returns on a cache hit.
    preprocessed_documents = [
        ' '.join(preprocess_string(document, CUSTOM_FILTERS)) + '\n'
        for document in tqdm(documents, desc='Preprocessing documents', total=len(documents))
    ]
    # use multiprocessing to speed up the process
    # pool = mp.Pool(mp.cpu_count())
    # pass both documents and stemmer as arguments to the tokenize function
    # fn = partial(helpers.tokenize, stemmer=stemmer)   
    # preprocessed_documents = list(tqdm(pool.imap(fn, documents), total=len(documents))) 

    # write preprocessed documents to a txt file
    with open(PREPROCESSED_DOCS_PATH, 'w', encoding='utf-8') as f:
        f.writelines(preprocessed_documents)

In [8]:
# Replacement tokens for query words that have no entry in `vector_dict`:
# `aggregate_query` looks a missing word up here and uses whichever of the
# listed replacements do have a vector.
synonyms = {
    'trivora': ['ethinylestradiol', 'levonorgestrel'],
    # '\u00c3\u00a1' is what the UTF-8 bytes of 'á' look like when decoded as
    # Latin-1; a later cell shows this lower-cased form is present in vector_dict.
    'rovna': ['rovn\u00c3\u00a1'.lower()],
    'carlomagno': ['charlemagn']
}

In [9]:
import fasttext
# epoch parameter is by default set to 5
# print(f'Using {mp.cpu_count()} cores')
# model = fasttext.train_unsupervised(f'{DATA_DIR}/preprocessed_documents_3.txt', model = 'skipgram', thread=mp.cpu_count(), verbose=2, ws=5, dim=450, epoch=10, minCount=1)

In [10]:
# save model
# model.save_model(f'{DATA_DIR}/fasttext_model_skipgram_ws5_3.bin')

In [11]:
# Load the fastText model from disk. The training and saving calls in the
# preceding cells are commented out, so this file has to exist already.
model = fasttext.load_model(f'{DATA_DIR}/fasttext_model_skipgram_ws5_3.bin')



In [12]:
# The model's vocabulary and, in the same order, one embedding per word.
vocabulary = model.words
word_embeddings = np.array([model[token] for token in vocabulary])
# word -> embedding lookup; each value is the matching row of word_embeddings.
vector_dict = {token: vector for token, vector in zip(vocabulary, word_embeddings)}

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Document frequency: for every token, the number of documents in which it
# occurs at least once (repeats inside a document are collapsed by the set).
doc_freqs = {}
for doc in tqdm(preprocessed_documents):
    distinct_tokens = set(fasttext.tokenize(doc.strip()))
    for token in distinct_tokens:
        if token in doc_freqs:
            doc_freqs[token] += 1
        else:
            doc_freqs[token] = 1

  0%|          | 0/1471406 [00:00<?, ?it/s]

In [14]:
num_docs = len(preprocessed_documents)
# Smoothed inverse document frequency per token:
#     log((N - df + 0.5) / (df + 0.5) + 1)
# The plain log(N / df) form was the earlier, now disabled, variant.
idfs = {}
for word, freq in doc_freqs.items():
    odds = (num_docs - freq + 0.5) / (freq + 0.5)
    idfs[word] = math.log(odds + 1)

In [15]:
from collections import Counter , defaultdict

def aggregate_vector_list(vlist, aggfunc, weights=None, log = False):
    """Collapse a list of equally sized vectors into a single vector.

    Parameters
    ----------
    vlist : sequence of vectors
        The vectors to combine, one per row.
    aggfunc : str
        'idf' or 'inverse_count' -> average weighted by `weights`
        (unweighted when `weights` is None); 'mean' -> plain mean;
        any other value -> a zero vector of the same width.
    weights : sequence of numbers, optional
        One weight per vector; only used by the weighted strategies.
    log : bool
        When true, print `weights` before aggregating.

    Returns
    -------
    numpy.ndarray
        The aggregated vector.

    Raises
    ------
    ZeroDivisionError
        Raised by numpy.average when the supplied weights sum to zero.
    """
    if log:
        print(weights)
    # 'inverse_count' used to fall through to the zero-vector fallback even
    # though callers compute weights for it; both weighted strategies now
    # share the same weighted average.
    if aggfunc in ('idf', 'inverse_count'):
        return np.average(vlist, axis=0, weights=weights)
    if aggfunc == 'mean':
        return np.array(vlist).mean(axis=0)
    # Unknown strategy: keep the historical behaviour of returning zeros.
    return np.zeros(np.array(vlist).shape[1])

In [16]:
# Aggregation strategies to precompute document vectors for.
possible_aggfuncs = ["idf"
                     #"mean", 
                     #"inverse_count"
                    ]

# strategy name -> matrix with one row per document, in the order of
# preprocessed_documents.
aggregated_doc_vectors = {}

# Aggregate vectors of documents beforehand
for aggfunc in tqdm(possible_aggfuncs):
    doc_matrix = np.zeros((len(preprocessed_documents), word_embeddings.shape[1]))
    aggregated_doc_vectors[aggfunc] = doc_matrix
    for index, doc in tqdm(enumerate(preprocessed_documents), total=len(preprocessed_documents)):
        tokenized_doc = fasttext.tokenize(doc.strip())
        known_tokens = [token for token in tokenized_doc if token in vector_dict]
        if not known_tokens:
            # No token has a vector: the row keeps its initial zeros.
            continue
        vlist = [vector_dict[token] for token in known_tokens]
        # Reset on every document. Before, `weights` was never assigned for
        # 'mean', which raised NameError (or silently reused stale weights
        # from a previous strategy).
        weights = None
        if aggfunc == 'idf':
            weights = [idfs.get(token, 0) for token in known_tokens]
        elif aggfunc == 'inverse_count':
            counts = Counter(tokenized_doc)
            weights = [1 / counts[token] for token in known_tokens]
        doc_matrix[index] = aggregate_vector_list(vlist, aggfunc, weights)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1471406 [00:00<?, ?it/s]

In [17]:
# Confirm that two of the synonym replacement tokens have an embedding by
# printing the length of each vector.
for probe_token in ('rovn\u00c3\u00a1'.lower(), 'charlemagn'):
    print(len(vector_dict[probe_token]))

450
450


In [18]:
from sklearn.metrics.pairwise import cosine_similarity
def aggregate_query(query, aggfunc, tokenized = False):
    """Turn a query into a single embedding vector.

    Parameters
    ----------
    query : str or list of str
        Raw (already preprocessed) query text, or its token list when
        `tokenized` is true.
    aggfunc : str
        Aggregation strategy forwarded to `aggregate_vector_list`.
        For 'idf' and 'inverse_count', tokens missing from `vector_dict` are
        replaced through `synonyms`; other strategies simply drop them.
    tokenized : bool
        Whether `query` is already a token list.

    Returns
    -------
    numpy.ndarray
        The aggregated query vector. When no token (or replacement) has an
        embedding, a zero vector with the embedding width is returned instead
        of failing inside numpy.
    """
    tokens = query if tokenized else fasttext.tokenize(query.strip())

    if aggfunc not in ('idf', 'inverse_count'):
        vectors = [vector_dict[token] for token in tokens if token in vector_dict]
        if not vectors:
            return np.zeros(word_embeddings.shape[1])
        return aggregate_vector_list(vectors, aggfunc)

    counts = Counter(tokens)
    vectors = []
    weights = []
    for token in tokens:
        if token in vector_dict:
            candidates = [token]
        else:
            replacement = synonyms.get(token, token)
            candidates = replacement if isinstance(replacement, list) else [replacement]
        for candidate in candidates:
            if candidate not in vector_dict:
                continue
            vectors.append(vector_dict[candidate])
            if aggfunc == 'idf':
                weights.append(idfs.get(candidate, 0))
            else:
                # Count the token as it appears in the query. A synonym
                # replacement never occurs in `tokens`, so counting the
                # replacement gave 0 and divided by zero.
                weights.append(1 / counts[token])

    if not vectors:
        return np.zeros(word_embeddings.shape[1])
    if not any(weights):
        # All weights are zero: fall back to an unweighted average rather
        # than letting numpy.average raise ZeroDivisionError.
        weights = None
    return aggregate_vector_list(vectors, aggfunc, weights)
    
def get_most_similar_documents(query_vector, aggfunc, k = 5):
    """Rank every document by cosine similarity to `query_vector`.

    Parameters
    ----------
    query_vector : numpy.ndarray
        Query embedding; reshaped to a single row before comparison.
    aggfunc : str
        Key selecting the document matrix in `aggregated_doc_vectors`.
    k : int
        Accepted but not used here; the full ranking is returned and callers
        slice it themselves.

    Returns
    -------
    numpy.ndarray
        Row positions of all documents, most similar first.
    """
    doc_matrix = aggregated_doc_vectors[aggfunc]
    similarities = cosine_similarity(query_vector.reshape(1, -1), doc_matrix)
    # argsort ranks ascending; take the single row and flip it to descending.
    ascending = np.argsort(similarities, axis=-1, kind='quicksort', order=None)
    return ascending[0][::-1]

def search_vec_embeddings(query, topk = 10, aggfunc = 'mean', log=True):
    """Return the row positions of the `topk` documents closest to `query`.

    Parameters
    ----------
    query : str
        Query text; it is tokenised by `aggregate_query`.
    topk : int
        Number of results to keep.
    aggfunc : str
        Aggregation strategy used for both the query and the document matrix.
    log : bool
        When true, print the corpus id and the text of every hit.

    Returns
    -------
    numpy.ndarray
        Positions (rows of `corpus_df`) of the best matches, best first.
    """
    query_vector = aggregate_query(query, aggfunc)
    indexes = get_most_similar_documents(query_vector, aggfunc)[:topk]
    if log:
        for index in indexes:
            hit = corpus_df.iloc[index]
            print(f'Document id: {hit.name}')
            # The text comes from corpus_df; the previously referenced
            # `original_documents` is not defined anywhere in this notebook,
            # so the default log=True raised NameError.
            print(hit['text'])
            print()
    return indexes

In [19]:
# Collect the 'text' column of queries_df as a plain list; the preprocessing
# itself is done in a later cell.

queries = queries_df['text'].tolist()

In [20]:
from gensim.parsing.preprocessing import STOPWORDS

# Stop-word set for the experiment below: gensim's STOPWORDS without 'call'
# and 'first', so those two words survive filtering.
my_stop_words = STOPWORDS.difference({'call', 'first'})

# Sample inputs for eyeballing the filter chains.
test = "what is the gateway"
test2 = "In recent years, new chapters became inevitable for the storied statue and its park that had long-served as a sort of gateway from South Scranton onto the massive, landmark Harrison Avenue Bridge. That is because the state Department of Transportation is replacing the old, crumbling bridge."
test3 = 'when were the first call'
test4 = '3/5 of 60'

# CUSTOM_FILTERS plus stop-word removal inserted just before stemming.
CUSTOM_FILTERS2 = [
    lambda x: x.lower(),
    strip_tags,
    strip_punctuation,
    lambda x: strip_short(s=x,minsize=1),
    strip_multiple_whitespaces,
    lambda x: remove_stopwords(x, stopwords=my_stop_words),
    stem_text,
]

# Print the token list each chain produces for its sample.
for sample, filters in ((test, CUSTOM_FILTERS), (test2, CUSTOM_FILTERS), (test3, CUSTOM_FILTERS2)):
    print(preprocess_string(sample, filters))

['what', 'is', 'the', 'gatewai']
['in', 'recent', 'year', 'new', 'chapter', 'becam', 'inevit', 'for', 'the', 'stori', 'statu', 'and', 'it', 'park', 'that', 'had', 'long', 'serv', 'as', 'a', 'sort', 'of', 'gatewai', 'from', 'south', 'scranton', 'onto', 'the', 'massiv', 'landmark', 'harrison', 'avenu', 'bridg', 'that', 'is', 'becaus', 'the', 'state', 'depart', 'of', 'transport', 'is', 'replac', 'the', 'old', 'crumbl', 'bridg']
['first', 'call']


In [21]:
# CUSTOM_FILTERS2 = [lambda x: x.lower(), strip_tags, strip_punctuation, lambda x: strip_short(s=x,minsize=1), strip_multiple_whitespaces]

# Tokenise every query with CUSTOM_FILTERS, the same chain used for the documents.
# NOTE(review): `queries` is rebound from a list of strings to a list of token
# lists, so this cell looks unsafe to run twice on the same `queries` — confirm.
queries = [preprocess_string(query, CUSTOM_FILTERS) for query in tqdm(queries, desc='Preprocessing queries', total=len(queries))]

Preprocessing queries:   0%|          | 0/509962 [00:00<?, ?it/s]

In [22]:
# Replace the 'text' column with the token lists; the later cells read it and
# pass the value to aggregate_query with tokenized=True.
queries_df['text'] = queries

In [23]:
# One idf-weighted embedding per task-1 test query, stored row by row.
task1_matrix = np.zeros((len(test_data), word_embeddings.shape[1]))

for index, row in tqdm(test_data.iterrows(), total=len(test_data)):
    # queries_df['text'] already holds token lists, hence tokenized=True.
    query_tokens = queries_df.loc[row['query_id']]['text']
    task1_matrix[index] = aggregate_query(query_tokens, 'idf', True)

  0%|          | 0/7437 [00:00<?, ?it/s]

In [24]:
from sklearn.neighbors import NearestNeighbors

# Exhaustive (brute-force) cosine-distance neighbour search over the
# idf-aggregated document vectors; each lookup returns 10 neighbours.
nn = NearestNeighbors(
    algorithm='brute',
    metric='cosine',
    n_neighbors=10,
)

nn.fit(aggregated_doc_vectors['idf'])

In [25]:
# Only the neighbour positions are needed. Asking for them directly avoids
# binding `_`, which IPython uses for the last cell output.
indices = nn.kneighbors(task1_matrix, return_distance=False)

In [None]:
# Keep the neighbour positions under a task-specific name. They are row
# positions in the fitted document matrix, not corpus ids.
task1_results = indices

In [None]:
# Persist the task-1 neighbour positions as a pickle.
task1_pickle_path = f'{DATA_DIR}/task1_results.pickle'
with open(task1_pickle_path, 'wb') as out_file:
    pickle.dump(task1_results, out_file)

In [None]:
# Load the task-2 test data. Unlike the task-1 frame, its columns are not
# renamed: later cells read 'query-id' and 'corpus-id' under those names.
test_data2 = pd.read_csv(f'{DATA_DIR}/task2_test.tsv', delimiter='\t')

In [None]:
# Spot check: fetch one document by its corpus id (index label).
corpus_df.loc[1036904]

text    Anti-Money Laundering (AML) Source Tool for Br...
Name: 1036904, dtype: object

In [None]:
# Spot check: fetch one document by its row position; the `Name` shown in the
# output is the corpus id stored at that position.
corpus_df.iloc[1460658]

text    Anti-Money Laundering (AML) Source Tool for Br...
Name: 1036904, dtype: object

In [None]:
# For every task-2 query, score each of its candidate documents by cosine
# similarity against the idf-aggregated document vectors.
task2_results = []
for index, row in tqdm(test_data2.iterrows(), total=len(test_data2)):
    query = queries_df.loc[row['query-id']]['text']
    query_vector = aggregate_query(query, 'idf', True)
    sim = cosine_similarity(query_vector.reshape(1, -1), aggregated_doc_vectors['idf'])
    # 'corpus-id' holds the candidate ids as text. It is parsed as a Python
    # literal; eval() would execute whatever expression the file contains.
    candidate_ids = ast.literal_eval(row['corpus-id'])
    # Map each corpus id to its row position to pick the matching score.
    scores = [sim[0][corpus_df.index.get_loc(corpus_id)] for corpus_id in candidate_ids]
    task2_results.append(scores)

  0%|          | 0/33 [00:00<?, ?it/s]

In [96]:
# Persist the task-2 score lists as a pickle.
task2_pickle_path = f'{DATA_DIR}/task2_results.pickle'
with open(task2_pickle_path, 'wb') as out_file:
    pickle.dump(task2_results, out_file)

In [179]:
# Write the submission CSV.
# HEADER: id,corpus-id,score
# task1 rows: <row id>,"[corpus-id1, corpus-id2, ...]" (top 10 corpus ids),-1
# task2 rows: <row id>,-1,"[score1, score2, ...]"
# The first column is a running counter that starts at 0 and continues across
# both tasks.
# NOTE(review): the earlier comment called this column "query-id", but the
# code has always written the running counter — confirm which is expected.

with open(f'{DATA_DIR}/word2vec_submission.csv', 'w') as f:
    f.write('id,corpus-id,score\n')
    # Named `row_id` so the builtin `id` is not shadowed for the rest of the
    # notebook session.
    row_id = 0

    for neighbour_positions in task1_results:
        # Translate row positions in corpus_df back into corpus ids.
        id_list = ', '.join(
            str(corpus_df.iloc[position].name) for position in neighbour_positions
        )
        f.write(f'{row_id},"[{id_list}]",-1\n')
        row_id += 1

    for scores in task2_results:
        score_list = ', '.join(str(score) for score in scores)
        f.write(f'{row_id},-1,"[{score_list}]"\n')
        row_id += 1