In [1]:
# Inspired by: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/Any2Vec_Filebased.ipynb

In [2]:
import pandas as pd
import json
import helpers
from tqdm.notebook import tqdm
import numpy as np
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import multiprocessing as mp
import os 
import math
from functools import partial

DATA_DIR = 'data'

stemmer = PorterStemmer()


def _load_jsonl_texts(path):
    """Read a JSON-lines file and return a ``{int(_id): text}`` dict.

    Every line must be a JSON object carrying ``_id`` and ``text`` keys.
    The file is decoded as UTF-8 explicitly, so the result does not depend on
    the platform's default locale encoding (assumes the files are UTF-8, the
    standard encoding for JSON -- TODO confirm for this dataset).
    """
    with open(path, 'r', encoding='utf-8') as f:
        return {int(item['_id']): item['text'] for item in map(json.loads, f)}


# Load the data from files
corpus_data = _load_jsonl_texts(f'{DATA_DIR}/corpus.jsonl')
queries_data = _load_jsonl_texts(f'{DATA_DIR}/queries.jsonl')

train_data = pd.read_csv(f'{DATA_DIR}/task1_train.tsv', delimiter='\t')
test_data = pd.read_csv(f'{DATA_DIR}/task1_test.tsv', delimiter='\t')

# Rename corpus-id to document_id and query-id to query_id in both train and test data
train_data = train_data.rename(columns={'corpus-id': 'document_id', 'query-id': 'query_id'})
test_data = test_data.rename(columns={'corpus-id': 'document_id', 'query-id': 'query_id'})
# Make sure that the document_id and query_id are int64.
# NOTE(review): only train_data is cast; test_data keeps whatever dtypes
# read_csv inferred. Confirm whether the test file has the same id columns
# before extending the cast to it.
train_data['document_id'] = train_data['document_id'].astype('int64')
train_data['query_id'] = train_data['query_id'].astype('int64')

# Create a df from the corpus data (index = integer document id, one 'text' column)
corpus_df = pd.DataFrame.from_dict(corpus_data, orient='index', columns=['text'])
# Create a df from the queries data (index = integer query id, one 'text' column)
queries_df = pd.DataFrame.from_dict(queries_data, orient='index', columns=['text'])

In [3]:
# Plain Python list of every corpus text, in corpus_df row order.
documents = corpus_df['text'].tolist()

In [4]:
from gensim.parsing.preprocessing import preprocess_string

# One-off preprocessing, kept commented out for reference. The file written below is the
# same path that the Doc2Vec training call in the next cell reads via `corpus_file`.
# # Preprocess the documents
# preprocessed_documents = [preprocess_string(document) for document in tqdm(documents, desc='Preprocessing documents', total=len(documents))]

# from gensim.utils import save_as_line_sentence
# # serialize the preprocessed corpus into a single file on disk, using memory-efficient streaming
# save_as_line_sentence(preprocessed_documents, f'{DATA_DIR}/gensim_preprocessed_documents.txt')

In [5]:
from gensim.models.doc2vec import Doc2Vec, TaggedLineDocument
import multiprocessing as mp


print(f'Using {mp.cpu_count()} cores')

# Location of the serialized Doc2Vec model.
DOC2VEC_MODEL_PATH = f'{DATA_DIR}/doc2vec_model'

if os.path.exists(DOC2VEC_MODEL_PATH):
    # load the model from disk
    model = Doc2Vec.load(DOC2VEC_MODEL_PATH)
else:
    # No saved model yet: train from the line-per-document corpus file and
    # persist the result so later runs take the fast path above.
    # Requires gensim_preprocessed_documents.txt to already exist on disk.
    model = Doc2Vec(
        corpus_file=f'{DATA_DIR}/gensim_preprocessed_documents.txt',
        epochs=3,
        vector_size=600,
        workers=mp.cpu_count(),
    )
    model.save(DOC2VEC_MODEL_PATH)

Using 20 cores


In [6]:
# save the model
# model.save(f'{DATA_DIR}/doc2vec_model')

In [7]:
# Use the text of the first row of queries_df as the example query.
query = queries_df['text'].iloc[0]

In [8]:
# Run the query through gensim's preprocess_string (default filters), the same
# function the commented-out corpus preprocessing step uses.
preprocessed_query = preprocess_string(s=query)

In [9]:
# Infer a vector for the preprocessed query with the loaded Doc2Vec model.
query_vector = model.infer_vector(doc_words=preprocessed_query)

In [10]:
# Look up the document vectors most similar to the query vector.
# topn=10 is gensim's default, spelled out here for the reader.
sims = model.dv.most_similar(positive=[query_vector], topn=10)
print(len(sims))

10


In [11]:
# Print the query, a blank line, then one block per retrieved document.
print(f'Query: {query}', end='\n\n')
# Each entry of sims is used as (row position in corpus_df, similarity score):
# the first element indexes corpus_df positionally, and the label found at
# that position is what gets reported as the document ID.
for row_pos, score in sims:
    print(
        f'Document: {corpus_df["text"].iloc[row_pos]}',
        f'Document ID: {corpus_df.index[row_pos]}',
        f'Similarity: {score}',
        '-------------------------------------------',
        sep='\n',
    )

Query: )what was the immediate impact of the success of the manhattan project?

Document: Manhattan: Annual Weather Averages. July is the hottest month in Manhattan with an average temperature of 25Â°C (77Â°F) and the coldest is January at 2Â°C (35Â°F). The wettest month is August with an average of 130mm of rain. Loading weather data.
Document ID: 349384
Similarity: 0.5778313875198364
-------------------------------------------
Document: Information provided about Impact: Impact meaning in Hindi : Get meaning and translation of IMPACT in Hindi language with grammar,antonyms,synonyms and sentence usages. Know answer of question : what is meaning of Impact in Hindi dictionary? Impact ka matalab hindi me kya hai (Impact à¤à¤¾ à¤¹à¤¿à¤à¤¦à¥ à¤®à¥à¤ à¤®à¤¤à¤²à¤¬ ). Impact meaning in Hindi (à¤¹à¤¿à¤¨à¥à¤¦à¥ à¤®à¥ à¤®à¥à¤¨à¤¿à¤à¤ ) is à¤ªà¥à¤°à¤­à¤¾à¤µ.English definition of Impact : the striking of one body against another.
Document ID: 3888081
Similarity: 0.5301682949066162
----

: 