In [1]:
import pandas as pd
import json
import helpers
from tqdm.notebook import tqdm
import numpy as np
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import multiprocessing as mp
import os 
import math
from functools import partial

DATA_DIR = 'data'

stemmer = PorterStemmer()
# Load the data from files
with open(f'{DATA_DIR}/corpus.jsonl', 'r') as f:
    corpus_data = {int(item['_id']): item['text'] for item in (json.loads(line) for line in f)}

with open(f'{DATA_DIR}/queries.jsonl', 'r') as f:
    queries_data = {int(item['_id']): item['text'] for item in (json.loads(line) for line in f)}

train_data = pd.read_csv(f'{DATA_DIR}/task1_train.tsv', delimiter='\t')
test_data = pd.read_csv(f'{DATA_DIR}/task1_test.tsv', delimiter='\t')

# Rename corpus-id to document_id and query-id to query_id in both train and test data
train_data = train_data.rename(columns={'corpus-id': 'document_id', 'query-id': 'query_id'})
test_data = test_data.rename(columns={'corpus-id': 'document_id', 'query-id': 'query_id'})
# Make sure that the document_id and query_id are int64
train_data['document_id'] = train_data['document_id'].astype('int64')
train_data['query_id'] = train_data['query_id'].astype('int64')

In [2]:
# Create a df from the corpus data
corpus_df = pd.DataFrame.from_dict(corpus_data, orient='index', columns=['text'])
# Create a df from the queries data
queries_df = pd.DataFrame.from_dict(queries_data, orient='index', columns=['text'])

In [3]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [4]:
# import partial
from functools import partial

# Check if documents.pkl exists:
if os.path.isfile(f'{DATA_DIR}/documents.pkl'):
    print('Loading tokenized documents from pickle file...')
    # load the tokenized documents from pickle file
    import pickle
    with open(f'{DATA_DIR}/documents.pkl', 'rb') as f:
        documents = pickle.load(f)
else:
    print('File not found. Tokenizing documents...')
    documents = corpus_df['text'].tolist()
    documents = [x.strip() for x in documents]
    # use multiprocessing to speed up the process
    pool = mp.Pool(mp.cpu_count())
    # pass both documents and stemmer as arguments to the tokenize function
    fn = partial(helpers.tokenize, stemmer=stemmer)   
    documents = list(tqdm(pool.imap(fn, documents), total=len(documents))) 
    # save the tokenized documents as pickle file
    import pickle
    with open(f'{DATA_DIR}/documents.pkl', 'wb') as f:
        pickle.dump(documents, f)

Loading tokenized documents from pickle file...


In [5]:
tagged_documents = [TaggedDocument(doc, [i]) for i, doc in tqdm(enumerate(documents))]

0it [00:00, ?it/s]

In [6]:
model = Doc2Vec(vector_size=400, epochs=40, workers=mp.cpu_count())
model.build_vocab(tagged_documents)


In [7]:
model.train(tqdm(tagged_documents), total_examples=len(tagged_documents), epochs=model.epochs)

  0%|          | 0/1471406 [00:00<?, ?it/s]

In [8]:
len(model.docvecs[0])

  len(model.docvecs[0])


400

In [9]:
# import get tmpfile
from gensim.test.utils import get_tmpfile

fname = get_tmpfile("larger_400_doc2vec.model")
model.save(f'{DATA_DIR}/larger_400_doc2vec.model')
# load model
#model = Doc2Vec.load(f'{DATA_DIR}/larger_400_doc2vec.model')

In [10]:
# get the first query
query = queries_df.iloc[0]['text']

In [11]:
query_vector = model.infer_vector(helpers.tokenize(query, stemmer=stemmer).split())

In [12]:
sims = model.docvecs.most_similar([query_vector]) #gives you top 10 document tags and their cosine similarity

  sims = model.docvecs.most_similar([query_vector]) #gives you top 10 document tags and their cosine similarity


In [13]:
query_vector.shape

(400,)

In [14]:
sims

[(1232382, 0.46403947472572327),
 (1081637, 0.42651697993278503),
 (1092058, 0.4058090150356293),
 (368150, 0.4055427312850952),
 (153597, 0.4009922444820404),
 (943251, 0.3986547887325287),
 (444550, 0.39261817932128906),
 (1397469, 0.3723812699317932),
 (320984, 0.3714607357978821),
 (1451430, 0.37132176756858826)]

In [15]:
print(f'Query: {query}')
print()
for doc_id, score in sims:
    print(f'Document: {corpus_df.iloc[doc_id]["text"]}')
    print(f'Document ID: {corpus_df.index.values[doc_id]}')
    print(f'Similarity: {score}')
    print('-------------------------------------------')

Query: )what was the immediate impact of the success of the manhattan project?

Document: Information provided about Impact: Impact meaning in Hindi : Get meaning and translation of IMPACT in Hindi language with grammar,antonyms,synonyms and sentence usages. Know answer of question : what is meaning of Impact in Hindi dictionary? Impact ka matalab hindi me kya hai (Impact à¤à¤¾ à¤¹à¤¿à¤à¤¦à¥ à¤®à¥à¤ à¤®à¤¤à¤²à¤¬ ). Impact meaning in Hindi (à¤¹à¤¿à¤¨à¥à¤¦à¥ à¤®à¥ à¤®à¥à¤¨à¤¿à¤à¤ ) is à¤ªà¥à¤°à¤­à¤¾à¤µ.English definition of Impact : the striking of one body against another.
Document ID: 3888081
Similarity: 0.46403947472572327
-------------------------------------------
Document: As nouns the difference between compression and impaction is that compression is an increase in density; the act of compressing, or the state of being compressed; compaction while impaction is compression; the packing together of loose matter.
Document ID: 1468309
Similarity: 0.42651697993278503
-----