In [1]:
import pandas as pd
import json
import helpers
from tqdm.notebook import tqdm
import numpy as np
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import multiprocessing as mp
import os 
import math
from functools import partial

DATA_DIR = 'data'

stemmer = PorterStemmer()

# Load the corpus and the queries from their JSONL files.
# Each non-blank line is parsed as a JSON object; its '_id' (cast to int)
# becomes the key and its 'text' becomes the value.
# The encoding is pinned to UTF-8 so that decoding does not depend on the
# platform's default locale encoding, and blank lines (e.g. a trailing empty
# line) are skipped instead of crashing json.loads.
with open(f'{DATA_DIR}/corpus.jsonl', 'r', encoding='utf-8') as f:
    corpus_data = {
        int(item['_id']): item['text']
        for item in (json.loads(line) for line in f if line.strip())
    }

with open(f'{DATA_DIR}/queries.jsonl', 'r', encoding='utf-8') as f:
    queries_data = {
        int(item['_id']): item['text']
        for item in (json.loads(line) for line in f if line.strip())
    }

train_data = pd.read_csv(f'{DATA_DIR}/task1_train.tsv', delimiter='\t')
test_data = pd.read_csv(f'{DATA_DIR}/task1_test.tsv', delimiter='\t')

# Rename corpus-id to document_id and query-id to query_id in both train and test data
train_data = train_data.rename(columns={'corpus-id': 'document_id', 'query-id': 'query_id'})
test_data = test_data.rename(columns={'corpus-id': 'document_id', 'query-id': 'query_id'})
# Make sure that the document_id and query_id of the training data are int64
train_data['document_id'] = train_data['document_id'].astype('int64')
train_data['query_id'] = train_data['query_id'].astype('int64')

In [2]:
# Build one single-column ('text') DataFrame per id -> text mapping;
# the mapping keys become the index of each frame.
corpus_df, queries_df = (
    pd.DataFrame.from_dict(id_to_text, orient='index', columns=['text'])
    for id_to_text in (corpus_data, queries_data)
)

In [3]:
# Number of rows in the corpus frame
corpus_df.shape[0]

1471406

In [4]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess

In [5]:
# # import partial
# from functools import partial

# documents = corpus_df['text'].tolist()
# documents = [x.strip() for x in documents]
# documents = [simple_preprocess(doc) for i, doc in tqdm(enumerate(documents))]

In [6]:
#len(documents)

In [7]:
# save as line_sentence
from gensim.utils import save_as_line_sentence
#save_as_line_sentence(documents, f'{DATA_DIR}/tagged_documents_linesentence.txt')

In [8]:
from gensim.test.utils import get_tmpfile

from gensim.models.callbacks import CallbackAny2Vec


class EpochSaver(CallbackAny2Vec):
    """Callback that saves the model at the end of every epoch.

    The file name of each snapshot is built from ``path_prefix`` and a
    zero-based epoch counter; the final path is obtained from ``get_tmpfile``.
    """

    def __init__(self, path_prefix):
        """Remember the file-name prefix and start the epoch counter at zero."""
        self.epoch = 0
        self.path_prefix = path_prefix

    def on_epoch_end(self, model):
        """Save ``model`` for the epoch that just ended, then advance the counter."""
        snapshot_name = '{}_epoch{}.model'.format(self.path_prefix, self.epoch)
        model.save(get_tmpfile(snapshot_name))
        self.epoch += 1

In [9]:
class EpochLogger(CallbackAny2Vec):
    """Callback that prints epoch boundaries and the model's latest training loss."""

    # NOTE(review): the recorded training run in this notebook printed
    # "Loss: 0.0" for every epoch; presumably the loss is not populated for
    # this model type -- confirm before relying on the printed value.

    def __init__(self):
        """Start the epoch counter at zero."""
        self.epoch = 0

    def on_epoch_begin(self, model):
        """Print a marker for the start of the current epoch."""
        print("Epoch #{} start".format(self.epoch))

    def on_epoch_end(self, model):
        """Print the end marker and the latest loss, then advance the counter."""
        print("Epoch #{} end".format(self.epoch))
        latest_loss = model.get_latest_training_loss()
        print("Loss: {}".format(latest_loss))
        self.epoch += 1

In [10]:
# Callback that reports epoch boundaries and the loss during training.
logger = EpochLogger()

# Doc2Vec hyper-parameters gathered in one place so they are easy to scan and tune.
doc2vec_params = dict(
    vector_size=400,
    epochs=20,
    workers=mp.cpu_count(),  # one worker thread per reported CPU
    dm=0,
    sample=0.00001,
    window=20,
    min_count=1,
)
model = Doc2Vec(**doc2vec_params)

# Build the vocabulary from the line-sentence corpus file.
model.build_vocab(corpus_file=f'{DATA_DIR}/tagged_documents_linesentence.txt')

In [11]:
# Train on the same line-sentence file the vocabulary was built from.
# total_examples is taken from model.corpus_count (populated by build_vocab)
# instead of len(corpus_df): the corpus file is a cached artifact, so this
# keeps the example count tied to the file that is actually read even if the
# file and the DataFrame ever get out of sync.
# NOTE(review): the recorded run printed "Loss: 0.0" for every epoch, so
# compute_loss=True does not appear to yield a usable loss here -- confirm.
model.train(
    corpus_file=f'{DATA_DIR}/tagged_documents_linesentence.txt',
    total_examples=model.corpus_count,
    total_words=model.corpus_total_words,
    epochs=model.epochs,
    report_delay=10,
    callbacks=[logger],
    compute_loss=True,
)

Epoch #0 start
Epoch #0 end
Loss: 0.0
Epoch #1 start
Epoch #1 end
Loss: 0.0
Epoch #2 start
Epoch #2 end
Loss: 0.0
Epoch #3 start
Epoch #3 end
Loss: 0.0
Epoch #4 start
Epoch #4 end
Loss: 0.0
Epoch #5 start
Epoch #5 end
Loss: 0.0
Epoch #6 start
Epoch #6 end
Loss: 0.0
Epoch #7 start
Epoch #7 end
Loss: 0.0
Epoch #8 start
Epoch #8 end
Loss: 0.0
Epoch #9 start
Epoch #9 end
Loss: 0.0
Epoch #10 start
Epoch #10 end
Loss: 0.0
Epoch #11 start
Epoch #11 end
Loss: 0.0
Epoch #12 start
Epoch #12 end
Loss: 0.0
Epoch #13 start
Epoch #13 end
Loss: 0.0
Epoch #14 start
Epoch #14 end
Loss: 0.0
Epoch #15 start
Epoch #15 end
Loss: 0.0
Epoch #16 start
Epoch #16 end
Loss: 0.0
Epoch #17 start
Epoch #17 end
Loss: 0.0
Epoch #18 start
Epoch #18 end
Loss: 0.0
Epoch #19 start
Epoch #19 end
Loss: 0.0


In [12]:
# Dimensionality of the first learned document vector.
# model.dv replaces the deprecated model.docvecs alias.
len(model.dv[0])

  len(model.docvecs[0])


400

In [13]:
# import get tmpfile
from gensim.test.utils import get_tmpfile

fname = get_tmpfile("doc2vec_model2")  # NOTE(review): not used by the save below

# Persist the trained model under DATA_DIR
model_path = f'{DATA_DIR}/doc2vec_model2.model'
model.save(model_path)
# load model
#model = Doc2Vec.load(f'{DATA_DIR}/larger_400_doc2vec.model')

In [14]:
# Take the text of the first query (by position) and tokenise it
first_query_text = queries_df['text'].iloc[0]
query = simple_preprocess(first_query_text)
query

['what',
 'was',
 'the',
 'immediate',
 'impact',
 'of',
 'the',
 'success',
 'of',
 'the',
 'manhattan',
 'project']

In [15]:
# Infer a vector for the tokenised query with the trained model
query_vector = model.infer_vector(query)

In [16]:
# Top 10 document tags most similar to the query vector, with their cosine similarity.
# model.dv replaces the deprecated model.docvecs alias; topn is spelled out
# so the number of results does not depend on the library default.
sims = model.dv.most_similar([query_vector], topn=10)

  sims = model.docvecs.most_similar([query_vector]) #gives you top 10 document tags and their cosine similarity


In [17]:
# Display the (document tag, similarity) pairs
sims

[(414557, 0.6452532410621643),
 (989485, 0.6444576382637024),
 (1214391, 0.6429510712623596),
 (45110, 0.6416457891464233),
 (1241283, 0.6410707831382751),
 (412685, 0.6367886066436768),
 (1461974, 0.6362625360488892),
 (696563, 0.6349706649780273),
 (906517, 0.6323652267456055),
 (309150, 0.6317585110664368)]

In [18]:
print(f'Query: {query}')
print()
divider = '-------------------------------------------'
for doc_id, score in sims:
    # doc_id is used as a row position into corpus_df; the frame's index
    # value at that position is reported as the document ID.
    matched_text = corpus_df['text'].iloc[doc_id]
    matched_id = corpus_df.index.values[doc_id]
    print(f'Document: {matched_text}')
    print(f'Document ID: {matched_id}')
    print(f'Similarity: {score}')
    print(divider)

Query: ['what', 'was', 'the', 'immediate', 'impact', 'of', 'the', 'success', 'of', 'the', 'manhattan', 'project']

Document: 1 The Giant Sequoia named The General Sherman Tree reigns supreme as the largest of the living things on earth. 2  This tree is so large that it's seemingly small growth rate of only one millimeter per year yields a volume of new wood equal to that of all the wood found in a 50 foot tree! Since there is an average of 200 seeds per cone, 400,000 seeds could be released from each tree each year. 2  With an average of three mature trees per acre, over a million seeds are produced per acre per year in most Sequoia groves. 3  Giant Sequoias can provide food for themselves (and others).
Document ID: 6550119
Similarity: 0.6452532410621643
-------------------------------------------
Document: Basic Skills For Kids (KIM-9117CD) by William Janiak is an enticing collection of fun and educational songs aimed at teaching a child a solid understanding of body part identificati