# doc2vec

In [7]:
import numpy as np
import glob
import gensim
import os
import multiprocessing as mp
from timeit import default_timer as timer
from datetime import timedelta
import pandas as pd
import random
from sklearn.model_selection import GridSearchCV, ParameterGrid
from scipy.spatial.distance import cosine as cosine_distance
import re
from sklearn.base import BaseEstimator

Load training data.

In [14]:
import sys
FILE_DIR = os.path.dirname(os.path.abspath('__file__'))
sys.path.append(FILE_DIR)
datapath = os.path.join(FILE_DIR, 'abstracts/' + 'numtoken' + '/*.npy')
loaded = os.path.join(FILE_DIR, 'abstracts/' + '233' + '.npy')
loaded

'/Volumes/BRIENNAKH/Thesis/notebooks/abstracts/233.npy'

In [9]:
data = np.array(glob.glob('/Volumes/BRIENNAKH/Thesis/data/2020_06_12_abstract_tokens/numtoken/*.npy'))[:1000]
print('Number of documents: ' + str(len(data)))

Number of documents: 1000


Write the iterator that will load and then close each document, one at a time. 

In [None]:
class DocumentIterator(object):
    def load_documents(self):
        for filename in self.filenames: 
            loaded_file = np.load(filename)
            tag = os.path.splitext(os.path.basename(filename))[0]
            abstract = gensim.models.doc2vec.TaggedDocument(words=loaded_file, tags=[tag])
            try:
                yield abstract
            except Exception as e:
                print('Error!' + str(e))
            finally:
                del loaded_file
                # print('Closed ' + tag)
                
    def __init__(self, filenames):
        self.filenames = filenames
        #self.documents = self.load_documents()

    def __iter__(self):
        self.documents = self.load_documents() # Reset the iterator
        return self
    
    def __next__(self):
        abstract = next(self.documents)
        return abstract

## Write GridSearch code, troubleshooting on mini corpus

Confirm mini corpus trains quickly.

In [4]:
mini_corpus = DocumentIterator(filenames[:1000])
model = gensim.models.doc2vec.Doc2Vec(vector_size=100,
                                      alpha=0.025,
                                      min_alpha=0.001,
                                      min_count=1,
                                      workers=mp.cpu_count(),
                                      dm=1,
                                      epochs=10)
model.build_vocab(mini_corpus)
model.train(mini_corpus, total_examples=model.corpus_count, epochs=model.epochs)
model.save('/Volumes/BRIENNAKH/Thesis/results/2020_06_19_doc2vec_1000_docs/model.d2v')

NameError: name 'DocumentIterator' is not defined

Select parameters to optimize. 

- dm 
    - 0: "distributed bag of words" (PV-DBOW)
    - 1: "distributed memory" (PV-DM)
- hs 
    - 0: "hierarchical softmax" used for model training
    - 1: negative sampling will be used for model training (if negative is non-zero too)
    
"By following a grid-search strategy, we optimized six parameters to train more than 1,900 models."

In [3]:
class Doc2VecModel(BaseEstimator):
    '''The Estimator that will be used for GridSearch.
    In __init__ we take parameters for a specific model,
    then the GridSearch will call fit on this model.'''
    
    def __init__(self, dm=1, vector_size=100, window=1):
        '''Must match all parameters in my param dict.'''
        self.model = None
        self.vector_size = vector_size
        self.window = window
        self.dm = dm

    def fit(self, train_set, triplets_path):
        '''y = None cuz this is unsupervised training.'''
        self.triplets_path = triplets_path
        self.model = gensim.models.doc2vec.Doc2Vec(vector_size=self.vector_size, 
                                                   window=self.window, 
                                                   dm=self.dm, 
                                                   epochs=10,
                                                   alpha=0.025, 
                                                   min_alpha=0.001)
        tagged_docs = DocumentIterator(train_set)
        self.model.build_vocab(tagged_docs)
        self.model.train(docs, total_examples=self.model.corpus_count, epochs=self.model.epochs)
        return self

def evaluate(self, model):
    '''Calculates averaged accuracy of all given triplets.
    [accuracy that ab is more similar than ac, ab more similar than bc]'''
    accuracies = [0, 0]
    triplets = np.load(model.triplets_path)
    for triplet in triplets:
        accuracy = calculate_accuracy(model, triplet)
        accuracies[0] += accuracy[0]
        accuracies[1] += accuracy[1]
    return [accuracies[0]/len(triplets), accuracies[1]/len(triplets)]
    
def scorer(self, estimator):
    accuracy = evaluate(estimator.model)
    return max(accuracy)

def get_vector(model, doc_id):
    '''Takes a Pandas row for a paper.'''
    loaded = np.load('/Volumes/BRIENNAKH/Thesis/data/2020_06_09_abstract_tokens/' + doc_id + '.npy')
    return model.infer_vector(loaded)

def calculate_accuracy(model, triplet):
    '''Calculates accuracy of one given triplet using two formulas.
    [ab > ac, ab > bc]. 1= yes, 0 = no.'''
    a_vector = get_vector(model, triplet[0])
    b_vector = get_vector(model, triplet[1])
    c_vector = get_vector(model, triplet[2])
    ab_dist = cosine_distance(a_vector, b_vector)
    ac_dist = cosine_distance(a_vector, c_vector)
    bc_dist = cosine_distance(b_vector, c_vector)
    if ab_dist > ac_dist and ab_dist > bc_dist:
        return [1, 1]
    elif ab_dist > ac_dist and ab_dist < bc_dist:
        return [1, 0]
    elif ab_dist < ac_dist and ab_dist > bc_dist:
        return [0, 1]
    else:
        return [0, 0]

Set up GridSearch. Helpful [tutorial](https://medium.com/datadriveninvestor/an-introduction-to-grid-search-ff57adcc0998).

In [18]:
from sklearn.model_selection import ParameterGrid

params = {'dm': [0, 1],
          'vector_size': [100, 200, 300], 
          'corpus': ['numtoken'],
          'hs': [0, 1]}

for i in ParameterGrid(params):
    print(i)
    print(i.values())
    
len(list(ParameterGrid(params)))

{'corpus': 'numtoken', 'dm': 0, 'hs': 0, 'vector_size': 100}
dict_values(['numtoken', 0, 0, 100])
{'corpus': 'numtoken', 'dm': 0, 'hs': 0, 'vector_size': 200}
dict_values(['numtoken', 0, 0, 200])
{'corpus': 'numtoken', 'dm': 0, 'hs': 0, 'vector_size': 300}
dict_values(['numtoken', 0, 0, 300])
{'corpus': 'numtoken', 'dm': 0, 'hs': 1, 'vector_size': 100}
dict_values(['numtoken', 0, 1, 100])
{'corpus': 'numtoken', 'dm': 0, 'hs': 1, 'vector_size': 200}
dict_values(['numtoken', 0, 1, 200])
{'corpus': 'numtoken', 'dm': 0, 'hs': 1, 'vector_size': 300}
dict_values(['numtoken', 0, 1, 300])
{'corpus': 'numtoken', 'dm': 1, 'hs': 0, 'vector_size': 100}
dict_values(['numtoken', 1, 0, 100])
{'corpus': 'numtoken', 'dm': 1, 'hs': 0, 'vector_size': 200}
dict_values(['numtoken', 1, 0, 200])
{'corpus': 'numtoken', 'dm': 1, 'hs': 0, 'vector_size': 300}
dict_values(['numtoken', 1, 0, 300])
{'corpus': 'numtoken', 'dm': 1, 'hs': 1, 'vector_size': 100}
dict_values(['numtoken', 1, 1, 100])
{'corpus': 'numtoken

12

In [69]:
from joblib import Parallel, parallel_backend, register_parallel_backend, delayed
from ipyparallel.joblib import IPythonParallelBackend
from ipyparallel import Client
import sys 
import os
import pickle
import numpy as np
import logging
logging.basicConfig(level=logging.INFO)
from sklearn.model_selection import ParameterGrid
from thiswillwork import train_and_evaluate

start = timer()
params = {
    'dm': [0, 1],
    'vector_size': [100, 300]
}

FILE_DIR = os.path.dirname(os.path.join(os.path.abspath(''), 'notebooks'))
sys.path.append(FILE_DIR)

profile = 'ipy_profile'
c = Client(profile=profile)

triplets = np.load('/s/BRIENNAKH/Thesis/results/2020_06_19_doc2vec_1000_docs/100_triplets.npy')

# Make sure that each engine is running in the right 
# working directory to access the custom function(s)
c[:].map(os.chdir, [FILE_DIR]*len(c))
logging.info("c.ids :{0}".format(str(c.ids)))
bview = c.load_balanced_view()
register_parallel_backend('ipyparallel', lambda : IPythonParallelBackend(view=bview))

with parallel_backend('ipyparallel'):
    scores = Parallel(n_jobs=len(c))(delayed(train_and_evaluate)(data, triplets, p) for p in ParameterGrid(params))
    scores_formatted = {k: v for d in scores for k, v in d.items()}
    with open('scores.pkl', 'wb') as f:
        pickle.dump(scores_formatted, f, pickle.HIGHEST_PROTOCOL)
    
end = timer()
print ("Execution time HH:MM:SS:",timedelta(seconds=end-start))

INFO:root:c.ids :[0, 1, 2, 3]


Execution time HH:MM:SS: 0:00:26.583568


In [None]:
{1: {'parameters': {'corpus': 'numtoken', 'dm': 1, 'vector_size': 200}, 'score': 0.45}}

In [1]:
import pickle
import pandas as pd
with open('/Volumes/BRIENNAKH/Thesis/src/2020_06_20_doc2vec_slurm_pkg/scores.pkl', 'rb') as f:
    scores = pickle.load(f)
    best_parameters = scores[max(scores.keys())]
    print(best_parameters)
    print(scores.keys())

{'parameters': {'corpus': 'numtoken', 'dm': 1, 'vector_size': 1000}, 'score': 0.44}
dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19])


In [56]:
scores_df = pd.DataFrame(scores).transpose()
scores_series = scores_df['score']
scores_df = scores_df['parameters'].apply(pd.Series)
scores_df['score'] = scores_series
scores_df

Unnamed: 0,corpus,dm,vector_size,score
0,numtoken,0,100,0.4
1,numtoken,0,200,0.38
2,numtoken,0,300,0.4
3,numtoken,0,400,0.39
4,numtoken,0,500,0.4
5,numtoken,0,600,0.42
6,numtoken,0,700,0.42
7,numtoken,0,800,0.41
8,numtoken,0,900,0.44
9,numtoken,0,1000,0.42


In [None]:
triplets_path = '/Volumes/BRIENNAKH/Thesis/results/2020_06_19_doc2vec_1000_docs/100_triplets.npy'


grid = GridSearchCV(estimator=Doc2VecModel(),
                   param_grid=params,
                   cv=5, 
                   scoring=scorer, 
                   verbose=0, 
                   n_jobs=-1)

grid_result = grid.fit(train_set, triplets_path)

In [None]:
D2VTransformer(DocumentIterator(train_set)).get_params().keys()

Train the model. Do I need to include seed if I'm doing gridsearch over hyperparameters? Ask this question on google groups.

In [None]:
grid_result.best_params_

In [None]:
grid_result.cv_results_

In [None]:
model.load()


In [None]:
docs = DocumentIterator(filenames)
model = gensim.models.doc2vec.Doc2Vec(vector_size=300,
                               alpha=0.025,
                               min_alpha=0.001, 
                               min_count=1,
                workers=mp.cpu_count(),
                dm=1,
                epochs=10)

# Build vocabulary
start = timer()
model.build_vocab(docs)
end = timer()
print('Built vocabulary!')
print ("Execution time HH:MM:SS:",timedelta(seconds=end-start))

# Train the model
start = timer()
model.train(docs, total_examples=model.corpus_count, epochs=model.epochs)
model.save('/Volumes/BRIENNAKH/Thesis/results/2020_06_13_doc2vec/d2v.model')
print('Model Saved!')
end = timer()
print ("Execution time HH:MM:SS:",timedelta(seconds=end-start))

In [None]:
model.save('/Volumes/BRIENNAKH/Thesis/results/2020_06_13_doc2vec/d2v.model')
print('Model Saved!')
end = timer()
print ("Execution time HH:MM:SS:",timedelta(seconds=end-start))

## Evaluate

Are the inferred vectors close to the actual ones?

In [None]:
doc_id = random.choice(list(model.docvecs.doctags))
doc = np.load('/Volumes/BRIENNAKH/Thesis/data/2020_06_09_abstract_tokens/' + doc_id + '.npy')
inferred_docvec = model.infer_vector(doc)
print(doc_id)
print('%s:\n %s' % (model, model.docvecs.most_similar([inferred_docvec], topn=5)))

In [None]:
analogy_results = model.wv.evaluate_word_analogies('/Volumes/BRIENNAKH/Thesis/data/2020_06_12_analogies/analogies.txt')
analogy_results[1][0]

In [None]:
print('Words in model: ' + str(len(model.wv.vocab.keys())))

In [None]:
model.wv.most_similar('NASA')

Cannot use GPU. https://github.com/RaRe-Technologies/gensim/issues/449 

In [10]:
model = gensim.models.Doc2Vec.load('../results/2020_06_10_doc2vec_model/d2v.model')

In [14]:
model.docvecs['1508.02437']

array([-0.00149401,  0.00273189, -0.00425196,  0.00217884,  0.00158152,
       -0.00231829,  0.00190731,  0.00491803, -0.00284187, -0.00442933,
       -0.0044723 , -0.00283791,  0.00352604,  0.0020828 , -0.00205341,
       -0.00454024, -0.00439921, -0.00181873, -0.00223137,  0.00359157,
       -0.0042825 , -0.00327853,  0.00307286, -0.00119451,  0.00275379,
        0.00476028, -0.00437622, -0.00143195, -0.00209123,  0.0030231 ,
        0.00207966, -0.00406045,  0.00305885,  0.00239352,  0.00127178,
       -0.00244026, -0.00463173, -0.00292976,  0.00181851,  0.00249871,
        0.00096328, -0.00249196, -0.00111651,  0.00109859, -0.00014503,
        0.00373352,  0.00173389, -0.00496073,  0.00291012,  0.00439866,
       -0.00458495,  0.00036439, -0.00448882, -0.00445445, -0.00042387,
        0.00222825,  0.00472964,  0.0026782 ,  0.00313185, -0.00106612,
        0.00052141,  0.00268776, -0.00017095,  0.00499439, -0.00288416,
       -0.00314789, -0.00411559, -0.0019212 ,  0.00353793,  0.00