# doc2vec

In [29]:
import numpy as np
import glob
import os
import gensim
import multiprocessing as mp
from timeit import default_timer as timer
from datetime import timedelta
import pandas as pd
import random
from sklearn.model_selection import GridSearchCV
from scipy.spatial.distance import cosine as cosine_distance
import re
from sklearn.base import BaseEstimator

import logging
logging.basicConfig(level=logging.INFO)

Load acceptable categories, metadata, filenames.

In [6]:
acceptable_categories = ['astro-ph.SR',
                         'astro-ph.CO',
                         'astro-ph.GA',
                         'astro-ph.HE',
                         'astro-ph.EP',
                         'astro-ph.IM']

filenames = np.array(glob.glob('/Volumes/BRIENNAKH/Thesis/data/2020_06_12_abstract_tokens/numtoken/*.npy'))
print('Number of documents: ' + str(len(filenames)))

metadata = pd.read_csv('/Volumes/BRIENNAKH/Thesis/data/2020_03_06_arxiv_metadata_astroph/arxiv_metadata_astroph.csv',
                       dtype={'filename': str,
                              'filename_parsed': str,
                              'identifier': str,
                              'updated': str,
                              'doi': str}, 
                       parse_dates=['date_retrieved'])

Number of documents: 139878


Write the iterator that will load and then close each document, one at a time. 

In [21]:
class DocumentIterator(object):
    def load_documents(self):
        for filename in self.filenames: 
            loaded_file = np.load(filename)
            tag = os.path.splitext(os.path.basename(filename))[0]
            abstract = gensim.models.doc2vec.TaggedDocument(words=loaded_file, tags=[tag])
            try:
                yield abstract
            except Exception as e:
                print('Error!' + str(e))
            finally:
                del loaded_file
                # print('Closed ' + tag)
                
    def __init__(self, filenames):
        self.filenames = filenames
        #self.documents = self.load_documents()

    def __iter__(self):
        self.documents = self.load_documents() # Reset the iterator
        return self
    
    def __next__(self):
        abstract = next(self.documents)
        return abstract

## Write GridSearch code, troubleshooting on mini corpus

Confirm mini corpus trains quickly.

In [22]:
mini_corpus = DocumentIterator(filenames[:1000])
model = gensim.models.doc2vec.Doc2Vec(vector_size=100,
                                      alpha=0.025,
                                      min_alpha=0.001, 
                                      min_count=1,
                                      workers=mp.cpu_count(),
                                      dm=1,
                                      epochs=10)
model.build_vocab(mini_corpus)
model.train(mini_corpus, total_examples=model.corpus_count, epochs=model.epochs)
model.save('/Volumes/BRIENNAKH/Thesis/results/2020_06_19_doc2vec_1000_docs/model.d2v')

INFO:gensim.models.doc2vec:collecting all words and their counts
INFO:gensim.models.doc2vec:PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
INFO:gensim.models.doc2vec:collected 9670 word types and 1000 unique tags from a corpus of 1000 examples and 187912 words
INFO:gensim.models.word2vec:Loading a fresh vocabulary
INFO:gensim.models.word2vec:effective_min_count=1 retains 9670 unique words (100% of original 9670, drops 0)
INFO:gensim.models.word2vec:effective_min_count=1 leaves 187912 word corpus (100% of original 187912, drops 0)
INFO:gensim.models.word2vec:deleting the raw counts dictionary of 9670 items
INFO:gensim.models.word2vec:sample=0.001 downsamples 31 most-common words
INFO:gensim.models.word2vec:downsampling leaves estimated 141930 word corpus (75.5% of prior 187912)
INFO:gensim.models.base_any2vec:estimated required memory for 9670 words and 100 dimensions: 13171000 bytes
INFO:gensim.models.word2vec:resetting layer weights
INFO:gensim.models.base_any2

Select parameters to optimize. 

- dm 
    - 0: "distributed bag of words" (PV-DBOW)
    - 1: "distributed memory" (PV-DM)
- hs 
    - 0: "hierarchical softmax" used for model training
    - 1: negative sampling will be used for model training (if negative is non-zero too)
    
"By following a grid-search strategy, we optimized six parameters to train more than 1,900 models."

In [50]:
parameters = {
    'dm': [0, 1],
    'vector_size': [100, 300]
}

In [62]:
class Doc2VecModel(BaseEstimator):
    '''The Estimator that will be used for GridSearch.
    In __init__ we take parameters for a specific model,
    then the GridSearch will call fit on this model.'''
    
    def __init__(self, dm=1, vector_size=100, window=1):
        '''Must match all parameters in my param dict.'''
        self.model = None
        self.vector_size = vector_size
        self.window = window
        self.dm = dm

    def fit(self, train_set, y=None):
        '''y = None cuz this is unsupervised training.'''
        self.model = gensim.models.doc2vec.Doc2Vec(vector_size=self.vector_size, 
                                                   window=self.window, 
                                                   dm=self.dm, 
                                                   epochs=10,
                                                   alpha=0.025, 
                                                   min_alpha=0.001)
        docs = DocumentIterator(train_set)
        self.model.build_vocab(docs)
        self.model.train(docs, total_examples=self.model.corpus_count, epochs=self.model.epochs)
        return self

    def transform(self, raw_documents):
        X = []
        for index, row in raw_documents.iteritems():
            X.append(self.model.infer_vector(row))
        X = pd.DataFrame(X, index=raw_documents.index)
        return X

    def fit_transform(self, raw_documents, y=None):
        self.fit(raw_documents)
        return self.transform(raw_documents)

Set up GridSearch. Helpful [tutorial](https://medium.com/datadriveninvestor/an-introduction-to-grid-search-ff57adcc0998).

In [68]:
def my_scorer(x, y=None):
    '''x is the Doc2VecModel() object. I guess y is nothing? IDK what y is.'''
    triplets = np.load('/Volumes/BRIENNAKH/Thesis/results/2020_06_19_doc2vec_1000_docs/100_triplets.npy')
    accuracy = evaluate(x.model, triplets)
    return max(accuracy)

grid = GridSearchCV(estimator=Doc2VecModel(),
                   param_grid=parameters,
                   cv=5, 
                   scoring=my_scorer, 
                   verbose=0, 
                   n_jobs=-1)

grid_result = grid.fit(train_set)

INFO:gensim.models.doc2vec:collecting all words and their counts
INFO:gensim.models.doc2vec:PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
INFO:gensim.models.doc2vec:collected 9670 word types and 1000 unique tags from a corpus of 1000 examples and 187912 words
INFO:gensim.models.word2vec:Loading a fresh vocabulary
INFO:gensim.models.word2vec:effective_min_count=5 retains 3499 unique words (36% of original 9670, drops 6171)
INFO:gensim.models.word2vec:effective_min_count=5 leaves 177053 word corpus (94% of original 187912, drops 10859)
INFO:gensim.models.word2vec:deleting the raw counts dictionary of 9670 items
INFO:gensim.models.word2vec:sample=0.001 downsamples 33 most-common words
INFO:gensim.models.word2vec:downsampling leaves estimated 130195 word corpus (73.5% of prior 177053)
INFO:gensim.models.base_any2vec:estimated required memory for 3499 words and 100 dimensions: 5148700 bytes
INFO:gensim.models.word2vec:resetting layer weights
INFO:gensim.models.base_

Train the model. Do I need to include seed if I'm doing gridsearch over hyperparameters? Ask this question on google groups.

In [69]:
grid_result.best_params_

{'dm': 1, 'vector_size': 100}

In [None]:
model.load()

In [None]:
docs = DocumentIterator(filenames)
model = gensim.models.doc2vec.Doc2Vec(vector_size=300,
                               alpha=0.025,
                               min_alpha=0.001, 
                               min_count=1,
                workers=mp.cpu_count(),
                dm=1,
                epochs=10)

# Build vocabulary
start = timer()
model.build_vocab(docs)
end = timer()
print('Built vocabulary!')
print ("Execution time HH:MM:SS:",timedelta(seconds=end-start))

# Train the model
start = timer()
model.train(docs, total_examples=model.corpus_count, epochs=model.epochs)
model.save('/Volumes/BRIENNAKH/Thesis/results/2020_06_13_doc2vec/d2v.model')
print('Model Saved!')
end = timer()
print ("Execution time HH:MM:SS:",timedelta(seconds=end-start))

In [None]:
model.save('/Volumes/BRIENNAKH/Thesis/results/2020_06_13_doc2vec/d2v.model')
print('Model Saved!')
end = timer()
print ("Execution time HH:MM:SS:",timedelta(seconds=end-start))

## Evaluate

## Generate triplets

Check Dai, Olah, and Le's triplets to see how they are generated. B has any category from A, C shares no categories with A but may share categories with B. Follows what they said in their paper.

In [None]:
with open('/Volumes/BRIENNAKH/Thesis/data/2020_04_24_dai_olah_le_triplets/arxiv_2014_09_27_examples.txt', 'r') as file:
    daiolahle_triplets = [(line[0], line[1], line[2]) for line in (line.replace('\n', '').split(' ') for line in file.readlines())]

daiolahle_categories = []
for triplet in daiolahle_triplets:
    doc1_id = re.search(r'\w*-?\w*[/.]\d*$', triplet[0])[0].replace('/', '')
    doc2_id = re.search(r'\w*-?\w*[/.]\d*$', triplet[1])[0].replace('/', '')
    doc3_id = re.search(r'\w*-?\w*[/.]\d*$', triplet[2])[0].replace('/', '')
    cats1 = metadata[metadata['filename_parsed'] == doc1_id]['categories']
    cats2 = metadata[metadata['filename_parsed'] == doc2_id]['categories']
    cats3 = metadata[metadata['filename_parsed'] == doc3_id]['categories']
    if any(cats1) and any(cats2) and any(cats3):
        print((cats1.iloc[0], cats2.iloc[0], cats3.iloc[0]))
        # Check if C is in A or B
        print(any(np.isin(cats3.iloc[0].split(' '), cats1.iloc[0].split(' '))) or any(np.isin(cats3.iloc[0].split(' '), cats2.iloc[0].split(' '))))

In [64]:
def generate_triplet(data):
    '''Generates a triplet by randomly selecting 
        - an anchor paper A
        - a paper B that shares any of its categories with A  
        - a paper C which shares no categories with A
    
    Returns a tuple (A, B, C) where each item is the paper's arXiv ID/filename.'''
    # Select Paper A
    a_id = random.choice(data['filename_parsed'].tolist())
    a = data[data['filename_parsed'] == a_id]
    print('\nRandomly chose Paper A from ' + str((len(data))) + ' eligible papers...')
    print('Paper A: ' + a_id)
    a_categories = [x for x in a['categories'].iloc[0].split(' ') if x in acceptable_categories]
    print('Categories: ' + str(a_categories))
    data.drop(a.index, inplace=True) # Remove A from data
    
    # Select Paper B
    b_set = data[data['categories'].str.split(' ').apply(lambda x: np.any(np.isin(x, a_categories)))]
    b_id = random.choice(b_set['filename_parsed'].tolist())
    b = data[data['filename_parsed'] == b_id]
    print('Randomly chose Paper B from ' + str(len(b_set)) + ' eligible papers...')
    print('Paper B: ' + b_id)
    b_categories = [x for x in b['categories'].iloc[0].split(' ') if x in acceptable_categories]
    print('Categories: ' + str(b_categories))
        
    # Select Paper C
    papers_with_a_categories = data[data['categories'].str.split(' ').apply(lambda x: np.any(np.isin(x, a_categories)))]
    c_set = data.merge(papers_with_a_categories, how='outer', indicator=True).loc[lambda x: x['_merge']=='left_only'] 
    c_id = random.choice(c_set['filename_parsed'].tolist())
    c = data[data['filename_parsed'] == c_id]
    print('Randomly chose Paper C from ' + str(len(c_set)) + ' eligible papers...')
    print('Paper C: ' + c_id)
    c_categories = [x for x in c['categories'].iloc[0].split(' ') if x in acceptable_categories]
    print('Categories: ' + str(c_categories))
    
    print(len(papers_with_a_categories) + len(c_set) == len(data))
    data.drop([b.index[0], c.index[0]], inplace=True)
    return (a_id, b_id, c_id)

def generate_triplets(train_set, num_of_triplets):
    data = metadata[metadata['filename_parsed'].isin([os.path.splitext(os.path.basename(x))[0] for x in train_set])]
    triplets = []
    while len(triplets) < num_of_triplets:
        try:
            triplets.append(generate_triplet(data))
            print('Number of triplets: ' + str(len(triplets)))
        except Exception as e: # If we end up with too few choices for C, catch exception and reselect A
            print(e)
    return triplets


def get_vector(model, doc_id):
    '''Takes a Pandas row for a paper.'''
    loaded = np.load('/Volumes/BRIENNAKH/Thesis/data/2020_06_09_abstract_tokens/' + doc_id + '.npy')
    return model.infer_vector(loaded)

def calculate_accuracy(model, triplet):
    '''Calculates accuracy of one given triplet using two formulas.
    [ab > ac, ab > bc]. 1= yes, 0 = no.'''
    a_vector = get_vector(model, triplet[0])
    b_vector = get_vector(model, triplet[1])
    c_vector = get_vector(model, triplet[2])
    ab_dist = cosine_distance(a_vector, b_vector)
    ac_dist = cosine_distance(a_vector, c_vector)
    bc_dist = cosine_distance(b_vector, c_vector)
    if ab_dist > ac_dist and ab_dist > bc_dist:
        return [1, 1]
    elif ab_dist > ac_dist and ab_dist < bc_dist:
        return [1, 0]
    elif ab_dist < ac_dist and ab_dist > bc_dist:
        return [0, 1]
    else:
        return [0, 0]
    
def evaluate(model, triplets):
    '''Calculates averaged accuracy of all given triplets.
    [accuracy that ab is more similar than ac, ab more similar than bc]'''
    accuracies = [0, 0]
    for triplet in triplets:
        accuracy = calculate_accuracy(model, triplet)
        accuracies[0] += accuracy[0]
        accuracies[1] += accuracy[1]
    return [accuracies[0]/len(triplets), accuracies[1]/len(triplets)]

Generate a small set of triplets. 

In [14]:
train_set = filenames[:1000]
triplets = generate_triplets(train_set, 100)
np.save('/Volumes/BRIENNAKH/Thesis/results/2020_06_19_doc2vec_1000_docs/100_triplets.npy', triplets)



Randomly chose Paper A from 1000 eligible papers...
Paper A: 0901.0670
Categories: ['astro-ph.GA']
Randomly chose Paper B from 295 eligible papers...
Paper B: 0901.0915
Categories: ['astro-ph.GA']
Randomly chose Paper C from 704 eligible papers...
Paper C: 0901.3875
Categories: ['astro-ph.EP']
True
Number of triplets: 1

Randomly chose Paper A from 997 eligible papers...
Paper A: 0901.0555
Categories: ['astro-ph.CO']
Randomly chose Paper B from 302 eligible papers...
Paper B: 0901.3108
Categories: ['astro-ph.CO', 'astro-ph.HE']
Randomly chose Paper C from 694 eligible papers...
Paper C: 0901.2702
Categories: ['astro-ph.IM', 'astro-ph.SR']
True
Number of triplets: 2

Randomly chose Paper A from 994 eligible papers...
Paper A: 0901.0198
Categories: ['astro-ph.SR', 'astro-ph.CO', 'astro-ph.GA']
Randomly chose Paper B from 776 eligible papers...
Paper B: 0901.2664
Categories: ['astro-ph.IM', 'astro-ph.CO']
Randomly chose Paper C from 217 eligible papers...
Paper C: 0901.2858
Categories: [

Randomly chose Paper C from 653 eligible papers...
Paper C: 0901.1296
Categories: ['astro-ph.HE']
True
Number of triplets: 26

Randomly chose Paper A from 922 eligible papers...
Paper A: 0902.0305
Categories: ['astro-ph.SR']
Randomly chose Paper B from 269 eligible papers...
Paper B: 0901.1401
Categories: ['astro-ph.GA', 'astro-ph.SR']
Randomly chose Paper C from 652 eligible papers...
Paper C: 0901.1049
Categories: ['astro-ph.IM', 'astro-ph.CO']
True
Number of triplets: 27

Randomly chose Paper A from 919 eligible papers...
Paper A: 0902.0577
Categories: ['astro-ph.CO']
Randomly chose Paper B from 278 eligible papers...
Paper B: 0901.2584
Categories: ['astro-ph.CO']
Randomly chose Paper C from 640 eligible papers...
Paper C: 0901.1225
Categories: ['astro-ph.HE']
True
Number of triplets: 28

Randomly chose Paper A from 916 eligible papers...
Paper A: 0901.3321
Categories: ['astro-ph.GA']
Randomly chose Paper B from 266 eligible papers...
Paper B: 0901.1579
Categories: ['astro-ph.GA', '

Randomly chose Paper B from 248 eligible papers...
Paper B: 0901.4116
Categories: ['astro-ph.GA']
Randomly chose Paper C from 598 eligible papers...
Paper C: 0901.1691
Categories: ['astro-ph.SR']
True
Number of triplets: 52

Randomly chose Paper A from 844 eligible papers...
Paper A: 0901.4258
Categories: ['astro-ph.HE']
Randomly chose Paper B from 157 eligible papers...
Paper B: 0901.1566
Categories: ['astro-ph.HE', 'astro-ph.GA']
Randomly chose Paper C from 686 eligible papers...
Paper C: 0901.2458
Categories: ['astro-ph.GA', 'astro-ph.CO']
True
Number of triplets: 53

Randomly chose Paper A from 841 eligible papers...
Paper A: 0902.0011
Categories: ['astro-ph.GA', 'astro-ph.CO']
Randomly chose Paper B from 437 eligible papers...
Paper B: 0901.4046
Categories: ['astro-ph.GA']
Randomly chose Paper C from 403 eligible papers...
Paper C: 0901.0202
Categories: ['astro-ph.SR']
True
Number of triplets: 54

Randomly chose Paper A from 838 eligible papers...
Paper A: 0902.0036
Categories: ['

Randomly chose Paper B from 275 eligible papers...
Paper B: 0902.0384
Categories: ['astro-ph.GA', 'astro-ph.CO', 'astro-ph.SR']
Randomly chose Paper C from 493 eligible papers...
Paper C: 0901.4627
Categories: ['astro-ph.CO']
True
Number of triplets: 78

Randomly chose Paper A from 766 eligible papers...
Paper A: 0901.4284
Categories: ['astro-ph.GA']
Randomly chose Paper B from 221 eligible papers...
Paper B: 0901.1089
Categories: ['astro-ph.GA']
Randomly chose Paper C from 544 eligible papers...
Paper C: 0901.0632
Categories: ['astro-ph.CO']
True
Number of triplets: 79

Randomly chose Paper A from 763 eligible papers...
Paper A: 0902.0358
Categories: ['astro-ph.IM']
Randomly chose Paper B from 52 eligible papers...
Paper B: 0901.4684
Categories: ['astro-ph.IM', 'astro-ph.GA']
Randomly chose Paper C from 710 eligible papers...
Paper C: 0901.3593
Categories: ['astro-ph.SR', 'astro-ph.EP']
True
Number of triplets: 80

Randomly chose Paper A from 760 eligible papers...
Paper A: 0902.0376


[0.46, 0.46]

In [16]:
my_scorer()

NameError: name 'mini_triplets' is not defined

Use the same set of triplets for every evaluation.

In [None]:
evaluate(triplets)

In [None]:
len(triplets)

Are the inferred vectors close to the actual ones?

In [None]:
doc_id = random.choice(list(model.docvecs.doctags))
doc = np.load('/Volumes/BRIENNAKH/Thesis/data/2020_06_09_abstract_tokens/' + doc_id + '.npy')
inferred_docvec = model.infer_vector(doc)
print(doc_id)
print('%s:\n %s' % (model, model.docvecs.most_similar([inferred_docvec], topn=5)))

In [None]:
analogy_results = model.wv.evaluate_word_analogies('/Volumes/BRIENNAKH/Thesis/data/2020_06_12_analogies/analogies.txt')
analogy_results[1][0]

In [None]:
print('Words in model: ' + str(len(model.wv.vocab.keys())))

In [None]:
model.wv.most_similar('NASA')