# doc2vec

In [55]:
import numpy as np
import glob
import os
import gensim
import multiprocessing as mp
from timeit import default_timer as timer
from datetime import timedelta
import pandas as pd
import random
from sklearn.model_selection import GridSearchCV
from scipy.spatial.distance import cosine as cosine_distance
import re

import logging
logging.basicConfig(level=logging.INFO)

In [98]:
acceptable_categories = ['astro-ph.SR',
                         'astro-ph.CO',
                         'astro-ph.GA',
                         'astro-ph.HE',
                         'astro-ph.EP',
                         'astro-ph.IM']

Collect all filenames.

In [6]:
filenames = np.array(glob.glob('/Volumes/BRIENNAKH/Thesis/data/2020_06_12_abstract_tokens/numtoken/*.npy'))
print('Number of documents: ' + str(len(filenames)))

Number of documents: 139878


Write the iterator that will load and then close each document, one at a time. 

In [18]:
class DocumentIterator(object):
    def load_documents(self):
        for filename in self.filenames: 
            loaded_file = np.load(filename)
            tag = os.path.splitext(os.path.basename(filename))[0]
            abstract = gensim.models.doc2vec.TaggedDocument(words=loaded_file, tags=[tag])
            try:
                yield abstract
            except Exception as e:
                print('Error!' + str(e))
            finally:
                del loaded_file
                # print('Closed ' + tag)
                
    def __init__(self, filenames):
        self.filenames = filenames
        #self.documents = self.load_documents()

    def __iter__(self):
        self.documents = self.load_documents() # Reset the iterator
        return self
    
    def __next__(self):
        abstract = next(self.documents)
        return abstract

## Write GridSearch code, troubleshooting on mini corpus

Confirm mini corpus trains quickly.

In [97]:
mini_corpus = DocumentIterator(filenames[:1000])
model = gensim.models.doc2vec.Doc2Vec(vector_size=100,
                                      alpha=0.025,
                                      min_alpha=0.001, 
                                      min_count=1,
                                      workers=mp.cpu_count(),
                                      dm=1,
                                      epochs=10)
model.build_vocab(mini_corpus)
model.train(mini_corpus, total_examples=model.corpus_count, epochs=model.epochs)

INFO:gensim.models.doc2vec:collecting all words and their counts
INFO:gensim.models.doc2vec:PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
INFO:gensim.models.doc2vec:collected 9670 word types and 1000 unique tags from a corpus of 1000 examples and 187912 words
INFO:gensim.models.word2vec:Loading a fresh vocabulary
INFO:gensim.models.word2vec:effective_min_count=1 retains 9670 unique words (100% of original 9670, drops 0)
INFO:gensim.models.word2vec:effective_min_count=1 leaves 187912 word corpus (100% of original 187912, drops 0)
INFO:gensim.models.word2vec:deleting the raw counts dictionary of 9670 items
INFO:gensim.models.word2vec:sample=0.001 downsamples 31 most-common words
INFO:gensim.models.word2vec:downsampling leaves estimated 141930 word corpus (75.5% of prior 187912)
INFO:gensim.models.base_any2vec:estimated required memory for 9670 words and 100 dimensions: 13171000 bytes
INFO:gensim.models.word2vec:resetting layer weights
INFO:gensim.models.base_any2

Select parameters to optimize. 

- dm 
    - 0: "distributed bag of words" (PV-DBOW)
    - 1: "distributed memory" (PV-DM)
- hs 
    - 0: "hierarchical softmax" used for model training
    - 1: negative sampling will be used for model training (if negative is non-zero too)
    
"By following a grid-search strategy, we optimized six parameters to train more than 1,900 models."

In [90]:
parameters = {
    'dm': (0, 1),
    'vector_size': (100, 300)
}

In [None]:
class Doc2VecModel(BaseEstimator):
    '''The Estimator that will be used for GridSearch.
    In __init__ we take parameters for a specific model,
    then the GridSearch will call fit on this model.'''
    
    def __init__(self, dm=1, size=1, window=1):
        self.d2v_model = None
        self.size = size
        self.window = window
        self.dm = dm

    def fit(self, raw_documents, y=None):
        '''y = None cuz this is unsupervised training.'''
        # Initialize model
        self.d2v_model = Doc2Vec(size=self.size, window=self.window, dm=self.dm, iter=5, alpha=0.025, min_alpha=0.001)
        # Tag docs
        tagged_documents = []
        for index, row in raw_documents.iteritems():
            tag = '{}_{}'.format("type", index)
            tokens = row.split()
            tagged_documents.append(TaggedDocument(words=tokens, tags=[tag]))
        # Build vocabulary
        self.d2v_model.build_vocab(tagged_documents)
        # Train model
        self.d2v_model.train(tagged_documents, total_examples=len(tagged_documents), epochs=self.d2v_model.iter)
        return self

    def transform(self, raw_documents):
        X = []
        for index, row in raw_documents.iteritems():
            X.append(self.d2v_model.infer_vector(row))
        X = pd.DataFrame(X, index=raw_documents.index)
        return X

    def fit_transform(self, raw_documents, y=None):
        self.fit(raw_documents)
        return self.transform(raw_documents)
    

def my_scorer():
    

Set up GridSearch. Helpful [tutorial](https://medium.com/datadriveninvestor/an-introduction-to-grid-search-ff57adcc0998).

In [91]:
gsc = GridSearchCV(estimator=SVR(kernel='rbf'),
                   param_grid=parameters,
                   cv=5, 
                   scoring=my_scorer, 
                   verbose=0, 
                   n_jobs=-1)

gsc.fit(x, y)
gsc.

Train the model. Do I need to include seed? Ask this question.

In [22]:
docs = DocumentIterator(filenames)
model = gensim.models.doc2vec.Doc2Vec(vector_size=300,
                               alpha=0.025,
                               min_alpha=0.001, 
                               min_count=1,
                workers=mp.cpu_count(),
                dm=1,
                epochs=10)

# Build vocabulary
start = timer()
model.build_vocab(docs)
end = timer()
print('Built vocabulary!')
print ("Execution time HH:MM:SS:",timedelta(seconds=end-start))

# Train the model
start = timer()
model.train(docs, total_examples=model.corpus_count, epochs=model.epochs)
model.save('/Volumes/BRIENNAKH/Thesis/results/2020_06_13_doc2vec/d2v.model')
print('Model Saved!')
end = timer()
print ("Execution time HH:MM:SS:",timedelta(seconds=end-start))

INFO:gensim.models.doc2vec:collecting all words and their counts
INFO:gensim.models.doc2vec:PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
INFO:gensim.models.doc2vec:PROGRESS: at example #10000, processed 1916218 words (160964/s), 25519 word types, 10000 tags
INFO:gensim.models.doc2vec:PROGRESS: at example #20000, processed 3907934 words (130450/s), 34596 word types, 20000 tags
INFO:gensim.models.doc2vec:PROGRESS: at example #30000, processed 5907800 words (107335/s), 41480 word types, 30000 tags
INFO:gensim.models.doc2vec:PROGRESS: at example #40000, processed 7932845 words (104353/s), 47431 word types, 40000 tags
INFO:gensim.models.doc2vec:PROGRESS: at example #50000, processed 9971567 words (115752/s), 52608 word types, 50000 tags
INFO:gensim.models.doc2vec:PROGRESS: at example #60000, processed 12030453 words (109059/s), 57253 word types, 60000 tags
INFO:gensim.models.doc2vec:PROGRESS: at example #70000, processed 14122464 words (121748/s), 61703 word types,

Built vocabulary!
Execution time HH:MM:SS: 0:04:20.797565


INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 0.49% examples, 95935 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 1.05% examples, 102054 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 1.69% examples, 109040 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 2.31% examples, 111962 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 2.82% examples, 110299 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 3.53% examples, 114072 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 4.20% examples, 116685 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 4.87% examples, 118889 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 5.53% examples, 120382 words/s, in_qsize 0, out_qsize 0
IN

INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 47.15% examples, 131215 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 47.87% examples, 131519 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 48.58% examples, 131540 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 49.28% examples, 131883 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 49.99% examples, 132168 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 50.74% examples, 132495 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 51.43% examples, 132671 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 52.13% examples, 132921 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 52.85% examples, 133218 words/s, in_qsize 0, out_

INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 99.84% examples, 144297 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 3 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 2 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 1 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 0 more threads
INFO:gensim.models.base_any2vec:EPOCH - 1 : training on 28915028 raw words (22150756 effective words) took 153.5s, 144342 effective words/s
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 0.79% examples, 155587 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 1.62% examples, 158312 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 2.34% examples, 154914 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 

INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 49.79% examples, 150778 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 50.54% examples, 150916 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 51.29% examples, 151031 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 52.00% examples, 151096 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 52.71% examples, 151196 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 53.45% examples, 151316 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 54.21% examples, 151523 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 54.87% examples, 151505 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 55.56% examples, 151589 words/s, in_qsize 0, out_

INFO:gensim.models.base_any2vec:EPOCH - 2 : training on 28915028 raw words (22147510 effective words) took 143.0s, 154893 effective words/s
INFO:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 0.79% examples, 155672 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 1.62% examples, 158431 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 2.41% examples, 158986 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 3.23% examples, 159826 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 3.90% examples, 154082 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 4.69% examples, 154509 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 5.14% examples, 144070 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 5.78% examples, 142982 words

INFO:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 53.18% examples, 150742 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 53.92% examples, 150937 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 54.64% examples, 151119 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 55.30% examples, 151086 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 56.06% examples, 151306 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 56.78% examples, 151363 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 57.53% examples, 151462 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 58.27% examples, 151563 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 58.99% examples, 151603 words/s, in_qsize 0, out_

INFO:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 3.19% examples, 159451 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 3.98% examples, 157753 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 4.80% examples, 157957 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 5.60% examples, 158429 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 6.40% examples, 158635 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 7.23% examples, 159712 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 8.02% examples, 159424 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 8.76% examples, 157942 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 9.38% examples, 154816 words/s, in_qsize 0, out_qsize 0
I

INFO:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 57.05% examples, 154492 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 57.77% examples, 154538 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 58.48% examples, 154536 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 59.22% examples, 154641 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 59.93% examples, 154692 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 60.54% examples, 154390 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 61.08% examples, 153854 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 61.77% examples, 153848 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 4 - PROGRESS: at 62.53% examples, 154008 words/s, in_qsize 0, out_

INFO:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 0.60% examples, 118894 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 1.24% examples, 122947 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 1.90% examples, 125430 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 2.52% examples, 125415 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 3.16% examples, 125120 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 3.82% examples, 125350 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 4.46% examples, 125091 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 5.10% examples, 125299 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 5.75% examples, 126106 words/s, in_qsize 0, out_qsize 0
I

INFO:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 45.09% examples, 124867 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 45.70% examples, 124936 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 46.32% examples, 125037 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 46.91% examples, 125133 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 47.59% examples, 125411 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 48.27% examples, 125670 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 48.79% examples, 125493 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 49.38% examples, 125609 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 49.99% examples, 125728 words/s, in_qsize 0, out_

INFO:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 90.58% examples, 130097 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 91.20% examples, 130184 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 91.84% examples, 130216 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 92.52% examples, 130318 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 93.11% examples, 130353 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 93.77% examples, 130442 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 94.42% examples, 130534 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 95.01% examples, 130562 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 95.66% examples, 130640 words/s, in_qsize 0, out_

INFO:gensim.models.base_any2vec:EPOCH 6 - PROGRESS: at 33.39% examples, 127690 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 6 - PROGRESS: at 34.01% examples, 127748 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 6 - PROGRESS: at 34.65% examples, 127847 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 6 - PROGRESS: at 35.24% examples, 127837 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 6 - PROGRESS: at 35.82% examples, 127823 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 6 - PROGRESS: at 36.45% examples, 127866 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 6 - PROGRESS: at 36.99% examples, 127653 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 6 - PROGRESS: at 37.56% examples, 127664 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 6 - PROGRESS: at 38.17% examples, 127771 words/s, in_qsize 0, out_

INFO:gensim.models.base_any2vec:EPOCH 6 - PROGRESS: at 77.34% examples, 129052 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 6 - PROGRESS: at 77.88% examples, 128970 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 6 - PROGRESS: at 78.49% examples, 129012 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 6 - PROGRESS: at 79.11% examples, 129039 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 6 - PROGRESS: at 79.73% examples, 129071 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 6 - PROGRESS: at 80.37% examples, 129143 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 6 - PROGRESS: at 80.91% examples, 129049 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 6 - PROGRESS: at 81.47% examples, 128986 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 6 - PROGRESS: at 82.09% examples, 129042 words/s, in_qsize 0, out_

INFO:gensim.models.base_any2vec:EPOCH 7 - PROGRESS: at 19.17% examples, 124992 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 7 - PROGRESS: at 19.79% examples, 125245 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 7 - PROGRESS: at 20.41% examples, 125365 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 7 - PROGRESS: at 20.92% examples, 124829 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 7 - PROGRESS: at 21.57% examples, 125197 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 7 - PROGRESS: at 22.23% examples, 125480 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 7 - PROGRESS: at 22.85% examples, 125533 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 7 - PROGRESS: at 23.50% examples, 125599 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 7 - PROGRESS: at 24.15% examples, 125787 words/s, in_qsize 0, out_

INFO:gensim.models.base_any2vec:EPOCH 7 - PROGRESS: at 63.39% examples, 126697 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 7 - PROGRESS: at 63.99% examples, 126751 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 7 - PROGRESS: at 64.57% examples, 126835 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 7 - PROGRESS: at 65.19% examples, 126938 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 7 - PROGRESS: at 65.78% examples, 127001 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 7 - PROGRESS: at 66.32% examples, 126917 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 7 - PROGRESS: at 66.95% examples, 127032 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 7 - PROGRESS: at 67.55% examples, 127075 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 7 - PROGRESS: at 68.16% examples, 127126 words/s, in_qsize 0, out_

INFO:gensim.models.base_any2vec:EPOCH 8 - PROGRESS: at 5.06% examples, 123906 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 8 - PROGRESS: at 5.60% examples, 122604 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 8 - PROGRESS: at 6.22% examples, 122983 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 8 - PROGRESS: at 6.87% examples, 123687 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 8 - PROGRESS: at 7.48% examples, 123996 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 8 - PROGRESS: at 8.10% examples, 123466 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 8 - PROGRESS: at 8.69% examples, 122802 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 8 - PROGRESS: at 9.31% examples, 122781 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 8 - PROGRESS: at 9.93% examples, 123449 words/s, in_qsize 0, out_qsize 0
I

INFO:gensim.models.base_any2vec:EPOCH 8 - PROGRESS: at 50.16% examples, 127934 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 8 - PROGRESS: at 50.78% examples, 127981 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 8 - PROGRESS: at 51.39% examples, 127999 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 8 - PROGRESS: at 52.00% examples, 128090 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 8 - PROGRESS: at 52.61% examples, 128159 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 8 - PROGRESS: at 53.21% examples, 128226 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 8 - PROGRESS: at 53.79% examples, 128234 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 8 - PROGRESS: at 54.38% examples, 128264 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 8 - PROGRESS: at 54.94% examples, 128221 words/s, in_qsize 0, out_

INFO:gensim.models.base_any2vec:EPOCH 8 - PROGRESS: at 94.95% examples, 130484 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 8 - PROGRESS: at 95.46% examples, 130341 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 8 - PROGRESS: at 96.07% examples, 130369 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 8 - PROGRESS: at 96.74% examples, 130466 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 8 - PROGRESS: at 97.36% examples, 130519 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 8 - PROGRESS: at 97.83% examples, 130359 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 8 - PROGRESS: at 98.38% examples, 130334 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 8 - PROGRESS: at 98.98% examples, 130357 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 8 - PROGRESS: at 99.54% examples, 130346 words/s, in_qsize 0, out_

INFO:gensim.models.base_any2vec:EPOCH 9 - PROGRESS: at 37.15% examples, 126047 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 9 - PROGRESS: at 37.79% examples, 126248 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 9 - PROGRESS: at 38.42% examples, 126368 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 9 - PROGRESS: at 39.01% examples, 126372 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 9 - PROGRESS: at 39.64% examples, 126553 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 9 - PROGRESS: at 40.24% examples, 126551 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 9 - PROGRESS: at 40.81% examples, 126435 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 9 - PROGRESS: at 41.46% examples, 126618 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 9 - PROGRESS: at 42.07% examples, 126727 words/s, in_qsize 0, out_

INFO:gensim.models.base_any2vec:EPOCH 9 - PROGRESS: at 82.48% examples, 130512 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 9 - PROGRESS: at 83.09% examples, 130593 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 9 - PROGRESS: at 83.74% examples, 130689 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 9 - PROGRESS: at 84.37% examples, 130768 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 9 - PROGRESS: at 85.00% examples, 130852 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 9 - PROGRESS: at 85.68% examples, 130957 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 9 - PROGRESS: at 86.33% examples, 131059 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 9 - PROGRESS: at 86.97% examples, 131132 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 9 - PROGRESS: at 87.61% examples, 131118 words/s, in_qsize 0, out_

INFO:gensim.models.base_any2vec:EPOCH 10 - PROGRESS: at 25.22% examples, 125573 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 10 - PROGRESS: at 25.85% examples, 125749 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 10 - PROGRESS: at 26.46% examples, 125945 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 10 - PROGRESS: at 27.10% examples, 126171 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 10 - PROGRESS: at 27.75% examples, 126420 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 10 - PROGRESS: at 28.38% examples, 126640 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 10 - PROGRESS: at 29.02% examples, 126928 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 10 - PROGRESS: at 29.64% examples, 127006 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 10 - PROGRESS: at 30.28% examples, 127275 words/s, in_qsiz

INFO:gensim.models.base_any2vec:EPOCH 10 - PROGRESS: at 73.28% examples, 136576 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 10 - PROGRESS: at 73.96% examples, 136715 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 10 - PROGRESS: at 74.65% examples, 136895 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 10 - PROGRESS: at 75.28% examples, 136952 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 10 - PROGRESS: at 76.02% examples, 137159 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 10 - PROGRESS: at 76.75% examples, 137382 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 10 - PROGRESS: at 77.31% examples, 137265 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 10 - PROGRESS: at 77.98% examples, 137385 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 10 - PROGRESS: at 78.77% examples, 137673 words/s, in_qsiz

FileNotFoundError: [Errno 2] No such file or directory: '/Volumes/BRIENNAKH/Thesis/results/2020_06_13_doc2vec/d2v.model.trainables.syn1neg.npy'

In [24]:
model.save('/Volumes/BRIENNAKH/Thesis/results/2020_06_13_doc2vec/d2v.model')
print('Model Saved!')
end = timer()
print ("Execution time HH:MM:SS:",timedelta(seconds=end-start))

INFO:gensim.utils:saving Doc2Vec object under /Volumes/BRIENNAKH/Thesis/results/2020_06_13_doc2vec/d2v.model, separately None
INFO:gensim.utils:storing np array 'syn1neg' to /Volumes/BRIENNAKH/Thesis/results/2020_06_13_doc2vec/d2v.model.trainables.syn1neg.npy
INFO:gensim.utils:storing np array 'vectors' to /Volumes/BRIENNAKH/Thesis/results/2020_06_13_doc2vec/d2v.model.wv.vectors.npy
INFO:gensim.utils:storing np array 'vectors_docs' to /Volumes/BRIENNAKH/Thesis/results/2020_06_13_doc2vec/d2v.model.docvecs.vectors_docs.npy
INFO:gensim.utils:saved /Volumes/BRIENNAKH/Thesis/results/2020_06_13_doc2vec/d2v.model


Model Saved!
Execution time HH:MM:SS: 1:40:36.579304


## Evaluate

In [183]:
metadata

Unnamed: 0,identifier,filename,spec,title,datestamp,created,updated,authors,categories,journal,doi,abstract,comments,date_retrieved,filename_parsed
0,oai:arXiv.org:0704.0009,0704.0009,physics:astro-ph,"The Spitzer c2d Survey of Large, Nearby, Inste...",2010-03-18,2007-04-02,,"Paul Harvey, Bruno Merin, Tracy L. Huard, Luis...",astro-ph,"Astrophys.J.663:1149-1173,2007",10.1086/518646,We discuss the results from the combined IRA...,,2020-03-07 02:10:36.300743,0704.0009
1,oai:arXiv.org:0704.0017,0704.0017,physics:astro-ph,Spectroscopic Observations of the Intermediate...,2009-06-23,2007-03-31,,"Nceba Mhlahlo, David H. Buckley, Vikram S. Dhi...",astro-ph,"Mon.Not.Roy.Astron.Soc.378:211-220,2007",10.1111/j.1365-2966.2007.11762.x,Results from spectroscopic observations of t...,,2020-03-07 02:10:36.300743,0704.0017
2,oai:arXiv.org:0704.0023,0704.0023,physics:astro-ph,ALMA as the ideal probe of the solar chromosphere,2009-06-23,2007-03-31,,"M. A. Loukitcheva, S. K. Solanki, S. White",astro-ph,"Astrophys.Space Sci.313:197-200,2008",10.1007/s10509-007-9626-1,"The very nature of the solar chromosphere, i...",,2020-03-07 02:10:36.300743,0704.0023
3,oai:arXiv.org:0704.0044,0704.0044,physics:astro-ph,Astrophysical gyrokinetics: kinetic and fluid ...,2015-05-13,2007-03-31,2009-05-09,"A. A. Schekochihin, S. C. Cowley, W. Dorland, ...",astro-ph nlin.CD physics.plasm-ph physics.spac...,"ApJS 182, 310 (2009)",10.1088/0067-0049/182/1/310,We present a theoretical framework for plasm...,,2020-03-07 02:10:36.300743,0704.0044
4,oai:arXiv.org:0704.0048,0704.0048,physics:astro-ph,Inference on white dwarf binary systems using ...,2008-11-26,2007-03-31,2007-04-03,"Alexander Stroeer, John Veitch, Christian Roev...",gr-qc astro-ph,"Class.Quant.Grav.24:S541-S550,2007",10.1088/0264-9381/24/19/S17,We report on the analysis of selected single...,,2020-03-07 02:10:36.300743,0704.0048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267789,oai:arXiv.org:quant-ph/9903043,quant-ph/9903043,physics:astro-ph,A Possible Anisotropy in Blackbody Radiation V...,2007-05-23,1999-03-12,1999-06-07,T K Rai Dastidar,quant-ph astro-ph hep-th,Mod.Phys.Lett. A14 (1999) 1193-1198,,A non-local gauge symmetry of a complex scal...,,2020-03-07 02:10:36.300743,quant-ph9903043
267790,oai:arXiv.org:quant-ph/9903053,quant-ph/9903053,physics:astro-ph,Father Time. I. Does the Cosmic Microwave Back...,2009-10-31,1999-03-15,1999-10-21,T K Rai Dastidar,quant-ph astro-ph hep-th,Mod.Phys.Lett. A14 (1999) 2499-2505,10.1142/S0217732399002601,The existence of a non-thermodynamic arrow o...,,2020-03-07 02:10:36.300743,quant-ph9903053
267791,oai:arXiv.org:quant-ph/9907088,quant-ph/9907088,physics:astro-ph,On Bures fidelity of displaced squeezed therma...,2008-12-18,1999-07-27,,"Xiang-Bin Wang, C. H. Oh, L. C. Kwek",quant-ph astro-ph,"Phys.Rev.A58:4186,1998",10.1103/PhysRevA.58.4186,Fidelity plays a key role in quantum informa...,,2020-03-07 02:10:36.300743,quant-ph9907088
267792,oai:arXiv.org:solv-int/9404002,solv-int/9404002,physics:astro-ph,Dynamical Systems Accepting the Normal Shift,2008-02-03,1993-04-07,,R. A. Sharipov,solv-int alg-geom astro-ph gr-qc hep-th math.A...,,,Newtonian dynamical systems accepting the no...,,2020-03-07 02:10:36.300743,solv-int9404002


In [47]:
metadata = pd.read_csv('/Volumes/BRIENNAKH/Thesis/data/2020_03_06_arxiv_metadata_astroph/arxiv_metadata_astroph.csv',
                       dtype={'filename': str,
                              'filename_parsed': str,
                              'identifier': str,
                              'updated': str,
                              'doi': str}, 
                       parse_dates=['date_retrieved'])

data = metadata[metadata['filename_parsed'].isin([os.path.splitext(os.path.basename(x))[0] for x in filenames])]
data

Unnamed: 0,identifier,filename,spec,title,datestamp,created,updated,authors,categories,journal,doi,abstract,comments,date_retrieved,filename_parsed
19942,oai:arXiv.org:0901.0104,0901.0104,physics:astro-ph,"PIERNIK mhd code - a multi-fluid, non-ideal ex...",2013-07-25,2009-01-02,,"Michał Hanasz, Kacper Kowalik, Dominik Wóltańs...",astro-ph.GA,,10.1051/eas/1256060,"We present a new multi-fluid, grid MHD code ...",,2020-03-07 02:10:36.300743,0901.0104
19947,oai:arXiv.org:0901.0133,0901.0133,physics:astro-ph,A Limit on the Polarized Anomalous Microwave E...,2010-01-15,2009-01-02,2009-02-12,"B. S. Mason, T. Robishaw, C. Heiles, D. Finkbe...",astro-ph.GA,"Astrophys.J.697:1187-1193,2009",10.1088/0004-637X/697/2/1187,The dark cloud Lynds 1622 is one of a few sp...,,2020-03-07 02:10:36.300743,0901.0133
19951,oai:arXiv.org:0901.0167,0901.0167,physics:astro-ph,Blazar nuclei in radio-loud narrow-line Seyfer...,2009-11-13,2009-01-01,,"L. Foschini, L. Maraschi, F. Tavecchio, G. Ghi...",astro-ph.HE,,10.1016/j.asr.2008.12.021,It has been suggested that some radio-loud n...,,2020-03-07 02:10:36.300743,0901.0167
19952,oai:arXiv.org:0901.0173,0901.0173,physics:astro-ph,Non-Minimal Quintessence With Nearly Flat Pote...,2010-04-14,2009-01-01,,"Anjan A Sen, Gaveshna Gupta, Sudipta Das",astro-ph.CO,"JCAP 0909:027,2009",10.1088/1475-7516/2009/09/027,We consider Brans-Dicke type nonminimally co...,,2020-03-07 02:10:36.300743,0901.0173
19953,oai:arXiv.org:0901.0178,0901.0178,physics:astro-ph,Secular evolution and the assembly of bulges,2009-01-21,2009-01-01,,F. Combes,astro-ph.GA,,,"Bulges are of different types, morphologies ...",,2020-03-07 02:10:36.300743,0901.0178
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179551,oai:arXiv.org:2001.00099,2001.00099,physics:astro-ph,Galaxies hosting an AGN: a view from the CALIF...,2020-01-22,2019-12-31,,"Eduardo A. D. Lacerda, Sebastián F. Sánchez, R...",astro-ph.GA,,10.1093/mnras/staa008,We study the presence of optically-selected ...,,2020-03-07 02:10:36.300743,2001.00099
179552,oai:arXiv.org:2001.00125,2001.00125,physics:astro-ph,Size and Shape Constraints of (486958) Arrokot...,2020-03-04,2019-12-31,,"Marc W. Buie, Simon B. Porter, Peter Tamblyn, ...",astro-ph.EP astro-ph.IM,,10.3847/1538-3881/ab6ced,We present the results from four stellar occ...,,2020-03-07 02:10:36.300743,2001.00125
179553,oai:arXiv.org:2001.00133,2001.00133,physics:astro-ph,Clumpy AGN outflows due to thermal instability,2020-01-03,2019-12-31,,"Randall Dannen, Daniel Proga, Tim Waters, Serg...",astro-ph.GA,,,One of the main mechanisms that could drive ...,,2020-03-07 02:10:36.300743,2001.00133
179724,oai:arXiv.org:2001.01763,2001.01763,physics:astro-ph,A study of the secondary cosmic $\gamma$ -ray ...,2020-01-08,2019-12-27,,"S. Roy, S. Biswas, S. Das, S. K. Ghosh, S. Raha",astro-ph.HE astro-ph.IM,,,We present the results from the measurement ...,,2020-03-07 02:10:36.300743,2001.01763


## Generate triplets

Check Dai, Olah, and Le's triplets to see how they are generated. B has any category from A, C shares no categories with A but may share categories with B. Follows what they said in their paper.

In [200]:
with open('/Volumes/BRIENNAKH/Thesis/data/2020_04_24_dai_olah_le_triplets/arxiv_2014_09_27_examples.txt', 'r') as file:
    daiolahle_triplets = [(line[0], line[1], line[2]) for line in (line.replace('\n', '').split(' ') for line in file.readlines())]

daiolahle_categories = []
for triplet in daiolahle_triplets:
    doc1_id = re.search(r'\w*-?\w*[/.]\d*$', triplet[0])[0].replace('/', '')
    doc2_id = re.search(r'\w*-?\w*[/.]\d*$', triplet[1])[0].replace('/', '')
    doc3_id = re.search(r'\w*-?\w*[/.]\d*$', triplet[2])[0].replace('/', '')
    cats1 = metadata[metadata['filename_parsed'] == doc1_id]['categories']
    cats2 = metadata[metadata['filename_parsed'] == doc2_id]['categories']
    cats3 = metadata[metadata['filename_parsed'] == doc3_id]['categories']
    if any(cats1) and any(cats2) and any(cats3):
        print((cats1.iloc[0], cats2.iloc[0], cats3.iloc[0]))
        # Check if C is in A or B
        print(any(np.isin(cats3.iloc[0].split(' '), cats1.iloc[0].split(' '))) or any(np.isin(cats3.iloc[0].split(' '), cats2.iloc[0].split(' '))))

('astro-ph.SR', 'astro-ph.HE astro-ph.SR gr-qc nucl-th', 'astro-ph.GA')
False
('astro-ph.SR', 'astro-ph.EP astro-ph.SR', 'math-ph astro-ph hep-th math.AG math.MP')
False
('astro-ph.HE', 'astro-ph.HE', 'astro-ph')
False
('astro-ph.GA', 'astro-ph.GA', 'astro-ph')
False
('astro-ph.SR', 'astro-ph.SR', 'astro-ph')
False
('astro-ph.SR', 'astro-ph.SR', 'astro-ph')
False
('astro-ph.HE', 'astro-ph.HE astro-ph.CO', 'astro-ph.CO')
True
('astro-ph', 'quant-ph astro-ph', 'astro-ph.CO')
False
('astro-ph', 'physics.plasm-ph astro-ph', 'astro-ph.HE')
False
('astro-ph', 'quant-ph astro-ph hep-th', 'astro-ph.SR')
False
('astro-ph.CO', 'astro-ph.CO', 'astro-ph.HE')
False
('astro-ph', 'physics.flu-dyn astro-ph physics.ao-ph', 'astro-ph.IM cs.DL')
False
('astro-ph.CO', 'astro-ph.SR astro-ph.CO astro-ph.HE', 'astro-ph')
False
('astro-ph.CO gr-qc hep-ph hep-th', 'hep-th astro-ph.CO hep-ph', 'astro-ph.SR')
False
('astro-ph', 'quant-ph astro-ph nlin.CD', 'astro-ph.IM')
False
('astro-ph.GA', 'astro-ph.GA', 'ast

('astro-ph.SR', 'astro-ph.SR', 'astro-ph.CO hep-ex physics.ins-det')
False
('astro-ph.SR', 'astro-ph.SR', 'gr-qc astro-ph.CO hep-th math-ph math.MP')
False
('astro-ph', 'physics.plasm-ph astro-ph math-ph math.MP', 'astro-ph.EP astro-ph.SR')
False
('astro-ph.CO', 'gr-qc astro-ph.CO astro-ph.HE', 'astro-ph.GA')
False
('astro-ph', 'physics.geo-ph astro-ph physics.plasm-ph', 'astro-ph.CO')
False
('astro-ph.EP', 'astro-ph.EP', 'astro-ph.HE')
False
('astro-ph', 'physics.data-an astro-ph cond-mat', 'astro-ph.CO')
False
('astro-ph.HE', 'gr-qc astro-ph.HE', 'hep-th astro-ph gr-qc')
True
('astro-ph', 'solv-int astro-ph chao-dyn comp-gas nlin.CD nlin.CG nlin.SI', 'astro-ph.GA')
False
('astro-ph.HE', 'physics.flu-dyn astro-ph.HE', 'astro-ph.CO')
False
('astro-ph.IM', 'physics.optics astro-ph astro-ph.IM', 'astro-ph.SR')
False
('astro-ph', 'physics.atom-ph astro-ph gr-qc hep-ph nucl-th', 'astro-ph.EP')
False
('astro-ph.SR', 'astro-ph.SR', 'cs.DM astro-ph.IM math.OC')
False
('astro-ph.CO', 'astro-ph

('astro-ph', 'quant-ph astro-ph cond-mat.stat-mech gr-qc hep-th math-ph math.MP', 'astro-ph.IM')
False
('astro-ph.CO', 'astro-ph.CO', 'astro-ph')
False
('astro-ph.HE', 'astro-ph.SR astro-ph.CO astro-ph.HE', 'astro-ph.SR')
True
('astro-ph.EP astro-ph.SR', 'astro-ph.EP astro-ph.SR', 'hep-th astro-ph gr-qc hep-ph')
False
('astro-ph.IM', 'physics.optics astro-ph astro-ph.IM', 'astro-ph.CO astro-ph.GA astro-ph.HE')
False
('astro-ph', 'physics.data-an astro-ph', 'astro-ph.HE')
False
('astro-ph.CO', 'hep-ph astro-ph.CO hep-ex', 'astro-ph.GA')
False
('astro-ph.SR', 'astro-ph.SR', 'astro-ph')
False
('astro-ph.EP', 'astro-ph.EP astro-ph.IM', 'astro-ph')
False
('astro-ph.SR', 'astro-ph.SR', 'astro-ph')
False
('astro-ph', 'physics.plasm-ph adap-org astro-ph nlin.AO physics.space-ph', 'astro-ph.HE')
False
('astro-ph hep-ph', 'physics.plasm-ph astro-ph', 'astro-ph.SR')
False
('astro-ph.CO hep-ph hep-th', 'astro-ph.CO gr-qc hep-th', 'astro-ph')
False
('astro-ph.CO', 'astro-ph.CO', 'astro-ph.EP')
Fals

('astro-ph', 'physics.plasm-ph astro-ph', 'astro-ph.EP')
False
('astro-ph.EP astro-ph.SR', 'astro-ph.EP astro-ph.SR', 'astro-ph.GA')
False
('astro-ph.SR physics.space-ph', 'astro-ph.SR physics.flu-dyn physics.plasm-ph physics.space-ph', 'astro-ph.GA astro-ph.HE')
False
('astro-ph', 'physics.ins-det astro-ph nucl-ex', 'astro-ph.CO gr-qc')
False
('astro-ph', 'quant-ph astro-ph cond-mat.stat-mech gr-qc hep-th math-ph math.MP', 'gr-qc astro-ph.CO hep-ph')
True
('astro-ph.EP', 'astro-ph.EP astro-ph.IM astro-ph.SR math.NA', 'astro-ph nlin.CD physics.plasm-ph')
False
('astro-ph.EP', 'astro-ph.EP astro-ph.SR', 'astro-ph.HE astro-ph.CO hep-ph')
False
('astro-ph', 'physics.data-an astro-ph cond-mat.stat-mech physics.comp-ph q-fin.ST', 'astro-ph.CO')
False
('astro-ph.CO', 'astro-ph.GA astro-ph.CO', 'astro-ph gr-qc hep-ph')
False
('astro-ph.CO gr-qc hep-ex', 'hep-ph astro-ph.CO gr-qc', 'astro-ph.SR')
False
('astro-ph.IM astro-ph.EP', 'astro-ph.IM astro-ph.EP', 'hep-th astro-ph.CO hep-ph')
False
('

Try with small set of triplets. 

In [203]:
mini_triplets = [] # where B is chosen based on any of A's categories AND C is chosen on BOTH paper A and B's categories
mini_triplets2 = [] # where B is chosen based on any of A's categories AND C is chosen based on paper A's categories
        
data = metadata[metadata['filename_parsed'].isin([os.path.splitext(os.path.basename(x))[0] for x in filenames[:1000]])]
while len(mini_triplets) < 100:
    try:
        mini_triplets.append(generate_triplet())
        print('Number of triplets: ' + str(len(mini_triplets)))
    except Exception as e: # If we end up with too few choices for C, catch exception and reselect A
        print(e)

print(evaluate_mini_triplets(mini_triplets))


Randomly chose Paper A from 1000 eligible papers...
Paper A: 0901.1213
Categories: ['astro-ph.CO']
Randomly chose Paper B from 302 eligible papers...
Paper B: 0901.3434
Categories: ['astro-ph.CO']
Randomly chose Paper C from 697 eligible papers...
Paper C: 0901.1355
Categories: ['astro-ph.GA']
True
Number of triplets: 1

Randomly chose Paper A from 997 eligible papers...
Paper A: 0901.2122
Categories: ['astro-ph.CO']
Randomly chose Paper B from 300 eligible papers...
Paper B: 0901.4545
Categories: ['astro-ph.CO']
Randomly chose Paper C from 696 eligible papers...
Paper C: 0902.0555
Categories: ['astro-ph.GA']
True
Number of triplets: 2

Randomly chose Paper A from 994 eligible papers...
Paper A: 0901.3792
Categories: ['astro-ph.CO']
Randomly chose Paper B from 298 eligible papers...
Paper B: 0901.0415
Categories: ['astro-ph.GA', 'astro-ph.CO']
Randomly chose Paper C from 695 eligible papers...
Paper C: 0901.0914
Categories: ['astro-ph.GA']
True
Number of triplets: 3

Randomly chose Pa

Randomly chose Paper C from 649 eligible papers...
Paper C: 0901.3281
Categories: ['astro-ph.SR']
True
Number of triplets: 26

Randomly chose Paper A from 922 eligible papers...
Paper A: 0901.2325
Categories: ['astro-ph.SR']
Randomly chose Paper B from 266 eligible papers...
Paper B: 0901.2119
Categories: ['astro-ph.SR']
Randomly chose Paper C from 655 eligible papers...
Paper C: 0901.0989
Categories: ['astro-ph.CO']
True
Number of triplets: 27

Randomly chose Paper A from 919 eligible papers...
Paper A: 0901.2952
Categories: ['astro-ph.IM', 'astro-ph.CO']
Randomly chose Paper B from 321 eligible papers...
Paper B: 0901.1888
Categories: ['astro-ph.CO']
Randomly chose Paper C from 597 eligible papers...
Paper C: 0901.4093
Categories: ['astro-ph.SR', 'astro-ph.EP']
True
Number of triplets: 28

Randomly chose Paper A from 916 eligible papers...
Paper A: 0901.4026
Categories: ['astro-ph.IM', 'astro-ph.HE']
Randomly chose Paper B from 238 eligible papers...
Paper B: 0901.1133
Categories: ['

Randomly chose Paper C from 587 eligible papers...
Paper C: 0901.1593
Categories: ['astro-ph.SR']
True
Number of triplets: 53

Randomly chose Paper A from 841 eligible papers...
Paper A: 0901.3677
Categories: ['astro-ph.SR']
Randomly chose Paper B from 243 eligible papers...
Paper B: 0901.1767
Categories: ['astro-ph.SR']
Randomly chose Paper C from 597 eligible papers...
Paper C: 0901.3269
Categories: ['astro-ph.CO']
True
Number of triplets: 54

Randomly chose Paper A from 838 eligible papers...
Paper A: 0901.0868
Categories: ['astro-ph.CO', 'astro-ph.GA']
Randomly chose Paper B from 443 eligible papers...
Paper B: 0902.0647
Categories: ['astro-ph.GA', 'astro-ph.EP']
Randomly chose Paper C from 394 eligible papers...
Paper C: 0902.0301
Categories: ['astro-ph.SR']
True
Number of triplets: 55

Randomly chose Paper A from 835 eligible papers...
Paper A: 0901.2248
Categories: ['astro-ph.CO']
Randomly chose Paper B from 243 eligible papers...
Paper B: 0901.1941
Categories: ['astro-ph.CO']
R

Randomly chose Paper B from 133 eligible papers...
Paper B: 0901.0254
Categories: ['astro-ph.HE']
Randomly chose Paper C from 632 eligible papers...
Paper C: 0901.0178
Categories: ['astro-ph.GA']
True
Number of triplets: 79

Randomly chose Paper A from 763 eligible papers...
Paper A: 0901.4338
Categories: ['astro-ph.SR', 'astro-ph.CO']
Randomly chose Paper B from 438 eligible papers...
Paper B: 0901.3146
Categories: ['astro-ph.SR', 'astro-ph.IM']
Randomly chose Paper C from 324 eligible papers...
Paper C: 0901.1351
Categories: ['astro-ph.GA']
True
Number of triplets: 80

Randomly chose Paper A from 760 eligible papers...
Paper A: 0901.0820
Categories: ['astro-ph.SR']
Randomly chose Paper B from 217 eligible papers...
Paper B: 0901.0568
Categories: ['astro-ph.SR']
Randomly chose Paper C from 542 eligible papers...
Paper C: 0901.3826
Categories: ['astro-ph.HE', 'astro-ph.GA']
True
Number of triplets: 81

Randomly chose Paper A from 757 eligible papers...
Paper A: 0901.1138
Categories: ['

In [202]:
def generate_triplet():
    '''Generates a triplet by randomly selecting 
        - an anchor paper A
        - a paper B that shares any of its categories with A  
        - a paper C which shares no categories with A
    
    Returns a tuple (A, B, C) where each item is the paper's arXiv ID/filename.'''
    # Select Paper A
    a_id = random.choice(data['filename_parsed'].tolist())
    a = data[data['filename_parsed'] == a_id]
    print('\nRandomly chose Paper A from ' + str((len(data))) + ' eligible papers...')
    print('Paper A: ' + a_id)
    a_categories = [x for x in a['categories'].iloc[0].split(' ') if x in acceptable_categories]
    print('Categories: ' + str(a_categories))
    data.drop(a.index, inplace=True) # Remove A from data
    
    # Select Paper B
    b_set = data[data['categories'].str.split(' ').apply(lambda x: np.any(np.isin(x, a_categories)))]
    b_id = random.choice(b_set['filename_parsed'].tolist())
    b = data[data['filename_parsed'] == b_id]
    print('Randomly chose Paper B from ' + str(len(b_set)) + ' eligible papers...')
    print('Paper B: ' + b_id)
    b_categories = [x for x in b['categories'].iloc[0].split(' ') if x in acceptable_categories]
    print('Categories: ' + str(b_categories))
        
    # Select Paper C
    papers_with_a_categories = data[data['categories'].str.split(' ').apply(lambda x: np.any(np.isin(x, a_categories)))]
    c_set = data.merge(papers_with_a_categories, how='outer', indicator=True).loc[lambda x: x['_merge']=='left_only'] 
    c_id = random.choice(c_set['filename_parsed'].tolist())
    c = data[data['filename_parsed'] == c_id]
    print('Randomly chose Paper C from ' + str(len(c_set)) + ' eligible papers...')
    print('Paper C: ' + c_id)
    c_categories = [x for x in c['categories'].iloc[0].split(' ') if x in acceptable_categories]
    print('Categories: ' + str(c_categories))
    
    print(len(papers_with_a_categories) + len(c_set) == len(data))
    data.drop([b.index[0], c.index[0]], inplace=True)
    return (a_id, b_id, c_id)


def get_vector(doc_id):
    '''Takes a Pandas row for a paper.'''
    loaded = np.load('/Volumes/BRIENNAKH/Thesis/data/2020_06_09_abstract_tokens/' + doc_id + '.npy')
    return model.infer_vector(loaded)


def calculate_accuracy(triplet):
    '''Calculates accuracy of one given triplet using two formulas.
    [ab > ac, ab > bc]. 1= yes, 0 = no.'''
    a_vector = get_vector(triplet[0])
    b_vector = get_vector(triplet[1])
    c_vector = get_vector(triplet[2])
    ab_dist = cosine_distance(a_vector, b_vector)
    ac_dist = cosine_distance(a_vector, c_vector)
    bc_dist = cosine_distance(b_vector, c_vector)
    if ab_dist > ac_dist and ab_dist > bc_dist:
        return [1, 1]
    elif ab_dist > ac_dist and ab_dist < bc_dist:
        return [1, 0]
    elif ab_dist < ac_dist and ab_dist > bc_dist:
        return [0, 1]
    else:
        return [0, 0]
    

def evaluate(triplets):
    '''Calculates averaged accuracy of all given triplets.
    [accuracy that ab is more similar than ac, ab more similar than bc]'''
    accuracies = [0, 0]
    for triplet in mini_triplets:
        accuracy = calculate_accuracy(triplet)
        accuracies[0] += accuracy[0]
        accuracies[1] += accuracy[1]
    return [accuracies[0]/len(triplets), accuracies[1]/len(triplets)]

In [104]:
triplets = []

# Generage 20,000 triplets
while len(triplets) < 20000:
    try: 
        triplet = generate_triplet()
        triplets.append(triplet)
        print('Number of triplets: ' + str(len(triplets)))
    except Exception as e:
        print(e)


Randomly chose Paper A from 37082 eligible papers...
Paper A: 1412.4038
Categories: ['astro-ph.SR']
name 'b_set' is not defined

Randomly chose Paper A from 37081 eligible papers...
Paper A: 1812.04765
Categories: ['astro-ph.CO']
name 'b_set' is not defined

Randomly chose Paper A from 37080 eligible papers...
Paper A: 1405.4344
Categories: ['astro-ph.GA']
name 'b_set' is not defined

Randomly chose Paper A from 37079 eligible papers...
Paper A: 1710.02360
Categories: ['astro-ph.GA']
name 'b_set' is not defined

Randomly chose Paper A from 37078 eligible papers...
Paper A: 1605.01163
Categories: ['astro-ph.SR']
name 'b_set' is not defined

Randomly chose Paper A from 37077 eligible papers...
Paper A: 1511.03304
Categories: ['astro-ph.GA']
name 'b_set' is not defined

Randomly chose Paper A from 37076 eligible papers...
Paper A: 1503.03516
Categories: ['astro-ph.EP']
name 'b_set' is not defined

Randomly chose Paper A from 37075 eligible papers...
Paper A: 1507.00259
Categories: ['astr

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

INFO:root:
Unfortunately, your original traceback can not be constructed.



Traceback (most recent call last):
  File "/Users/briennakh/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3331, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-104-74167dab7499>", line 6, in <module>
    triplet = generate_triplet()
  File "<ipython-input-103-c63dfd9633c4>", line 21, in generate_triplet
    b_set_to_choose_from = data[data['categories'].str.split(' ').apply(lambda x: x[0] == a_categories[0])]
  File "/Users/briennakh/anaconda3/lib/python3.6/site-packages/pandas/core/strings.py", line 1843, in wrapper
    return func(self, *args, **kwargs)
  File "/Users/briennakh/anaconda3/lib/python3.6/site-packages/pandas/core/strings.py", line 2561, in split
    result = str_split(self._parent, pat, n=n)
  File "/Users/briennakh/anaconda3/lib/python3.6/site-packages/pandas/core/strings.py", line 1388, in str_split
    res = _na_map(f, arr)
  File "/Users/briennakh/anaconda3/lib/python3.6/site-packages/pandas/co

KeyboardInterrupt: 

Use the same set of triplets for every evaluation.

In [52]:
np.save('/Volumes/BRIENNAKH/Thesis/results/2020_06_13_doc2vec/triplets_20000.npy', triplets)

In [56]:
evaluate(triplets)

0.40015

In [53]:
len(triplets)

20000

Are the inferred vectors close to the actual ones?

In [57]:
doc_id = random.choice(list(model.docvecs.doctags))
doc = np.load('/Volumes/BRIENNAKH/Thesis/data/2020_06_09_abstract_tokens/' + doc_id + '.npy')
inferred_docvec = model.infer_vector(doc)
print(doc_id)
print('%s:\n %s' % (model, model.docvecs.most_similar([inferred_docvec], topn=5)))

INFO:gensim.models.keyedvectors:precomputing L2-norms of doc weight vectors


1701.08647
Doc2Vec(dm/m,d300,n5,w5,s0.001,t4):
 [('1701.08647', 0.6545212268829346), ('1905.02733', 0.41904380917549133), ('1903.08122', 0.4087505340576172), ('1707.07613', 0.402372807264328), ('1501.00815', 0.40232813358306885)]


In [58]:
analogy_results = model.wv.evaluate_word_analogies('/Volumes/BRIENNAKH/Thesis/data/2020_06_12_analogies/analogies.txt')
analogy_results[1][0]

INFO:gensim.models.keyedvectors:Evaluating word analogies for top 300000 words in the model on /Volumes/BRIENNAKH/Thesis/data/2020_06_12_analogies/analogies.txt
INFO:gensim.models.keyedvectors:precomputing L2-norms of word weight vectors
INFO:gensim.models.keyedvectors:solar-system: 0.0% (0/5)
INFO:gensim.models.keyedvectors:Quadruplets with out-of-vocabulary words: 0.0%
INFO:gensim.models.keyedvectors:NB: analogies containing OOV words were skipped from evaluation! To change this behavior, use "dummy4unknown=True"
INFO:gensim.models.keyedvectors:Total accuracy: 0.0% (0/5)


{'section': 'solar-system',
 'correct': [],
 'incorrect': [('PHOBOS', 'MARS', 'TITAN', 'SATURN'),
  ('DEIMOS', 'MARS', 'EUROPA', 'SATURN'),
  ('GRAVITATIONAL', 'GRAVITY', 'PLANETARY', 'PLANET'),
  ('GAS', 'GIANT', 'YELLOW', 'DWARF'),
  ('LIGHT', 'YEAR', 'MOON', 'PLANET')]}

In [66]:
print('Words in model: ' + str(len(model.wv.vocab.keys())))

Words in model: 87649


In [89]:
model.wv.most_similar('NASA')

[('JAXA', 0.6520100831985474),
 ('NSF', 0.5824324488639832),
 ('ESA', 0.5467125773429871),
 ('agency', 0.541905403137207),
 ('aerospace', 0.5296249389648438),
 ('NRC', 0.5135439038276672),
 ('rover', 0.5085702538490295),
 ('IPAC', 0.5075458288192749),
 ('giotto', 0.5042192339897156),
 ('japanese', 0.5027950406074524)]