In [None]:
import pandas as pd

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [None]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA  # for compactness

In [None]:
# Show (almost) every column when displaying wide frames.
# Use the fully-qualified option name: pandas treats the key as a regex, and the
# abbreviated 'max.columns' can match more than one option (raising OptionError).
pd.set_option('display.max_columns', 999)

In [None]:
# Fetch the NLTK data this notebook relies on:
#   - vader_lexicon: lexicon for the VADER SentimentIntensityAnalyzer imported above
#   - punkt / punkt_tab: Punkt tokenizer data required by nltk.word_tokenize
#     (which of the two is needed depends on the installed NLTK version)
for resource in ('vader_lexicon', 'punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
# Load the headlines CSV into a DataFrame (path is relative to the notebook).
raw_headlines = pd.read_csv(
    filepath_or_buffer='../data/abcnews_million_headlines.csv',
)

In [None]:
# Draw a fixed-seed sample of 1000 headlines so that document indices (and any
# notes below that refer to "the first entry") are the same on every run.
# NOTE: this rebinds raw_headlines from a DataFrame to a NumPy array of strings,
# so re-running this cell requires re-running the load cell first.
raw_headlines = raw_headlines.sample(1000, random_state=42)['headline'].values
raw_headlines[:10]

In [None]:
# Build the training corpus: each headline is lowercased, tokenized with NLTK,
# and tagged with its position in raw_headlines (as a string).
tagged_data = [
    TaggedDocument(
        words=nltk.word_tokenize(headline.lower()),
        tags=[str(doc_index)],
    )
    for doc_index, headline in enumerate(raw_headlines)
]
tagged_data[:5]

In [None]:
# Each document carries exactly one tag: its position in raw_headlines, as a string.
# For example, tagged_data[0].tags[0] == '0'.

In [None]:
# Tunable training hyperparameters (candidates for later optimization).
alpha = 0.025     # learning rate at the start of training
max_epochs = 100  # how many training epochs to run

In [None]:
# Build and train the Doc2Vec model on the tagged headlines.
#
# Notes on the parameters:
#   - vector_size / epochs are the current names for the deprecated size / iter.
#   - dm is a 0/1 switch, not a blend: 1 = PV-DM (distributed memory),
#     0 = PV-DBOW (distributed bag of words). The previous value of 0.5 is not a
#     valid setting; 0 is used here because a non-integer dm appears to have
#     been treated as DBOW by gensim -- NOTE(review): switch to dm=1 to try PV-DM.
model = Doc2Vec(vector_size=10,     # dimensionality of each document vector
                alpha=alpha,        # initial learning rate
                min_alpha=0.00025,  # learning rate decays linearly down to this value
                min_count=1,        # keep every token, even those seen only once
                epochs=max_epochs,  # number of passes over the corpus
                dm=0)               # PV-DBOW (see note above)

model.build_vocab(tagged_data)

# Call train() exactly once and let gensim handle the alpha -> min_alpha decay.
# Looping over train() while hand-editing model.alpha / model.min_alpha multiplies
# the real number of epochs, fights the built-in schedule, and leaves the model's
# learning-rate attributes altered for later infer_vector() calls.
model.train(tagged_data,
            total_examples=model.corpus_count,
            epochs=model.epochs)

In [None]:
# In the run that produced these notes, the first sampled document (index 0) was:
# ['boy', 'to', 'face', 'court', 'accused', 'of', 'knife', 'threats']
# (the sample is random, so inspect tagged_data[0] to see the first document in your own run).
# The query below is a hand-written paraphrase of that headline.

In [None]:
# Infer a vector for an unseen query and look up the most similar training documents.
# The query is preprocessed exactly like the training headlines (lowercased, then
# nltk.word_tokenize) so that its tokens line up with the model's vocabulary.
query_statement = "young man accused of violent threats to appear before judge"
tokenized_query_statement = nltk.word_tokenize(query_statement.lower())
query_vector = model.infer_vector(tokenized_query_statement)
# most_similar returns (tag, cosine similarity) pairs, best match first.
similarity = model.docvecs.most_similar([query_vector])
similarity

In [None]:
# If the top match is tag '0' (the headline paraphrased by the query), retrieval succeeded. :D