In [1]:
import gensim
import os
import collections
import csv
import random
import sys
import numpy as np

In [2]:
DATADIR = os.getenv("DATADIR")
DATADIR

'/Users/ellieking/Documents/govuk-taxonomy-supervised-learning/data/data/2018-07-10'

In [3]:
num_lines = sum(1 for line in open("/Users/ellieking/Documents/govuk-taxonomy-supervised-learning/data/data/2018-07-10/clean_content.csv"))

In [4]:
num_lines

216616

In [5]:
csv.field_size_limit(sys.maxsize)

#reservoir sampling of rows in csv
desired_num_results = 100
chances_selected = desired_num_results / num_lines
 

# for line in csv.reader(os.path.join(DATADIR, "clean_content.csv")):
#     if random.random() < chances_selected:
#         result.append(line)
# print(result)
                       
        
def read_corpus(fname, tokens_only=False): 
    tag = -1
    with open(fname, "r", encoding='utf-8') as f:
        reader = csv.reader(f, delimiter=",")

        for i, line in enumerate(reader):
            if random.random() < chances_selected:
                tag += 1
                yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(line[-1]), [tag])

In [6]:
train_corpus = list(read_corpus(os.path.join(DATADIR, "clean_content.csv")))

In [7]:
train_corpus

[TaggedDocument(words=['foreign', 'secretary', 'appalled', 'by', 'russia', 'and', 'china', 'decision', 'to', 'veto', 'resolution', 'on', 'syria', 'foreign', 'secretary', 'william', 'hague', 'said', 'russia', 'and', 'china', 'have', 'turned', 'their', 'back', 'on', 'the', 'people', 'of', 'syria', 'in', 'their', 'darkest', 'hour', 'responding', 'to', 'vetoing', 'of', 'un', 'security', 'council', 'resolution', 'on', 'syria', 'the', 'foreign', 'secretary', 'said', 'am', 'appalled', 'by', 'the', 'decision', 'of', 'russia', 'and', 'china', 'to', 'veto', 'the', 'draft', 'resolution', 'on', 'syria', 'at', 'the', 'united', 'nations', 'security', 'council', 'today', 'the', 'situation', 'in', 'syria', 'is', 'desperately', 'serious', 'and', 'it', 'is', 'getting', 'worse', 'by', 'the', 'day', 'earlier', 'this', 'week', 'met', 'syrian', 'refugees', 'in', 'jordan', 'whose', 'homes', 'had', 'been', 'destroyed', 'whose', 'family', 'members', 'had', 'been', 'killed', 'in', 'cold', 'blood', 'who', 'had',

# Training the Model
## Instantiate a Doc2Vec Object
Now, we'll instantiate a Doc2Vec model with a vector size with 50 words and iterating over the training corpus 40 times. We set the minimum word count to 2 in order to discard words with very few occurrences. (Without a variety of representative examples, retaining such infrequent words can often make a model worse!) Typical iteration counts in published 'Paragraph Vectors' results, using 10s-of-thousands to millions of docs, are 10-20. More iterations take more time and eventually reach a point of diminishing returns.

However, this is a very very small dataset (300 documents) with shortish documents (a few hundred words). Adding training passes can sometimes help with such small datasets.

In [8]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)

In [9]:
model.build_vocab(train_corpus)

In [10]:
%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

CPU times: user 2.06 s, sys: 79.9 ms, total: 2.14 s
Wall time: 1.13 s


In [28]:
model.docvecs[1]

array([ 0.08710878,  0.25680408,  0.20844844, -0.2536233 , -0.06497974,
        0.35321963, -0.00461665,  0.36891994, -0.07859582,  0.08406328,
        0.02747476, -0.21570769,  0.37919727, -0.21756828, -0.01862241,
        0.05371958,  0.24100722,  0.11214324, -0.15249546,  0.01657007,
       -0.02129514, -0.11935986, -0.34590554, -0.11981395,  0.03856358,
       -0.10263199,  0.15266028, -0.16331732,  0.00114398,  0.44400862,
       -0.24358398,  0.54017454,  0.16051051, -0.51855594, -0.32533276,
        0.04566024, -0.15216166,  0.02702656, -0.57710344, -0.5842958 ,
        0.23433991, -0.19974917, -0.6759516 ,  0.27813157,  0.17225695,
       -0.23864342,  0.06403326, -0.43525916, -0.41542137, -0.16586122],
      dtype=float32)

# Inferring a Vector
One important thing to note is that you can now infer a vector for any piece of text without having to re-train the model by passing a list of words to the model.infer_vector function. This vector can then be compared with other vectors via cosine similarity.

In [11]:
print(model.infer_vector(train_corpus[0].words))
print(train_corpus[0].tags)

[ 1.6517148e+00  1.3431126e+00  8.3163280e+00 -3.3289464e+00
  8.6738640e-01  3.5655731e-01 -2.1763024e+00  2.4900331e+00
 -6.3260841e+00  1.8264648e-01  1.8399144e+00 -4.0673747e+00
  3.0147827e+00  3.3331916e+00  5.5914679e+00 -1.4431496e+00
  2.2409313e+00 -5.2398181e+00 -7.0639610e+00 -9.8746158e-02
 -1.6679981e-01 -1.5101690e+00  2.1303995e+00  6.1860991e+00
  2.4029856e+00  1.0084819e+00  2.6555755e+00 -3.2605487e-01
  6.0154448e+00  2.2169659e+00 -1.0361555e+00  4.5863867e+00
 -7.0229783e-03  4.0510645e+00 -6.2931051e+00 -3.9656107e+00
  4.1494761e+00  8.4966439e-01 -3.7773020e+00 -6.1577892e+00
 -3.9067972e+00  4.1644297e+00 -2.6104834e+00  2.8337495e+00
  9.5867597e-02  3.9113576e+00  3.6735764e+00 -3.9412277e+00
  3.8093999e-01 -2.0565059e+00]
[0]


In [12]:
inferred_vector = model.infer_vector(train_corpus[0].words)
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

  if np.issubdtype(vec.dtype, np.int):


In [13]:
sims[0][0]

0

In [14]:
train_corpus[0]

TaggedDocument(words=['foreign', 'secretary', 'appalled', 'by', 'russia', 'and', 'china', 'decision', 'to', 'veto', 'resolution', 'on', 'syria', 'foreign', 'secretary', 'william', 'hague', 'said', 'russia', 'and', 'china', 'have', 'turned', 'their', 'back', 'on', 'the', 'people', 'of', 'syria', 'in', 'their', 'darkest', 'hour', 'responding', 'to', 'vetoing', 'of', 'un', 'security', 'council', 'resolution', 'on', 'syria', 'the', 'foreign', 'secretary', 'said', 'am', 'appalled', 'by', 'the', 'decision', 'of', 'russia', 'and', 'china', 'to', 'veto', 'the', 'draft', 'resolution', 'on', 'syria', 'at', 'the', 'united', 'nations', 'security', 'council', 'today', 'the', 'situation', 'in', 'syria', 'is', 'desperately', 'serious', 'and', 'it', 'is', 'getting', 'worse', 'by', 'the', 'day', 'earlier', 'this', 'week', 'met', 'syrian', 'refugees', 'in', 'jordan', 'whose', 'homes', 'had', 'been', 'destroyed', 'whose', 'family', 'members', 'had', 'been', 'killed', 'in', 'cold', 'blood', 'who', 'had', 

In [15]:
type(train_corpus[0].tags[0])

int

In [16]:
for doc in train_corpus:
    if (doc.tags[0] == 135094):
            print(doc)

Note that infer_vector() does not take a string, but rather a list of string tokens, which should have already been tokenized the same way as the words property of original training document objects.

Also note that because the underlying training/inference algorithms are an iterative approximation problem that makes use of internal randomization, repeated inferences of the same text will return slightly different vectors.

# Assessing Model
To assess our new model, we'll first infer new vectors for each document of the training corpus, compare the inferred vectors with the training corpus, and then returning the rank of the document based on self-similarity. Basically, we're pretending as if the training corpus is some new unseen data and then seeing how they compare with the trained model. The expectation is that we've likely overfit our model (i.e., all of the ranks will be less than 2) and so we should be able to find similar documents very easily. Additionally, we'll keep track of the second ranks for a comparison of less similar documents.

In [17]:
ranks = []
second_ranks = []
for doc_id in range(len(train_corpus)):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    found_itself_nearest = int(np.where(sims[0][0]==train_corpus[doc_id].tags[0], 1, 0))
#     rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(found_itself_nearest)
    
    second_ranks.append(sims[1])

Let's count how each document ranks with respect to the training corpus

In [18]:
collections.Counter(ranks)  # Results vary between runs due to random seeding and very small corpus

Counter({1: 97, 0: 1})

Basically, 84/85 of the inferred documents are found to be most similar to itself and 1/85 it is mistakenly most similar to another document. the checking of an inferred-vector against a training-vector is a sort of 'sanity check' as to whether the model is behaving in a usefully consistent manner, though not a real 'accuracy' value.

In [19]:
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print('({},{}),{})'.format(label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Document (97): «avid saving fuel and reducing emissions from vehicles innovate uk funding helps sustainable vehicle component supplier launch product and grow their business in the north east of england avid saving fuel and reducing emissions from vehicles ryan maughan left career building racing cars to set up business that could make difference he founded avid technology in to improve air quality by making vehicles more efficient avid design and develop powertrain components the parts of vehicles that generate and deliver power to the road specialising in electric and hybrid technology sales are set to increase over the next few years as automotive companies pursue greener products avid produce traction motors electric cooling pumps cooling fans and other electronic devices used in the vehicle drivetrain all essential on electric and hybrid vehicles they also manufacture components to improve the efficiency of conventional vehicles find out more about our work in manufacturing and ma

# Testing the Model
Using the same approach above, we'll infer the vector for a randomly chosen test document, and compare the document to our model by eye.

In [20]:
# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

NameError: name 'test_corpus' is not defined