In [24]:
import gensim
import os
import collections
import csv
import random
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import progressbar
from tqdm import tnrange, tqdm_notebook
from gensim.test.utils import get_tmpfile

In [2]:
# Root folder of the dated data export, taken from the DATADIR environment variable
# (None when the variable is not set). The bare name below displays it as the cell output.
DATADIR = os.environ.get("DATADIR")
DATADIR

'/Users/ellieking/Documents/govuk-taxonomy-supervised-learning/data/2018-11-29'

In [3]:
# Load the cleaned content export from DATADIR; the train/test split is derived from this frame.
clean_content_path = os.path.join(DATADIR, "clean_content.csv")
clean_content = pd.read_csv(clean_content_path)

In [4]:
# Hold out part of the content as a test set. random_state pins the shuffle so the same rows
# land in each split on every run; test_size is not passed, so scikit-learn's default applies.
train, test = train_test_split(clean_content, random_state=1234, shuffle=True)

In [5]:
# Renumber both splits from 0 so that each row label doubles as the document's position in
# the corpus list built from that split. The previous labels are kept as a column, not dropped.
train = train.reset_index()
test = test.reset_index()

In [6]:
def tag_corpus(df, tokens_only=False):
    """Lazily tokenise the ``combined_text`` column of ``df`` for Doc2Vec.

    Args:
        df: DataFrame with a ``combined_text`` column holding each document's text.
        tokens_only: when True, yield bare token lists (used for held-out documents
            that are only passed to ``infer_vector``); when False, yield
            ``TaggedDocument`` objects suitable for training.

    Yields:
        One item per row, in row order: a list of tokens, or a ``TaggedDocument``
        whose single tag is that row's index label.
    """
    # Walk only the column that is needed. df.iterrows() builds a full Series for
    # every row, which is very slow on frames with hundreds of thousands of rows.
    for label, text in df['combined_text'].items():
        tokens = gensim.utils.simple_preprocess(text)
        if tokens_only:
            yield tokens
        else:
            # The tag is the index label, so callers that later look documents up by
            # position in list(tag_corpus(df)) need df to carry a 0..n-1 index.
            yield gensim.models.doc2vec.TaggedDocument(tokens, [label])

In [7]:
# Lift the csv module's per-field size cap as high as the platform allows, so rows holding
# very long text fields can be parsed.
try:
    csv.field_size_limit(sys.maxsize)
except OverflowError:
    # The limit is stored in a C long, which is 32 bits on Windows even under 64-bit
    # Python, so sys.maxsize does not fit there. Fall back to the largest 32-bit value.
    csv.field_size_limit(2**31 - 1)
                     
        
def read_corpus(fname, tokens_only=False):
    """Stream a UTF-8 CSV file as Doc2Vec documents, using the last column as the text.

    Every non-blank row is treated as a document (a header row, if the file has
    one, is not skipped). Documents are numbered 0, 1, 2, ... in file order.

    Args:
        fname: path of the comma-delimited file to read.
        tokens_only: when True, yield bare token lists; when False, yield
            ``TaggedDocument`` objects tagged with the document number.

    Yields:
        A list of tokens, or a ``TaggedDocument``, per non-blank row.
    """
    # newline='' is what the csv module expects, so that line breaks embedded in
    # quoted fields are parsed correctly.
    with open(fname, "r", encoding='utf-8', newline='') as f:
        reader = csv.reader(f, delimiter=",")
        # Blank lines come back as empty lists and have no last column to read.
        rows = (row for row in reader if row)

        for i, line in enumerate(rows):
            tokens = gensim.utils.simple_preprocess(line[-1])
            if tokens_only:
                yield tokens
            else:
                # Tag with the running document number (the original referenced an
                # undefined name here and raised NameError on the first row).
                yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

In [8]:
# Materialise the training split as a list of TaggedDocument objects, one per row.
train_corpus = list(tag_corpus(train))

In [9]:
# Held-out documents are kept as plain token lists (no tags): they are only passed to
# infer_vector() and never used for training.
test_corpus = list(tag_corpus(test, tokens_only=True))

In [10]:
# Sanity-check how many documents ended up in each corpus.
print('train length = {}'.format(len(train_corpus)))
print('test length = {}'.format(len(test_corpus)))

train length = 172347
test length = 57449


# Training the Model
## Instantiate a Doc2Vec Object
Now, we'll instantiate a Doc2Vec model with a vector size of 300 dimensions that iterates over the training corpus 20 times. We set the minimum word count to 10 in order to discard words with very few occurrences. 

In [11]:
# Configure the model (no corpus is passed here; vocabulary building and training follow in
# the next cells): 300-dimensional vectors, words seen fewer than 10 times are ignored,
# and 20 passes over the corpus.
model = gensim.models.doc2vec.Doc2Vec(vector_size=300, min_count=10, epochs=20)

In [12]:
# Scan the training documents to build the model's vocabulary before any training happens.
model.build_vocab(train_corpus)

In [13]:
# Train on the tagged training documents; %time reports how long the run takes.
# total_examples and epochs are read back from the model so they stay in step with the
# corpus seen by build_vocab() and the epochs value given to the constructor.
%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

CPU times: user 46min 39s, sys: 43.2 s, total: 47min 22s
Wall time: 17min 46s


# Inferring a Vector
One important thing to note is that you can now infer a vector for any piece of text without having to re-train the model by passing a list of words to the model.infer_vector function. This vector can then be compared with other vectors via cosine similarity.

In [14]:
# Infer a fresh vector from the tokens of the first training document, then show the tag
# that document was given when the corpus was built.
print(model.infer_vector(train_corpus[0].words))
print(train_corpus[0].tags)

[-8.75118792e-01 -8.95500124e-01  1.46018595e-01 -2.33786106e-01
  7.14360595e-01  5.96053481e-01 -1.11715816e-01  2.75579989e-01
  1.75319612e-01  6.63906217e-01  6.98049366e-01  6.32752329e-02
  1.33808032e-01  2.49586567e-01  3.76466811e-01 -2.74105996e-01
  2.22492069e-02  5.25327563e-01 -4.62037295e-01 -7.74631619e-01
  6.34893715e-01  1.02416885e+00 -1.29518449e+00 -5.53773046e-01
  1.99304491e-01 -1.87681839e-01  1.60499811e-01  4.52858508e-01
  7.72842541e-02  3.07472825e-01  3.06219697e-01 -4.93364990e-01
 -3.75496566e-01  1.33307531e-01 -1.87995821e-01 -7.74305165e-01
 -7.67212510e-01 -5.18019736e-01  7.95392275e-01 -1.22042978e+00
 -2.88125962e-01 -3.49138789e-02 -7.52498627e-01 -3.43506485e-01
  1.54965445e-01 -1.81098625e-01  5.05983472e-01  7.13778511e-02
 -5.67320228e-01 -1.59207523e-01  9.67527807e-01 -1.22648969e-01
 -1.09175898e-01 -9.75642279e-02  1.53443396e-01 -9.45146680e-01
  4.43371832e-01 -1.95925295e-01  4.54822332e-01 -1.74526796e-01
 -7.21875548e-01  3.56558

Note that infer_vector() does not take a string, but rather a list of string tokens, which should have already been tokenized the same way as the words property of original training document objects.

Also note that because the underlying training/inference algorithms are an iterative approximation problem that makes use of internal randomization, repeated inferences of the same text will return slightly different vectors.

# Assessing Model
To assess our new model, we'll first infer new vectors for a random sample of documents from the training corpus and see how often each document finds itself to be the nearest document. 

In [38]:
def evaluate_model(train_corpus, model, sample_size=1000, seed=1234):
    """Estimate how often a training document is its own nearest neighbour.

    A random sample of training documents is re-inferred from its tokens and
    each inferred vector is compared against the vectors stored in the model.
    This is a consistency check on the model, not an accuracy score.

    Args:
        train_corpus: sequence of tagged documents (objects with ``words`` and
            ``tags``) that ``model`` was trained on.
        model: trained Doc2Vec model providing ``infer_vector`` and
            ``docvecs.most_similar``.
        sample_size: maximum number of documents to check. Capped at the corpus
            size, so small corpora are evaluated in full instead of raising.
        seed: seed for the sampling, so repeated calls check the same documents.

    Returns:
        Percentage (0-100) of sampled documents whose most similar stored
        document carries their own first tag.

    Raises:
        ValueError: if ``train_corpus`` is empty or ``sample_size`` is below 1.
    """
    if sample_size < 1:
        raise ValueError("sample_size must be at least 1")
    if len(train_corpus) == 0:
        raise ValueError("train_corpus is empty; nothing to evaluate")

    # random.sample() raises when asked for more items than exist.
    sample_size = min(sample_size, len(train_corpus))

    # NOTE: this seeds the module-level RNG, exactly as before, so random.* calls
    # made after this function are affected as well.
    random.seed(seed)
    sample = random.sample(train_corpus, sample_size)

    train_auto_nearest = []
    for doc in tqdm_notebook(sample):
        inferred_vector = model.infer_vector(doc.words)
        sims = model.docvecs.most_similar([inferred_vector], topn=2)
        # 1 when the top hit is the document itself, 0 otherwise.
        train_auto_nearest.append(int(sims[0][0] == doc.tags[0]))

    train_percent_auto_similar = sum(train_auto_nearest) / len(train_auto_nearest) * 100

    print(
        "The percentage of {} training samples which found itself nearest = {}".format(
            sample_size, train_percent_auto_similar
        )
    )
    return train_percent_auto_similar

In [39]:
# Self-similarity sanity check for the model trained on the training split only.
evaluate_model(train_corpus, model)

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

The percentage of 1000 training samples which found itself nearest = 90.0


90.0

In this sample, 90% of the re-inferred training documents were most similar to themselves, while the remaining 10% were mistakenly closest to a different document. Checking an inferred vector against the stored training vectors is a 'sanity check' that the model behaves in a usefully consistent manner; it is not a real 'accuracy' value.

# Testing the Model
Using the same approach as above, we'll infer a vector for a randomly chosen test document and compare it by eye with the most similar training documents returned by the model.

In [22]:
# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
# Only the two best matches are printed below, so request a short ranked list instead of
# ranking and returning every training document (topn=len(model.docvecs)).
sims = model.docvecs.most_similar([inferred_vector], topn=10)

# Print the test document, then its two most similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('SECOND', 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Test Document (36853): «re register your public limited company as private company rr this form is for public companies to re register as private limited company to re register from public company to private limited company you must have copy of the special resolution that the company should re register as private limited company unless previously delivered printed copy of the articles as proposed to be amended completed form rr forms need to be printed at full size on white sized paper rr application by public company for re registration as private limited company pdf kb pages»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d300,n5,w5,mc10,s0.001,t3):

MOST (111323, 0.7750102281570435): «re register public company as private unlimited company rr use this form as an application by public company for re registration as private unlimited company you can use this form as an application by public company for re registration as private unlimited company if you have prescribed form of assen

In [23]:
# Repeat the spot check with another randomly chosen test document.
doc_id = random.randint(0, len(test_corpus) - 1)
test_tokens = test_corpus[doc_id]
inferred_vector = model.infer_vector(test_tokens)
sims = model.docvecs.most_similar([inferred_vector], topn=10)

# Show the test document, then the best and second-best matches from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_tokens)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for index, label in enumerate(('MOST', 'SECOND')):
    match = sims[index]
    matched_words = train_corpus[match[0]].words
    print(u'%s %s: «%s»\n' % (label, match, ' '.join(matched_words)))

Test Document (1377): «mr moore andrew richardson andrew richardson transport employment tribunal decision of judge burton on april read the full decision in mr moore andrew richardson andrew richardson transport»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d300,n5,w5,mc10,s0.001,t3):

MOST (137170, 0.8068290948867798): «mr malinowski higgins transport ltd employment tribunal decision read the full decision in mr malinowski higgins transport ltd full»

SECOND (47132, 0.7725715637207031): «aa london borough of haringey ukut aac upper tribunal administrative appeals chamber decision by judge levenson on june read the full decision in hs judicial summary transport»



### train on everything

In [32]:
# Tag every document in the full dataset (before the train/test split) for the final model.
entire_corpus = list(tag_corpus(clean_content))

In [33]:
# The printed label says 'train', but the count is for the entire corpus.
print('train length = {}'.format(len(entire_corpus)))

train length = 229796


In [34]:
# Final model, configured with the same hyperparameters as the train-only model
# (300-dimensional vectors, min_count of 10, 20 epochs).
model_full = gensim.models.doc2vec.Doc2Vec(vector_size=300, min_count=10, epochs=20)

In [35]:
# Build the vocabulary from the full corpus before training.
model_full.build_vocab(entire_corpus)

In [36]:
# Train the final model on every document; %time reports how long the run takes.
%time model_full.train(entire_corpus, total_examples=model_full.corpus_count, epochs=model_full.epochs)

CPU times: user 1h 2min 38s, sys: 59.8 s, total: 1h 3min 38s
Wall time: 23min 57s


In [42]:
# Same self-similarity sanity check, this time for the model trained on the entire corpus.
evaluate_model(entire_corpus, model_full)

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

The percentage of 1000 training samples which found itself nearest = 90.2


90.2

### Saving model

In [41]:
# Folder the trained model is written to. Defaults to the shared-drive location used so far;
# set the MODEL_DIR environment variable to save somewhere else.
MODEL_DIR = os.getenv(
    "MODEL_DIR",
    "/Volumes/GoogleDrive/Team Drives/GOV.UK teams/2018-2019/Q3/Knowledge up Q3/Data science/content_semantic_similarity/doc2vec_model",
)
# The path is built directly: wrapping an absolute path in get_tmpfile() returned it
# unchanged, so the model was never saved to a temporary location.
# NOTE: despite the .csv extension, save() writes gensim's own model format, not CSV.
fname = os.path.join(MODEL_DIR, "doc2vec_model_2018-11-29.csv")
model_full.save(fname)
# To reload later: gensim.models.doc2vec.Doc2Vec.load(fname); a loaded model can be trained further.