In [1]:
import gensim
import os
import collections
import csv
import random
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import progressbar
from tqdm import tnrange, tqdm_notebook
from gensim.test.utils import get_tmpfile

In [2]:
DATADIR = os.getenv("DATADIR")
DATADIR

'/Users/ellieking/Documents/govuk-taxonomy-supervised-learning/data/2018-11-29'

In [3]:
#create train test files
clean_content = pd.read_csv(os.path.join(DATADIR, "clean_content.csv"))

In [4]:
train, test = train_test_split(clean_content, random_state=1234, shuffle=True)

In [5]:
train.reset_index(inplace=True)
test.reset_index(inplace=True)

In [6]:
def tag_corpus(df, tokens_only=False): 
    for i, row in df.iterrows():
        if tokens_only:
                yield gensim.utils.simple_preprocess(row['combined_text'])
        else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(row['combined_text']), [i])

In [7]:
csv.field_size_limit(sys.maxsize)
                     
        
def read_corpus(fname, tokens_only=False): 
    
    with open(fname, "r", encoding='utf-8') as f:
        reader = csv.reader(f, delimiter=",")

        for i, line in enumerate(reader):
            yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(line[-1]), [tag])

In [8]:
train_corpus = list(tag_corpus(train))

In [9]:
test_corpus = list(tag_corpus(test, tokens_only=True))

In [10]:
print('train length = {}'.format(len(train_corpus)))
print('test length = {}'.format(len(test_corpus)))

train length = 172347
test length = 57449


# Training the Model
## Instantiate a Doc2Vec Object
Now, we'll instantiate a Doc2Vec model with a vector size with 300 words and iterating over the training corpus 20 times. We set the minimum word count to 10 in order to discard words with very few occurrences. 

In [11]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=300, min_count=10, epochs=20)

In [12]:
model.build_vocab(train_corpus)

In [13]:
%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

CPU times: user 47min 54s, sys: 46.7 s, total: 48min 41s
Wall time: 18min 16s


# Inferring a Vector
One important thing to note is that you can now infer a vector for any piece of text without having to re-train the model by passing a list of words to the model.infer_vector function. This vector can then be compared with other vectors via cosine similarity.

In [14]:
print(model.infer_vector(train_corpus[0].words))
print(train_corpus[0].tags)

[ 0.01619075 -0.3857366  -0.21332191 -0.39031467 -0.06365608 -0.06554236
  0.10489657  0.6586542  -0.67478514 -0.074596    0.40168247  0.06166789
 -0.05527627 -0.46808976 -0.2584152   0.42400724  0.0326782  -0.2494215
 -0.5367077  -0.8815892  -0.08493707  0.4654813   0.32676122  0.5447801
 -0.85339904  0.22020005 -0.8906548   0.18943048 -0.16608529 -0.41868275
 -0.3067872  -0.96009666  0.14968543 -0.00471213 -0.7331197   0.17063302
 -0.7028694  -0.4864514   0.3497148   0.23650299 -0.40131524  0.7863563
 -0.07368162  0.8567035   0.65980536 -0.4337881   0.08422016 -0.20582701
  0.53316045  0.00774368  1.5816195   0.8629716  -0.6574274   0.09455629
  0.42348886 -0.6951741   1.0440122  -0.10885935  0.26401392 -0.5317674
  1.0799328   0.44765794 -0.35996252 -0.79823446 -0.03307015  0.00412453
 -1.5467987  -0.19720659  0.63797146 -0.28328916 -0.3311835  -0.3550977
  0.3092764  -0.97142553  1.2182555   0.6001149  -0.06594264 -0.55086976
  0.5819013   0.50179195 -0.37025353 -0.9518246  -0.7533

Note that infer_vector() does not take a string, but rather a list of string tokens, which should have already been tokenized the same way as the words property of original training document objects.

Also note that because the underlying training/inference algorithms are an iterative approximation problem that makes use of internal randomization, repeated inferences of the same text will return slightly different vectors.

# Assessing Model
To assess our new model, we'll first infer new vectors for each document of the training corpus and see how often the document found itself tobe the nearest document. 

In [15]:
def evaluate_model(train_corpus, model):
    train_auto_nearest = []
    random.seed(1234)
    sample_1000 = random.sample(train_corpus, 1000)

    for doc_id in tqdm_notebook(range(len(sample_1000))):
        inferred_vector = model.infer_vector(sample_1000[doc_id].words)
        sims = model.docvecs.most_similar([inferred_vector], topn=2)
        found_itself_nearest = int(np.where(sims[0][0]==sample_1000[doc_id].tags[0], 1, 0))
        train_auto_nearest.append(found_itself_nearest)
    
    
    x = collections.Counter(train_auto_nearest)
    train_percent_auto_similar = x[1]/(x[0]+x[1])*100
    
    
    
    print("The percentage of 1000 training samples which found itself nearest = {}".format(train_percent_auto_similar
                                                                                          )
         )
    return train_percent_auto_similar 

In [16]:
evaluate_model(train_corpus, model)

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

  if np.issubdtype(vec.dtype, np.int):



The percentage of 1000 training samples which found itself nearest = 90.7


90.7

Basically, 90% of the inferred documents are found to be most similar to itself and 10% it is mistakenly most similar to another document. the checking of an inferred-vector against a training-vector is a sort of 'sanity check' as to whether the model is behaving in a usefully consistent manner, though not a real 'accuracy' value.

# Testing the Model
Using the same approach above, we'll infer the vector for a randomly chosen test document, and compare the document to our model by eye.

In [17]:
# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('SECOND', 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Test Document (36853): «re register your public limited company as private company rr this form is for public companies to re register as private limited company to re register from public company to private limited company you must have copy of the special resolution that the company should re register as private limited company unless previously delivered printed copy of the articles as proposed to be amended completed form rr forms need to be printed at full size on white sized paper rr application by public company for re registration as private limited company pdf kb pages»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d300,n5,w5,mc10,s0.001,t3):

MOST (114, 0.7619693279266357): «re register an unlimited company as limited company rr use this form as an application by an unlimited company for re registration as private limited company this form can be used to apply to re register from an unlimited company to private limited company use this form if you have copy of the special r

In [18]:
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.docvecs.most_similar([inferred_vector], topn=10)

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('SECOND', 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Test Document (1377): «mr moore andrew richardson andrew richardson transport employment tribunal decision of judge burton on april read the full decision in mr moore andrew richardson andrew richardson transport»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d300,n5,w5,mc10,s0.001,t3):

MOST (137170, 0.8314886093139648): «mr malinowski higgins transport ltd employment tribunal decision read the full decision in mr malinowski higgins transport ltd full»

SECOND (47132, 0.7978464961051941): «aa london borough of haringey ukut aac upper tribunal administrative appeals chamber decision by judge levenson on june read the full decision in hs judicial summary transport»



In [33]:
doc_id


1377

### train on everything

In [19]:
entire_corpus = list(tag_corpus(clean_content))

In [20]:
print('train length = {}'.format(len(entire_corpus)))

train length = 229796


In [21]:
model_full = gensim.models.doc2vec.Doc2Vec(vector_size=512, min_count=10, epochs=20)

In [22]:
model_full.build_vocab(entire_corpus)

In [23]:
%time model_full.train(entire_corpus, total_examples=model_full.corpus_count, epochs=model_full.epochs)

CPU times: user 1h 13min 15s, sys: 59.5 s, total: 1h 14min 14s
Wall time: 26min 58s


In [24]:
evaluate_model(entire_corpus, model_full)

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))


The percentage of 1000 training samples which found itself nearest = 90.10000000000001


90.10000000000001

In [29]:
len(model_full.docvecs)

229796

In [37]:
cos_sims = model_full.docvecs.most_similar(43300, topn=len(model_full.docvecs))


In [48]:
model_full.docvecs.most_similar(43300, topn=20)


[(190814, 0.48058053851127625),
 (20805, 0.4803950786590576),
 (189785, 0.4770107865333557),
 (36838, 0.4495948851108551),
 (75127, 0.44929322600364685),
 (13921, 0.4489574730396271),
 (56909, 0.4386683702468872),
 (159593, 0.43493837118148804),
 (108195, 0.43441107869148254),
 (43435, 0.43156003952026367),
 (93623, 0.4309147894382477),
 (215078, 0.4253076910972595),
 (38654, 0.42307108640670776),
 (66787, 0.4158709943294525),
 (73335, 0.4151466488838196),
 (160178, 0.4100797474384308),
 (55447, 0.40952494740486145),
 (100133, 0.40927305817604065),
 (83182, 0.4069506525993347),
 (20797, 0.40627408027648926)]

In [42]:
clean_content['combined_text'].iloc[43300]

'government consults with civil society groups on un peacekeeping baroness anelay and general gordon messenger hosted an event to discuss key themes for the upcoming uk led peacekeeping defence ministerial on 7 8 september 2016 the uk will host the london peacekeeping defence ministerial to bring together defence ministers from across the world to discuss the need for meaningful improvements on how we participate in and effectively contribute to un peacekeeping operations. as part of the preparations for september ministerial meeting baroness anelay and vice chief of the defence staff general gordon messenger hosted a number of representatives from civil society at the foreign and commonwealth office on 18 july. these included the charities ngos academics and partners who work alongside peacekeepers across the world. the event enabled these groups to share their knowledge and expertise on reforming and improving un peacekeeping in particular their views on women and peacekeeping. speak

In [43]:
# Compare and print the most/median/least similar documents from the train corpus
# print('Test Document ({}): «{}»\n'.format('0', ' '.join(entire_corpus[0])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model_full)
for label, index in [('MOST', 0), ('SECOND', 1)]:
    print(u'%s %s: «%s»\n' % (label, cos_sims[index], ' '.join(entire_corpus[cos_sims[index][0]].words)))

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d512,n5,w5,mc10,s0.001,t3):

MOST (190814, 0.48058053851127625): «un peacekeeping defence ministerial london concept secretary of state for defence michael fallon will host un peacekeeping defence ministerial in london on september on september the un secretary general ban ki moon and us president barack obama joined nearly heads of states and representatives from around the world to make an unprecedented commitment to un peacekeeping at time when un peacekeepers are being called on to do even more nations and international organisations pledged an extra troops along with training equipment police contingents engineers intelligence units and helicopters one year on the uk together with the united nations is hosting follow up event at lancaster house in london to accelerate these global efforts bringing together defence ministers from countries those collectively responsible for the resources promised to the un the event purpose is threefol

In [46]:
len(entire_corpus)

229796

### generate a sample of urls to be spot checked

In [52]:
idx = np.random.randint(clean_content.shape[0], size=100)

In [53]:
clean_content['url'] = 'www.gov.uk'+clean_content['base_path']

In [57]:
corpus_sample = [entire_corpus[i] for i in idx]

In [58]:
len(corpus_sample)

100

In [63]:
idx

array([200158, 171112,  91391, 120631, 160635,  98358, 211862, 119266,
        75577, 103161, 183490,  52648,  72840, 219931,  16987, 151120,
        65620,  66882,  29941, 220769, 176329, 221844,    208, 183661,
         9076,  17928, 182272, 147179,  72287,  85745,  23314, 201222,
       145511,  27392,  40452, 138040, 195658,  22899, 213102, 203744,
       223830,  56927, 223168, 170676, 223360,  80469,   7528,  67802,
       125788,  46392, 217078, 113639,  56817, 134838,  79277,  69504,
        64179, 143912, 226163, 159255, 117819, 211365, 195426,  38427,
        25841, 142574, 215119, 152370, 153594,    342, 183041, 125326,
       205413, 152405, 101919,   8326, 131381, 100910, 215537,  58849,
       101667, 114473,  43223, 124625, 205846,  25111, 194058, 153438,
        87506,   3178,  60420,  64860,  37130, 160075, 167428, 212077,
        54817, 182203, 134238, 165623])

In [55]:
clean_content_sample = clean_content[clean_content.index.isin(idx)].copy()

In [62]:
cos_sims[0][0]

190814

In [103]:
urls = pd.DataFrame(columns=['source_url', 'suggested_links', 'cosine_sims'])

for i, tag_doc in enumerate(corpus_sample):
    cos_sims = model_full.docvecs.most_similar(i, topn=20)
    suggested_links_idx = [cos_sims[j][0] for j in range(20)]
    suggested_links_cos = pd.Series([cos_sims[k][1] for k in range(20)])
    target_urls = pd.Series(clean_content.url[clean_content.index.isin(suggested_links_idx)].copy())
    
    i_urls = pd.concat([target_urls, suggested_links_cos], axis=1, ignore_index=True)
    i_urls['source_url'] = clean_content_sample['url'].iloc[i]
    
    urls = urls.append(i_urls, ignore_index=True)
    

In [96]:
pd.concat([target_urls, suggested_links_cos], axis=1, ignore_index=True)

Unnamed: 0,0,1,2
0,4151,www.gov.uk/government/publications/statement-o...,0.653544
1,21815,www.gov.uk/government/publications/statement-o...,0.652838
2,40706,www.gov.uk/government/publications/smi-tp-19-i...,0.651748
3,52724,www.gov.uk/government/publications/yo25-8ej-fa...,0.651649
4,136600,www.gov.uk/government/collections/orphan-works...,0.651217
5,151077,www.gov.uk/government/publications/enhanced-de...,0.650575
6,156469,www.gov.uk/government/publications/sabto-repor...,0.649828
7,164488,www.gov.uk/government/publications/the-use-of-...,0.649196
8,164923,www.gov.uk/government/publications/ta11-6ew-ri...,0.648501
9,172105,www.gov.uk/government/publications/statement-o...,0.647681


In [91]:
urls

Unnamed: 0,source_url,suggested_links,cosine_sims
0,www.gov.uk/government/news/uk-assumes-chairman...,,0.653544
1,www.gov.uk/government/news/uk-assumes-chairman...,,0.652838
2,www.gov.uk/government/news/uk-assumes-chairman...,,0.651748
3,www.gov.uk/government/news/uk-assumes-chairman...,,0.651649
4,www.gov.uk/government/news/uk-assumes-chairman...,,0.651217
5,www.gov.uk/government/news/uk-assumes-chairman...,,0.650575
6,www.gov.uk/government/news/uk-assumes-chairman...,,0.649828
7,www.gov.uk/government/news/uk-assumes-chairman...,,0.649196
8,www.gov.uk/government/news/uk-assumes-chairman...,,0.648501
9,www.gov.uk/government/news/uk-assumes-chairman...,,0.647681


### Saving model

In [25]:
fname = get_tmpfile(os.path.join("/Volumes/GoogleDrive/Team Drives/GOV.UK teams/2018-2019/Q3/Knowledge up Q3/Data science/content_semantic_similarity/doc2vec_model", "doc2vec_model_2018-11-29.csv"))
model_full.save(fname)
#model = Doc2Vec.load(fname)  # you can continue training with the loaded model!