In [81]:
import gensim
import os
import collections
import csv
import random
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import progressbar
from tqdm import tnrange, tqdm_notebook

import scipy
from scipy.spatial.distance import cosine

In [3]:
DATADIR = os.getenv("DATADIR")
DATADIR

'/Users/ellieking/Documents/govuk-taxonomy-supervised-learning/data/data/2018-07-10'

In [4]:
#create train test files
clean_content = pd.read_csv(os.path.join(DATADIR, "clean_content.csv"))

In [5]:
train, test = train_test_split(clean_content, random_state=1234, shuffle=True)

In [6]:
train.reset_index(inplace=True)
test.reset_index(inplace=True)

In [7]:
def tag_corpus(df, tokens_only=False): 
    for i, row in df.iterrows():
        if tokens_only:
                yield gensim.utils.simple_preprocess(row['combined_text'])
        else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(row['combined_text']), [i])

In [8]:
csv.field_size_limit(sys.maxsize)
                     
        
def read_corpus(fname, tokens_only=False): 
    
    with open(fname, "r", encoding='utf-8') as f:
        reader = csv.reader(f, delimiter=",")

        for i, line in enumerate(reader):
            yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(line[-1]), [tag])

In [9]:
train_corpus = list(tag_corpus(train))

In [10]:
test_corpus = list(tag_corpus(test, tokens_only=True))

In [11]:
print('train length = {}'.format(len(train_corpus)))
print('test length = {}'.format(len(test_corpus)))

train length = 162461
test length = 54154


# Training the Model
## Instantiate a Doc2Vec Object
Now, we'll instantiate a Doc2Vec model with a vector size with 50 words and iterating over the training corpus 40 times. We set the minimum word count to 2 in order to discard words with very few occurrences. (Without a variety of representative examples, retaining such infrequent words can often make a model worse!) Typical iteration counts in published 'Paragraph Vectors' results, using 10s-of-thousands to millions of docs, are 10-20. More iterations take more time and eventually reach a point of diminishing returns.

However, this is a very very small dataset (300 documents) with shortish documents (a few hundred words). Adding training passes can sometimes help with such small datasets.

In [12]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)

In [13]:
model.build_vocab(train_corpus)

In [14]:
%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

CPU times: user 1h 4min 52s, sys: 1min 38s, total: 1h 6min 31s
Wall time: 28min 2s


# Inferring a Vector
One important thing to note is that you can now infer a vector for any piece of text without having to re-train the model by passing a list of words to the model.infer_vector function. This vector can then be compared with other vectors via cosine similarity.

In [15]:
print(model.infer_vector(train_corpus[0].words))
print(train_corpus[0].tags)

[ 1.2517213   0.9094943  -1.1385113  -0.36466655  0.4678414   0.28444976
 -1.827331   -1.9259272  -3.7183578  -0.43728885  1.1810168  -0.16709264
 -1.8279032   1.5660398   1.8666711   1.4020777  -3.0925753  -0.10257131
 -4.2863574  -0.16613753 -1.2092037   0.6984667  -1.5275509   2.1593678
 -3.244988   -1.767615    1.693676    3.4502897   0.99700814 -1.5359992
 -1.2812653  -0.10245395  3.428569   -0.33475244  0.31088027  1.4154533
  2.3830366   2.187225   -1.2447301  -3.0159519  -0.35021606  1.5770423
 -0.6752971   0.8625183  -2.349865    2.0543902  -0.54309756 -1.5427512
  2.5500095  -1.0086297 ]
[0]


Note that infer_vector() does not take a string, but rather a list of string tokens, which should have already been tokenized the same way as the words property of original training document objects.

Also note that because the underlying training/inference algorithms are an iterative approximation problem that makes use of internal randomization, repeated inferences of the same text will return slightly different vectors.

# Assessing Model
To assess our new model, we'll first infer new vectors for each document of the training corpus, compare the inferred vectors with the training corpus, and then returning the rank of the document based on self-similarity. Basically, we're pretending as if the training corpus is some new unseen data and then seeing how they compare with the trained model. The expectation is that we've likely overfit our model (i.e., all of the ranks will be less than 2) and so we should be able to find similar documents very easily. Additionally, we'll keep track of the second ranks for a comparison of less similar documents.

In [16]:
ranks = []
second_ranks = []

for doc_id in progressbar.progressbar(range(len(train_corpus))):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    found_itself_nearest = int(np.where(sims[0][0]==train_corpus[doc_id].tags[0], 1, 0))
#     rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(found_itself_nearest)
    
    second_ranks.append(sims[1])

                                                                               N/A% (0 of 162461) |                     | Elapsed Time: 0:00:00 ETA:  --:--:--

CPU times: user 5 µs, sys: 1e+03 ns, total: 6 µs
Wall time: 11 µs


  if np.issubdtype(vec.dtype, np.int):
100% (162461 of 162461) |###| Elapsed Time: 1 day, 0:13:00 Time: 1 day, 0:13:00


Let's count how each document ranks with respect to the training corpus

In [17]:
collections.Counter(ranks)  # Results vary between runs due to random seeding and very small corpus

Counter({1: 147620, 0: 14841})

In [20]:
147620/162461*100

90.86488449535581

Basically, 91% of the inferred documents are found to be most similar to itself and 9% it is mistakenly most similar to another document. the checking of an inferred-vector against a training-vector is a sort of 'sanity check' as to whether the model is behaving in a usefully consistent manner, though not a real 'accuracy' value.

In [22]:
len(ranks)

162461

In [18]:
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print('({},{}),{})'.format(label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Document (162460): «consultation on revised pace codes of practice covering letter information outlining draft changes to pace codes information outlining draft changes to pace codes of practice and advice for sending responses consultation on revised pace codes of practice covering letter pdf kb pages this file may not be suitable for users of assistive technology request an accessible format if you use assistive technology such as screen reader and need version of this document in more accessible format please email homeoffice gsi gov uk please tell us what format you need it will help us if you say what assistive technology you use»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):

(MOST,(162460, 0.9566476941108704)),consultation on revised pace codes of practice covering letter information outlining draft changes to pace codes information outlining draft changes to pace codes of practice and advice for sending responses consultation on revised pace codes of

#### Sampling docs to get a faster measure of global auto-similarity

##### TRAIN

In [75]:
def evaluate_model(train_corpus):
    train_auto_nearest = []
    random.seed(1234)
    sample_1000 = random.sample(train_corpus, 1000)

    for doc_id in progressbar.progressbar(range(len(sample_1000))):
        inferred_vector = model.infer_vector(sample_1000[doc_id].words)
        sims = model.docvecs.most_similar([inferred_vector], topn=2)
        found_itself_nearest = int(np.where(sims[0][0]==sample_1000[doc_id].tags[0], 1, 0))
        train_auto_nearest.append(found_itself_nearest)
    
    
    x = collections.Counter(train_auto_nearest)
    train_percent_auto_similar = x[1]/(x[0]+x[1])*100
    
    
    
    print("The percentage of 1000 training samples which found itself nearest = {}".format(train_percent_auto_similar
                                                                                          )
         )
    return train_percent_auto_similar 


In [76]:
evaluate_model(train_corpus)

100% (1000 of 1000) |####################| Elapsed Time: 0:00:20 Time:  0:00:20


The percentage of 1000 training samples which found itself nearest = 90.8


90.8

~91% auto-similarity in the sample of 1000. Think this is a viable approach for measuring models

# Testing the Model
Using the same approach above, we'll infer the vector for a randomly chosen test document, and compare the document to our model by eye.

In [63]:
# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('SECOND', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Test Document (7672): «teacher misconduct panel outcome ms romina albarran outcome of march hearing panel decision and reasons on behalf of the secretary of state for education the secretary of state does not make these decisions himself they are made by senior official on the recommendation of an independent panel teacher reference number teacher date of birth june location teacher worked windsor south east date of professional conduct panel march outcome type prohibition order prohibition order effective april notice is hereby given that in accordance with the teachers disciplinary england regulations professional conduct panel was convened to consider the case of ms romina albarran of windsor south east the proceedings were held at butts road earlsdon park coventry cv bh at am on march teacher misconduct butts road earlsdon park coventry cv bh email misconduct teacher education gov uk telephone prohibition order ms romina albarran pdf kb pages»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc

In [32]:
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.docvecs.most_similar([inferred_vector], topn=10)

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('SECOND', 1), ('THIRD', 2), ('FOURTH', 3), ('FIFTH', 4)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Test Document (29288): «eu foreign ministers discuss ukraine foreign secretary william hague attended the informal eu foreign ministers meeting in athens to discuss the situation in ukraine speaking to media before the eu foreign affairs council meeting in athens today april the foreign secretary william hague said it is important to keep strength and unity in the european union on recent events particularly in ukraine it is important to continue to make clear that what russia has done in violating the independence and sovereignty of neighbouring nation state can not be something that is accepted in europe in the st century it important for us to continue to discuss what we do in the future to reduce europe energy dependence on russia to make sure that we change the balance of leverage between russia and the eu in the future it very important for us to remain strong and united about the sanctions that we have implemented against individuals in russia and crimea and to prepare more far 

### Train on all of GOV.UK -most recent content

### Taxons

In [78]:
labelled = pd.read_csv(os.path.join(DATADIR, "labelled.csv.gz"), compression='gzip')

  interactivity=interactivity, compiler=compiler, result=result)


In [79]:
taxons = labelled.taxon_id.unique()

In [None]:
# taxon_homogeneity = []
# for taxon in progressbar.progressbar(taxons):
#     items_in_taxon = list(tag_corpus(labelled[labelled.taxon_id==taxon], tokens_only=True))
#     taxon_size = len(items_in_taxon)
#     cosine_results = []
    
#     for i in progressbar.progressbar(items_in_taxon):
#         for j in items_in_taxon:
#             cosine_results.append(cosine(model.infer_vector(i), model.infer_vector(j)))
    
#     mean_cosine_for_taxon = np.mean(np.array(cosine_results))
#     taxon_homogeneity.append([taxon, taxon_size, mean_cosine_for_taxon])

In [108]:
embeddings = []
text = list(tag_corpus(labelled, tokens_only=True))
for i in progressbar.progressbar(text):
    embedding = model.infer_vector(i)
    embeddings.append(embedding)

100% (285097 of 285097) |################| Elapsed Time: 1:43:36 Time:  1:43:36


In [109]:
embeddings_np = np.asarray(embeddings)

In [110]:
embeddings_np.shape

(285097, 50)

In [111]:
labelled.shape

(285097, 19)

In [116]:
taxon_homogeneity = []
for taxon in tqdm_notebook(taxons):
    embeddings_in_taxon = list(embeddings_np[labelled.taxon_id==taxon])
    taxon_size = len(embeddings_in_taxon)
    cosine_results = []
    
    for i in tqdm_notebook(embeddings_in_taxon):
        for j in embeddings_in_taxon:
            cosine_results.append(cosine(i, j))
    
    mean_cosine_for_taxon = np.mean(np.array(cosine_results))
    taxon_homogeneity.append([taxon, taxon_size, mean_cosine_for_taxon])

HBox(children=(IntProgress(value=0, max=1587), HTML(value='')))

HBox(children=(IntProgress(value=0, max=888), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2321), HTML(value='')))

HBox(children=(IntProgress(value=0, max=6723), HTML(value='')))

HBox(children=(IntProgress(value=0, max=7122), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5311), HTML(value='')))

HBox(children=(IntProgress(value=0, max=7935), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1510), HTML(value='')))

HBox(children=(IntProgress(value=0, max=6613), HTML(value='')))

HBox(children=(IntProgress(value=0, max=494), HTML(value='')))

HBox(children=(IntProgress(value=0, max=425), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2677), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1560), HTML(value='')))

HBox(children=(IntProgress(value=0, max=678), HTML(value='')))

HBox(children=(IntProgress(value=0, max=8284), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1508), HTML(value='')))

HBox(children=(IntProgress(value=0, max=9393), HTML(value='')))

HBox(children=(IntProgress(value=0, max=17448), HTML(value='')))

HBox(children=(IntProgress(value=0, max=138), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2065), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2401), HTML(value='')))

HBox(children=(IntProgress(value=0, max=458), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3955), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1354), HTML(value='')))

HBox(children=(IntProgress(value=0, max=288), HTML(value='')))

HBox(children=(IntProgress(value=0, max=130), HTML(value='')))

HBox(children=(IntProgress(value=0, max=53), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3479), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4682), HTML(value='')))

KeyboardInterrupt: 