In [1]:
# import gensim
import collections
import random
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.contrib.tensorboard.plugins import projector

import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns
from sklearn.model_selection import train_test_split
import progressbar

import altair as alt


In [2]:
module_url = "https://tfhub.dev/google/universal-sentence-encoder/2" #@param ["https://tfhub.dev/google/universal-sentence-encoder/2", "https://tfhub.dev/google/universal-sentence-encoder-large/3"]

In [3]:
LOG_DIR = "universal_embeddings"
path_for_metadata = os.path.join(LOG_DIR,'metadata.tsv')

In [4]:
DATADIR = os.getenv("DATADIR")

In [5]:
labelled = pd.read_csv(os.path.join(DATADIR, 'labelled.csv.gz'), compression='gzip', low_memory=False)

In [6]:
labelled.tail()

Unnamed: 0,base_path,content_id,description,document_type,first_published_at,locale,primary_publishing_organisation,publishing_app,title,body,combined_text,taxon_id,taxon_base_path,taxon_name,level1taxon,level2taxon,level3taxon,level4taxon,level5taxon
285092,/guidance/tonnage-tax-for-shipping-companies,601f0b66-7631-11e4-a3cb-005056011aef,understand how you can pay tonnage tax if you ...,detailed_guide,2011-12-13T00:00:00.000+00:00,en,HM Revenue & Customs,whitehall,find out how to pay tonnage tax if you're a sh...,overview tonnage tax is a form of corporation ...,find out how to pay tonnage tax if you're a sh...,0f0ba370-1824-4326-85a3-c4038a0a3783,/money/business-tax-international-tax-shipping,Shipping,Money,Business tax,International tax,Shipping,
285093,/government/collections/infectious-diseases-in...,bf520108-e372-4a6e-ad8d-9e7ed7bd065d,information for healthcare professionals provi...,document_collection,2016-03-02T15:04:00.000+00:00,en,Public Health England,whitehall,infectious diseases in pregnancy screening: cl...,these documents explain the procedures for pro...,infectious diseases in pregnancy screening: cl...,1fe05c70-0bd9-4725-9ed4-3abc0f5f7961,/health-and-social-care/population-screening-p...,Commission and provide services,Health and social care,Population screening programmes,NHS infectious diseases in pregnancy screening...,Commission and provide services,
285094,/guidance/rhoi-gwybod-am-farwolaethau-gwartheg...,5fa585e4-7631-11e4-a3cb-005056011aef,mae'n rhaid i ladd-dai roi gwybod am symudiada...,detailed_guide,2016-04-28T10:18:35.000+00:00,en,British Cattle Movement Service,whitehall,rhoi gwybod am farwolaethau gwartheg ar adeg e...,mae’n rhaid i ladd dai wneud y canlynol: rhoi ...,rhoi gwybod am farwolaethau gwartheg ar adeg e...,322251ef-586c-47df-8869-fa73b6e3aa7c,/environment/keeping-farmed-animals-cattle-dea...,Cymraeg,Environment,Food and farming,Keeping farmed animals,Cattle deaths,Cymraeg
285095,/guidance/rhoi-gwybod-am-farwolaethau-gwartheg...,5fa56edb-7631-11e4-a3cb-005056011aef,pan fydd gwartheg yn marw ar y fferm neu ar sa...,detailed_guide,2016-04-28T10:18:35.000+00:00,en,British Cattle Movement Service,whitehall,rhoi gwybod am farwolaethau gwartheg ar y dali...,mae’n rhaid i chi roi gwybod am unrhyw wartheg...,rhoi gwybod am farwolaethau gwartheg ar y dali...,322251ef-586c-47df-8869-fa73b6e3aa7c,/environment/keeping-farmed-animals-cattle-dea...,Cymraeg,Environment,Food and farming,Keeping farmed animals,Cattle deaths,Cymraeg
285096,/guidance/british-forces-post-office-services,5c8f52ae-7631-11e4-a3cb-005056011aef,how to use the british forces post office clai...,detailed_guide,2012-12-12T19:27:00.000+00:00,en,Ministry of Defence,whitehall,bfpo services guide,follow our short guide to find out how to send...,bfpo services guide how to use the british for...,ffedc568-ce84-43f6-b2a8-d749f24c13c7,/defence/defence-armed-forces-support-services...,British Forces Post Office,Defence,Support services for military and defence pers...,British Forces Post Office,,


In [7]:
taxon_id_to_base_path = dict(zip(labelled['taxon_id'], labelled['taxon_base_path']))
type(taxon_id_to_base_path)

dict

In [8]:
labelled.content_id.nunique()

187073

In [9]:
labelled['brexit'] = np.where(labelled['level2taxon']=='Brexit', 1, 0)

In [10]:
labelled.brexit.value_counts()

0    284604
1       493
Name: brexit, dtype: int64

In [11]:
corpus_sample = labelled.sample(n=20000, random_state=1234)
corpus = corpus_sample['combined_text'].tolist()

In [None]:
short_corpus=[]
for text in corpus:
    words = text.split()
    truncated = " ".join(words[0:200])
    short_corpus.append(truncated)

In [12]:
corpus_sample['brexit'] = np.where(corpus_sample['level2taxon']=='Brexit', 1, 0)

In [13]:
corpus_sample.brexit.value_counts()

0    19964
1       36
Name: brexit, dtype: int64

In [14]:
with open(os.path.join(LOG_DIR,'metadata.tsv'),'w') as f:
    f.write("Index\tTitle\tTaxon1\tTaxon2\tbrexit\n")
    for index, row in corpus_sample.iterrows():
        f.write("{}\t{}\t{}\t{}\t{}\n".format(index,row['title'], row['level1taxon'],row['level2taxon'], row['brexit']))

In [15]:
# Import the Universal Sentence Encoder's TF Hub module
embed = hub.Module(module_url)

# Reduce logging output.
# tf.logging.set_verbosity(tf.logging.ERROR)
with tf.Session() as session:
    
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    embedded_sentences = session.run(embed(short_corpus))
#     session.run(embed(corpus))

with tf.Session() as sess:
    # for tensorboard
    emb = tf.Variable(embedded_sentences, name='embedded_sentences')
    sess.run(emb.initializer)
    config = projector.ProjectorConfig()
    summary_writer = tf.summary.FileWriter(LOG_DIR)
    config = projector.ProjectorConfig()
    embedding = config.embeddings.add()
    embedding.tensor_name = emb.name

    # Comment out if you don't have metadata
    embedding.metadata_path = os.path.join('metadata.tsv')

    projector.visualize_embeddings(summary_writer, config)
    saver = tf.train.Saver([emb])
    saver.save(sess, os.path.join(LOG_DIR, 'model2.ckpt'), 1)
    print("Model saved in path: %s" % os.path.join(LOG_DIR, 'model2.ckpt'))

#   for i, train_embedding in enumerate(np.array(train_embeddings).tolist()):
#     print("Text: {}".format(train_corpus[i]))
#     print("Embedding size: {}".format(len(train_embedding)))
#     train_embedding_snippet = ", ".join(
#         (str(x) for x in train_embedding[:3]))
#     print("Embedding: [{}, ...]\n".format(train_embedding_snippet))

INFO:tensorflow:Using /var/folders/jy/47p744c95hz67738zkn74rwr0002j9/T/tfhub_modules to cache modules.
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


NameError: name 'save_path' is not defined

In [None]:
import tensorflow.contrib.slim as slim
def model_summary():
    model_vars = tf.trainable_variables()
    slim.model_analyzer.analyze_vars(model_vars, print_info=True)

model_summary()

In [None]:
from sklearn.manifold import TSNE

print(embedded_sentences.shape)
tsne_emb = TSNE(n_components=2).fit_transform(embedded_sentences)
tsne_emb.shape

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.scatter(tsne_emb[:, 0], tsne_emb[:, 1],s=0.1)


for i, txt in enumerate(corpus_sample['title']):
    if (i % 50 == 0):
        ax.annotate(txt, (tsne_emb[:, 0][i], tsne_emb[:, 1][i]))
    

In [None]:


tsne_df = pd.DataFrame({'dimension1':tsne_emb[:,0],'dimension2':tsne_emb[:,1], 'taxon':corpus_sample['taxon_name']})
import seaborn as sns
 
# Use the 'hue' argument to provide a factor variable
sns.lmplot( x="dimension1", y="dimension2", data=tsne_df, fit_reg=False, hue='taxon', legend=False)

 
# Move the legend tout of the plot
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
 
#sns.plt.show()

    

In [None]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2).fit_transform(embedded_sentences)

In [None]:
pca_df = pd.DataFrame({'dimension1':pca[:,0],'dimension2':pca[:,1], 'taxon':corpus_sample['taxon_name']})
 
# Use the 'hue' argument to provide a factor variable
sns.lmplot( x="dimension1", y="dimension2", data=pca_df, fit_reg=False, hue='taxon', legend=False)

 
# Move the legend tout of the plot
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
 
#sns.plt.show()

In [None]:
# Create some variables.
# emb = tf.Variable(embeddings, name='word_embeddings')
# # Save the variables to disk.
# save_path = saver.save(session, "model_dir/model.ckpt")
# print("Model saved in path: %s" % save_path)


In [None]:
def plot_similarity(labels, features, rotation):
  corr = np.inner(features, features)
  sns.set(font_scale=1.2)
  g = sns.heatmap(
      corr,
      xticklabels=labels,
      yticklabels=labels,
      vmin=0,
      vmax=1,
      cmap="YlOrRd")
  g.set_xticklabels(labels, rotation=rotation)
  g.set_title("Semantic Textual Similarity")


def run_and_plot(session_, input_tensor_, messages_, encoding_tensor):
  message_embeddings_ = session_.run(
      encoding_tensor, feed_dict={input_tensor_: messages_})
  plot_similarity(messages_, message_embeddings_, 90)

In [None]:
# similarity_input_placeholder = tf.placeholder(tf.string, shape=(None))
# similarity_encodings = embed(similarity_input_placeholder)
# with tf.Session() as session:
#   session.run(tf.global_variables_initializer())
#   session.run(tf.tables_initializer())
#   run_and_plot(session, similarity_input_placeholder, train_corpus,
#                similarity_encodings)

In [None]:
import scipy
from scipy.spatial.distance import cosine

In [None]:
cosine(train_embeddings[0], train_embeddings[0])

In [None]:
def find_most_similar(index_document_embedding, all_document_embeddings):
  lowest_cosine = np.float(1)
  nearest_index = 0
  for index, document_embedding in enumerate(all_document_embeddings):
    cos = cosine(index_document_embedding, document_embedding)
    
    if cos < lowest_cosine:
      lowest_cosine = cos
      nearest_index = index
      
      
  return lowest_cosine, nearest_index

found_themselves_nearest =[]
for first_index, document in enumerate(train_embeddings):
  cos, second_index = find_most_similar(document, train_embeddings)
  print(first_index, second_index)
  found_itself_nearest = int(np.where(first_index==second_index, 1, 0))
  found_themselves_nearest.append(found_itself_nearest)
  

In [None]:
labelled_sample = labelled.sample(n=10)

labelled_corpus = labelled['combined_text'].tolist()

In [None]:
# Import the Universal Sentence Encoder's TF Hub module
embed = hub.Module(module_url)


# Reduce logging output.
tf.logging.set_verbosity(tf.logging.ERROR)

with tf.Session() as session:
  session.run([tf.global_variables_initializer(), tf.tables_initializer()])
  embeddings = session.run(embed(labelled_corpus))

  for i, train_embedding in enumerate(np.array(embeddings).tolist()):
    print("Text: {}".format(train_corpus[i]))
    print("Embedding size: {}".format(len(train_embedding)))
    train_embedding_snippet = ", ".join(
        (str(x) for x in train_embedding[:3]))
    print("Embedding: [{}, ...]\n".format(train_embedding_snippet))

In [None]:
taxons = corpus_sample['taxon_id'].unique()

In [None]:
taxon_embeddings = embeddings[corpus_sample['taxon_id']=='1327984f-95e0-4ca7-94c7-c63e69c30924']

cosine_results = []
for i in taxon_embeddings:
  for j in taxon_embeddings:
    cosine_results.append(cosine(i, j))
mean_cosine_for_taxon = np.mean(np.array(cosine_results))

In [None]:
mean_cosine_for_taxon

In [None]:
taxon_homogeneity = []
for taxon in progressbar.progressbar(taxons):
  taxon_embeddings = embeddings[corpus_sample['taxon_id']==taxon]
  taxon_size = taxon_embeddings.shape[0]
  cosine_results = []
  for i in taxon_embeddings:
    for j in taxon_embeddings:
      cosine_results.append(cosine(i, j))
  mean_cosine_for_taxon = np.mean(np.array(cosine_results))
  

  taxon_homogeneity.append([taxon, taxon_size, mean_cosine_for_taxon])
      
  

In [None]:
taxon_homogeneity_df = pd.DataFrame(taxon_homogeneity, columns = ['taxon_id', 'taxon_size', 'mean_cosine_score']).sort_values('mean_cosine_score', ascending=False)
taxon_homogeneity_df['taxon_base_path'] = taxon_homogeneity_df['taxon_id'].map(taxon_id_to_base_path)
taxon_homogeneity_df

In [None]:
alt.Chart(taxon_homogeneity_df).mark_circle(size=60).encode(
    x='taxon_size',
    y='mean_cosine_score',
    tooltip=['taxon_base_path']
).interactive()

# Inferring a Vector
One important thing to note is that you can now infer a vector for any piece of text without having to re-train the model by passing a list of words to the model.infer_vector function. This vector can then be compared with other vectors via cosine similarity.

In [None]:
print(model.infer_vector(train_corpus[0].words))
print(train_corpus[0].tags)

Note that infer_vector() does not take a string, but rather a list of string tokens, which should have already been tokenized the same way as the words property of original training document objects.

Also note that because the underlying training/inference algorithms are an iterative approximation problem that makes use of internal randomization, repeated inferences of the same text will return slightly different vectors.

# Assessing Model
To assess our new model, we'll first infer new vectors for each document of the training corpus, compare the inferred vectors with the training corpus, and then returning the rank of the document based on self-similarity. Basically, we're pretending as if the training corpus is some new unseen data and then seeing how they compare with the trained model. The expectation is that we've likely overfit our model (i.e., all of the ranks will be less than 2) and so we should be able to find similar documents very easily. Additionally, we'll keep track of the second ranks for a comparison of less similar documents.

In [None]:
ranks = []
second_ranks = []

for doc_id in progressbar.progressbar(range(len(train_corpus))):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    found_itself_nearest = int(np.where(sims[0][0]==train_corpus[doc_id].tags[0], 1, 0))
#     rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(found_itself_nearest)
    
    second_ranks.append(sims[1])

Let's count how each document ranks with respect to the training corpus

In [None]:
collections.Counter(ranks)  # Results vary between runs due to random seeding and very small corpus

In [None]:
147620/162461*100

Basically, 91% of the inferred documents are found to be most similar to itself and 9% it is mistakenly most similar to another document. the checking of an inferred-vector against a training-vector is a sort of 'sanity check' as to whether the model is behaving in a usefully consistent manner, though not a real 'accuracy' value.

In [None]:
len(ranks)

In [None]:
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print('({},{}),{})'.format(label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

#### Sampling docs to get a faster measure of global auto-similarity

##### TRAIN

In [None]:
def evaluate_model(train_corpus):
    train_auto_nearest = []
    random.seed(1234)
    sample_1000 = random.sample(train_corpus, 1000)

    for doc_id in progressbar.progressbar(range(len(sample_1000))):
        inferred_vector = model.infer_vector(sample_1000[doc_id].words)
        sims = model.docvecs.most_similar([inferred_vector], topn=2)
        found_itself_nearest = int(np.where(sims[0][0]==sample_1000[doc_id].tags[0], 1, 0))
        train_auto_nearest.append(found_itself_nearest)
    
    
    x = collections.Counter(train_auto_nearest)
    train_percent_auto_similar = x[1]/(x[0]+x[1])*100
    
    
    
    print("The percentage of 1000 training samples which found itself nearest = {}".format(train_percent_auto_similar
                                                                                          )
         )
    return train_percent_auto_similar 


In [None]:
evaluate_model(train_corpus)

~91% auto-similarity in the sample of 1000. Think this is a viable approach for measuring models

# Testing the Model
Using the same approach above, we'll infer the vector for a randomly chosen test document, and compare the document to our model by eye.

In [None]:
# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('SECOND', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

In [None]:
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.docvecs.most_similar([inferred_vector], topn=10)

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('SECOND', 1), ('THIRD', 2), ('FOURTH', 3), ('FIFTH', 4)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))