In [1]:
# import gensim
import collections
import random
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.contrib.tensorboard.plugins import projector

import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns
from sklearn.model_selection import train_test_split
import progressbar
from tqdm import tnrange, tqdm_notebook
import scipy
from scipy.spatial.distance import cosine, pdist

import altair as alt


In [2]:
module_url = "https://tfhub.dev/google/universal-sentence-encoder/2" #@param ["https://tfhub.dev/google/universal-sentence-encoder/2", "https://tfhub.dev/google/universal-sentence-encoder-large/3"]

In [3]:
!mkdir ../combined_text_full

mkdir: ../combined_text_full: File exists


In [4]:
LOG_DIR = "../combined_text_full"
path_for_metadata = os.path.join(LOG_DIR,'metadata.tsv')

In [5]:
DATADIR = os.getenv("DATADIR")

In [6]:
labelled = pd.read_csv(os.path.join(DATADIR, 'labelled.csv.gz'), compression='gzip', low_memory=False)

In [7]:
taxon_id_to_base_path = dict(zip(labelled['taxon_id'], labelled['taxon_base_path']))
type(taxon_id_to_base_path)

dict

In [8]:
labelled.content_id.nunique()

202551

In [9]:
labelled['brexit'] = np.where(labelled['level2taxon']=='Brexit', 1, 0)

In [10]:
labelled.brexit.value_counts()

0    298142
1       918
Name: brexit, dtype: int64

In [11]:
# corpus_sample = labelled.sample(n=20000, random_state=1234)
corpus_sample = labelled.sample(n=2000, random_state=1234)
corpus = labelled['combined_text'].tolist()

In [12]:
corpus_sample[corpus_sample['taxon_base_path']=='/transport/rail-interoperability']

Unnamed: 0,base_path,content_id,description,document_type,first_published_at,locale,primary_publishing_organisation,publishing_app,title,body,combined_text,taxon_id,taxon_base_path,taxon_name,level1taxon,level2taxon,level3taxon,level4taxon,level5taxon,brexit


In [13]:
short_corpus=[]
for text in corpus:
    words = text.split()
    truncated = " ".join(words[0:300])
    short_corpus.append(truncated)

In [14]:
corpus_sample['brexit'] = np.where(corpus_sample['level2taxon']=='Brexit', 1, 0) 

In [15]:
corpus_sample.brexit.value_counts()

0    1992
1       8
Name: brexit, dtype: int64

In [16]:
with open(path_for_metadata,'w') as f:
    f.write("Index\tTitle\tTaxon1\tTaxon2\tbrexit\n")
    for index, row in corpus_sample.iterrows():
        f.write("{}\t{}\t{}\t{}\t{}\n".format(index,row['title'], row['level1taxon'],row['level2taxon'], row['brexit']))

In [17]:
# Import the Universal Sentence Encoder's TF Hub module
embed = hub.Module(module_url)

# Reduce logging output.
# tf.logging.set_verbosity(tf.logging.ERROR)
with tf.Session() as session:
    
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    embedded_sentences = session.run(embed(short_corpus))
#     session.run(embed(corpus))

with tf.Session() as sess:
    # for tensorboard
    emb = tf.Variable(embedded_sentences, name='embedded_sentences')
    sess.run(emb.initializer)
    config = projector.ProjectorConfig()
    summary_writer = tf.summary.FileWriter(LOG_DIR)
    config = projector.ProjectorConfig()
    embedding = config.embeddings.add()
    embedding.tensor_name = emb.name

    # Comment out if you don't have metadata
    embedding.metadata_path = path_for_metadata

    projector.visualize_embeddings(summary_writer, config)
    saver = tf.train.Saver([emb])
    saver.save(sess, os.path.join(LOG_DIR, 'combined_text_sample_full.ckpt'), 1)
    print("Model saved in path: %s" % os.path.join(LOG_DIR, 'combined_text_sample_full.ckpt'))

INFO:tensorflow:Using /var/folders/jy/47p744c95hz67738zkn74rwr0002j9/T/tfhub_modules to cache modules.
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
Model saved in path: ../combined_text_full/combined_text_sample_full.ckpt


In [18]:
taxons = corpus_sample['taxon_id'].unique()

In [19]:
embedded_sentences.shape

(299060, 512)

In [20]:
embedded_sentences[0]

array([ 5.09653287e-03, -3.03856395e-02,  4.20806296e-02,  3.83644179e-02,
        6.99705025e-03, -6.02658056e-02, -3.61361690e-02, -3.78090143e-02,
       -5.54754473e-02,  5.74985556e-02, -2.14449037e-02,  5.95337972e-02,
       -4.30984311e-02,  6.42405152e-02, -3.42633836e-02, -5.60437366e-02,
       -5.25676869e-02, -2.89425831e-02,  5.98353483e-02, -2.26976220e-02,
        2.38842587e-03, -2.36237124e-02, -2.46080514e-02,  6.68373406e-02,
        6.58454597e-02,  5.23099229e-02, -4.57108915e-02,  2.22072434e-02,
       -6.08198605e-02, -5.48073649e-02, -2.27992740e-02, -5.94624206e-02,
       -6.01998232e-02, -4.33398262e-02, -2.78542787e-02,  4.85054813e-02,
       -6.69691488e-02, -4.76457402e-02, -3.20245996e-02, -4.53154147e-02,
       -5.58042759e-03, -2.85843313e-02, -4.69415598e-02,  6.20114729e-02,
       -3.83535065e-02,  5.52074350e-02,  2.86188279e-03, -5.70123494e-02,
        5.16111702e-02, -6.61586449e-02,  5.19253947e-02, -5.30294664e-02,
        6.25233352e-02, -

In [21]:
np.save('embedded_sentences.npy', embedded_sentences)

### remove duplicate content items 
Haven't done this already because each row represents a different taxon. Need this for the embedding projector visualisation.



In [22]:
embedded_docs_unique = embedded_sentences[~labelled.duplicated('content_id')]


In [23]:
from itertools import compress
short_corpus_unique = list(compress(short_corpus, list(~labelled.duplicated('content_id').values)))

In [24]:
len(short_corpus_unique)

202551

In [25]:
labelled.drop_duplicates('content_id').shape

(202551, 20)

In [381]:
labelled_unique = labelled.drop_duplicates('content_id').copy()

In [27]:
len(embedded_docs_unique)

202551

### Brute force approaches (less memory)

In [100]:
#this woud probably take about 10 years to run

# doc_to_doc_df = pd.DataFrame(columns=['doc1', 'doc2', 'cosine'])
# for index1, vector1 in enumerate(tqdm_notebook(embedded_sentences)):
#     doc1 = labelled['content_id'].iloc[index1]
#     doc1_df = pd.DataFrame(columns=['doc1', 'doc2', 'cosine'])
#     for index2, vector2 in enumerate(tqdm_notebook(embedded_sentences)):
#         doc2 = labelled['content_id'].iloc[index2]
#         x = pd.DataFrame({'doc1': [doc1], 'doc2': [doc2], 'cosine': [cosine(vector1, vector2)]})
#         doc1_df = doc1_df.append(x, ignore_index=True)
#         doc1_df = doc1_df.sort_values('cosine').head(10)
#         doc_to_doc_df = pd.concat([doc_to_doc_df, doc1_df], ignore_index=True)      

### scipy pairwise matrix

In [None]:
# %timeit similarity_matrix = pdist(embedded_docs_unique[0:100], 'cosine')

In [None]:
# %timeit cosine(embedded_sentences[0], embedded_sentences[1])

# Sklearn pairwise

In [None]:
from sklearn.metrics import pairwise_distances_chunked


def reduce_func(D_chunk, start):
    top_k_indices = np.argpartition(D_chunk
                                    , range(20))[:20]
    return top_k_indices[ :, :20], D_chunk[:, top_k_indices[ :, :20]]

In [213]:
top_k_indices, D_chunk= next(gen)
top_k_indices

array([[     1,    269,   2302,    190,   2591, 125379,    181, 125344,
          2624,   2056, 125386, 144717,   6195,   5206,  43336,   1126,
          2737,   5738,   6258,   5634]])

In [221]:
D_chunk

array([[[0.        , 0.0879252 , 0.09235996, 0.10493118, 0.10539824,
         0.10968202, 0.10999715, 0.11638749, 0.1240173 , 0.13116467,
         0.13868237, 0.13911742, 0.14018703, 0.1414088 , 0.14152086,
         0.14347136, 0.14893925, 0.14912212, 0.15121639, 0.15260935]]],
      dtype=float32)

In [222]:
D_chunk[D_chunk<0.1].shape[0]

3

In [224]:
np.arange(0, 0.21, 0.01)

array([0.  , 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,
       0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 ])

In [225]:
np.empty((0,1), int)

array([], shape=(0, 1), dtype=int64)

In [237]:
embedded_docs_unique.shape

(202551, 512)

In [239]:
embedded_docs_unique[np.random.choice(embedded_docs_unique.shape[0], 2, replace=False), :]

array([[ 0.03912584,  0.01735858, -0.00092473, ..., -0.04953542,
        -0.01437556, -0.02579409],
       [ 0.05207291,  0.00755737,  0.00945574, ...,  0.01553024,
         0.02057066, -0.06777682]], dtype=float32)

### Distribution of number of links slected at different thresholds

In [387]:
gen = pairwise_distances_chunked(embedded_docs_unique, 
                                 reduce_func=reduce_func, 
                                 working_memory=0, 
                                 metric='cosine', 
                                 n_jobs=-1)

In [388]:
links_thresholds = pd.DataFrame(columns=['vector_id', 'thresholds', 'numlinks'])
for i, (_, D_chunk) in enumerate(tqdm_notebook(gen)):     
    numlinks = []
    for threshold in np.arange(0, 0.42, 0.02):
        numlinks.append(D_chunk[D_chunk<threshold].shape[0])
    i_links_thresholds = pd.DataFrame({'vector_id': i, 
                             'thresholds': np.arange(0, 0.42, 0.02), 
                             'numlinks': np.asarray(numlinks).astype(int)})
    links_thresholds = links_thresholds.append(i_links_thresholds, ignore_index=True)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

  (working_memory, np.ceil(row_bytes * 2 ** -20)))


KeyboardInterrupt: 

In [389]:
links_thresholds['numlinks'] = links_thresholds['numlinks'].astype(int)

In [390]:
links_thresholds['numlinks_minus_self'] = links_thresholds['numlinks']-1

In [391]:
links_thresholds['numlinks_minus_self'].groupby(links_thresholds['thresholds']).describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
thresholds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,2000.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0
0.02,2000.0,1.125,3.870771,0.0,0.0,0.0,0.0,19.0
0.04,2000.0,1.897,5.092382,0.0,0.0,0.0,0.0,19.0
0.06,2000.0,2.5295,5.657219,0.0,0.0,0.0,1.0,19.0
0.08,2000.0,3.447,6.26361,0.0,0.0,0.0,3.0,19.0
0.1,2000.0,5.03,7.10291,0.0,0.0,1.0,7.0,19.0
0.12,2000.0,7.1685,7.861362,0.0,0.0,3.0,19.0,19.0
0.14,2000.0,9.3515,8.090035,0.0,1.0,7.0,19.0,19.0
0.16,2000.0,11.6215,7.866713,0.0,3.0,16.0,19.0,19.0
0.18,2000.0,13.6655,7.248302,0.0,7.0,19.0,19.0,19.0


### Save out a list to spot check

In [409]:
labelled_unique.publishing_app.value_counts()

whitehall                163978
specialist-publisher      36122
publisher                  2285
hmrc-manuals-api             77
manuals-publisher            34
smartanswers                 31
collections-publisher        22
calendars                     2
Name: publishing_app, dtype: int64

In [431]:
labelled_unique.shape

(202551, 21)

In [432]:
embedded_docs_unique.shape

(202551, 512)

In [434]:
embedded_docs_publisher = embedded_docs_unique[labelled_unique['publishing_app']=='publisher']

In [414]:
labelled_unique['url'] = 'www.gov.uk'+labelled_unique.base_path

In [438]:
labelled_publisher = labelled_unique[labelled_unique['publishing_app']=='publisher']

In [435]:
embedded_docs_publisher.shape

(2285, 512)

In [439]:
labelled_publisher.shape

(2285, 21)

In [452]:
gen_pub = pairwise_distances_chunked(X=embedded_docs_publisher,
                                 Y=embedded_docs_unique, 
                       
                                     reduce_func=reduce_func, 
                                 working_memory=0, 
                                 metric='cosine', 
                                 n_jobs=-1)

In [None]:
labelled_unique['url'].iloc[0]

In [453]:
source_links = []
target_links = []
urls = pd.DataFrame(columns=['source_url', 'suggested_links', 'cosine_sims'])
for i, (indices, values) in enumerate(tqdm_notebook(gen_pub)):

    source_url = labelled_unique.iat[indices[0][0], -1]
    target_urls = pd.Series(labelled_unique['url'].iloc[indices.reshape(20)]) 
#     cosine_sims = pd.Series(values)
    i_urls = pd.DataFrame({'source_url': source_url, 
                             'suggested_links': target_urls, 
                             'cosine_sims': values.reshape(20)})
    urls = urls.append(i_urls, ignore_index=True)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

In [454]:
#get a sample with more mainstream content in it
url_publishing_dict = dict(zip(labelled_unique.url, labelled_unique.publishing_app))
urls_moremainstream = urls.copy()
urls_moremainstream['publishing_app'] = urls_moremainstream['source_url'].map(url_publishing_dict)
urls_moremainstream.publishing_app.value_counts()

publisher    45700
Name: publishing_app, dtype: int64

In [457]:
4000/20

200.0

In [458]:
urls.to_csv("spot_check_google_links_publisher_all.csv", index=False)

These links ALL look really bad. 
Need to check indexing. 
Start with doc2vec?

### Generate links for ingesting into content_store

In [400]:
def reduce_to_top_3(D_chunk, start):
    top_k_indices = np.argpartition(D_chunk
                                    , range(3))[:3]
    return top_k_indices[ :, :3], D_chunk[:, top_k_indices[ :, :3]]

In [403]:
gen3 = pairwise_distances_chunked(embedded_docs_unique, 
                                 reduce_func=reduce_to_top_3, 
                                 working_memory=0, 
                                 metric='cosine', 
                                 n_jobs=-1)

In [404]:
# import json
# with open('data.txt', 'w') as f:
#   json.dump(data, f, ensure_ascii=False)


source_links = []
target_links = []

for i, (indices, values) in enumerate(tqdm_notebook(gen3)):
    
    source_content_id = labelled_unique.iat[i, 1]
    target_content_ids = list(labelled_unique['content_id'][labelled_unique.index.isin(indices.reshape(3))]) 
    source_links.append(source_content_id)
    target_links.append(target_content_ids)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

KeyboardInterrupt: 

In [405]:
import json
with open('sample_of_data_29NOV19_3links.json', 'w') as f:
  json.dump(dict(zip(source_links, target_links)), f, ensure_ascii=False)

In [90]:
pd.DataFrame(D_chunk.transpose()).sort_values(0, ascending=False).head(5)

Unnamed: 0,0
70692,1.036419
169865,1.032602
67273,1.024274
118524,1.024116
69065,1.022117


In [91]:
D_chunk[:, top_k_indices][:, :, :5]

array([[[1.0364193, 1.0326018, 1.024274 , 1.0241157, 1.0221168]]],
      dtype=float32)

In [92]:
top_k_indices[ :, :20]

array([[ 70692, 169865,  67273, 118524,  69065, 185805,  65504,  65408,
        198593, 197511, 200417, 200539, 105339,  64160, 155094, 196342,
        202150, 185842,  87264,  84817]])

In [100]:
cosine(embedded_docs_unique[1], embedded_docs_unique[35802])

1.0468005575239658

In [63]:
cosine(embedded_docs_unique[0], embedded_docs_unique[169865])

1.0326018668711185

In [128]:
labelled_unique['combined_text'][1]

'shingles vaccine update supply of shingles vaccine will be restored as of today (4 december 2013). shingles or herpes zoster is an infection of a nerve and the area of skin around it. in serious cases it causes a rash of very painful fluid filled blisters on the skin that can burst and turn into sores that eventually crust over and heal. people aged 70 and 79 on 1 september 2013 are being offered the shingles vaccine this year as part of a vaccination programme to reduce the incidence and severity of shingles disease in older people. the cohort of 70 to 79 year olds was identified as the group most likely to benefit from vaccination because they have a high incidence of shingles and related complications. above the age of 80 the vaccine effectiveness decreases substantially so the joint committee on vaccination and immunisation ( jcvi ) decided that the optimal age to routinely offer protection to those at risk would be at 70 years of age with a phased catch up programme for those age

In [133]:
labelled_unique['combined_text'][190]

'people aged 70 and 79 years offered shingles vaccine this vaccination programme aims to reduce the incidence and severity of shingles disease in older people. people aged 70 and 79 will be offered a shingles vaccination this year as part of a vaccination programme to reduce the incidence and severity of shingles disease in older people. shingles or herpes zoster is an infection of a nerve and the area of skin around it. in serious cases it causes a rash of very painful fluid filled blisters on the skin that can burst and turn into sores that eventually crust over and heal. a catch up programme to protect those aged 70 to 79 years will also be rolled out over several years starting with those aged 79 years on 1st september 2013. it’s estimated 800 000 people in the uk will be eligible for the vaccine in the first year. dr paul cosford director for health protection and medical director at phe said: shingles is caused by the same virus that causes chickenpox. when you recover from chick

For each taxon get a mean cosine distance for all pairs within the taxon

In [None]:
taxon_homogeneity = []
for taxon in tqdm_notebook(taxons):
  taxon_embeddings = embedded_sentences[corpus_sample['taxon_id']==taxon]
  taxon_size = taxon_embeddings.shape[0]
  cosine_results = []
  for i in taxon_embeddings:
    for j in taxon_embeddings:
      cosine_results.append(cosine(i, j))
  mean_cosine_for_taxon = np.mean(np.array(cosine_results))
  

  taxon_homogeneity.append([taxon, taxon_size, mean_cosine_for_taxon])
      
  

In [None]:
taxon_homogeneity_df = pd.DataFrame(taxon_homogeneity, columns = ['taxon_id', 'taxon_size', 'mean_cosine_score']).sort_values('mean_cosine_score', ascending=False)
taxon_homogeneity_df['taxon_base_path'] = taxon_homogeneity_df['taxon_id'].map(taxon_id_to_base_path)
taxon_homogeneity_df.shape

In [None]:
taxon_homogeneity_df

In [None]:
alt.Chart(taxon_homogeneity_df).mark_circle(size=60).encode(
    x='taxon_size',
    y='mean_cosine_score',
    tooltip=['taxon_base_path']
).interactive()

For each content item, Find the nearest pages