In [1]:
import os
import pandas as pd
import numpy as np
import networkx as nx
import itertools

## Directories and filenames

In [2]:
# os.listdir('../../data')

In [3]:
DATA_DIR = os.getenv("DATA_DIR")
content_api = os.path.join(DATA_DIR, "content_api")

labelled_file = os.path.join(content_api,"labelled.csv.gz")
edgefile = os.path.join(DATA_DIR, "processed_network", "edges_graphsagetest_feb_01_18_doo_min15weight_wtext.csv.gz")

### 2.1 Labelled data

In [4]:
labelled = pd.read_csv(labelled_file, compression="gzip")

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
labelled.columns

Index(['base_path', 'content_id', 'description', 'document_type',
       'first_published_at', 'locale', 'primary_publishing_organisation',
       'publishing_app', 'title', 'body', 'combined_text', 'taxon_id',
       'taxon_base_path', 'taxon_name', 'level1taxon', 'level2taxon',
       'level3taxon', 'level4taxon', 'level5taxon'],
      dtype='object')

In [6]:
labelled.shape, labelled[labelled.publishing_app=="publisher"].shape

((302474, 19), (3563, 19))

### 2.2 Training edge list, use for node id mappings

In [7]:
edges = pd.read_csv(edgefile, compression='gzip')

In [8]:
edges

Unnamed: 0,src_node,dest_node,weight,src_cid,dest_cid,source,target
0,/government/publications/guidance-for-dependan...,/visa-fees,66,5ef7560d-7631-11e4-a3cb-005056011aef,1e333395-5dd5-4452-96a3-fbe939928761,33265,58314
1,/visa-fees,/find-a-visa-application-centre,1164,1e333395-5dd5-4452-96a3-fbe939928761,aa055bd6-fde1-4471-8ac2-ac86241b4898,58314,12343
2,/entering-staying-uk/family-visas,/uk-family-visa,377,d612c61e-22f4-4922-8bb2-b04b9202126e,305acc88-488e-44ec-8e5a-1a5de7819ba9,23720,25053
3,/uk-family-visa,/uk-family-visa/partner-spouse,26227,305acc88-488e-44ec-8e5a-1a5de7819ba9,305acc88-488e-44ec-8e5a-1a5de7819ba9,25053,14044
4,/uk-family-visa/partner-spouse,/government/publications/application-for-uk-vi...,2733,305acc88-488e-44ec-8e5a-1a5de7819ba9,5ef421be-7631-11e4-a3cb-005056011aef,14044,33797
5,/government/publications/application-for-uk-vi...,/uk-family-visa/provide-information,116,5ef421be-7631-11e4-a3cb-005056011aef,305acc88-488e-44ec-8e5a-1a5de7819ba9,33797,29781
6,/uk-family-visa/provide-information,/apply-to-come-to-the-uk,32,305acc88-488e-44ec-8e5a-1a5de7819ba9,76698ffe-70ab-4fda-be0d-755234f6d340,29781,24233
7,/apply-to-come-to-the-uk,/apply-to-come-to-the-uk/prepare-your-application,43994,76698ffe-70ab-4fda-be0d-755234f6d340,76698ffe-70ab-4fda-be0d-755234f6d340,24233,20546
8,/apply-to-come-to-the-uk/prepare-your-application,/visa-processing-times,4669,76698ffe-70ab-4fda-be0d-755234f6d340,72ed754c-4c82-415f-914a-ab6760454cb4,20546,31442
9,/visa-processing-times,/apply-to-come-to-the-uk/prepare-your-application,1220,72ed754c-4c82-415f-914a-ab6760454cb4,76698ffe-70ab-4fda-be0d-755234f6d340,31442,20546


### 2.4 Read in top GOV.UK pages for February 2019
Set up a subsmaple to predict for

In [9]:
top = pd.read_csv(os.path.join(DATA_DIR, "top-pages-govuk-feb19.tsv"), sep='\t', usecols=['Page', 'Page Views'])

In [10]:
top['Page'] = top['Page'].map(lambda x: x.replace("https://www.integration.publishing.service.gov.uk",""))
top['Page Views'] = top['Page Views'].map(lambda x: x.replace(',', ''))
top['Page Views'] = top['Page Views'].astype("float")

#### Check which top pages are included in the `labelled` dataset.

In [11]:
top = top[top.Page.isin(labelled.base_path.values)]

In [12]:
top.head()

Unnamed: 0,Page,Page Views
2,/sign-in-universal-credit,8221833.0
3,/log-in-register-hmrc-online-services,6564918.0
4,/sold-bought-vehicle,4722768.0
5,/vehicle-tax,4670761.0
6,/check-mot-history,4666636.0


## 3. Initialize train/test generator data

In [13]:
labelled_cid = set(labelled.content_id.values)
len(labelled_cid)

206038

In [14]:
cid_base = dict(zip(labelled.content_id, labelled.base_path))
base_cid = dict(zip(labelled.base_path, labelled.content_id))

In [15]:
# cid_pairs = []
# basepath_pairs = []
# for v,w in itertools.product([list(mainstream)[1]],labelled_cid):
#     if v!=w:
#         cid_pairs.append((v,w))
#         basepath_pairs.append((cid_base[v], cid_base[w]))
# len(cid_pairs), len(basepath_pairs)

### 3.1 Set up dataframe with top 50 pages

In [16]:
type(top["Page Views"].iloc[0])

numpy.float64

In [17]:
top.sort_values("Page Views", ascending=False)[0:10]

Unnamed: 0,Page,Page Views
2,/sign-in-universal-credit,8221833.0
3,/log-in-register-hmrc-online-services,6564918.0
4,/sold-bought-vehicle,4722768.0
5,/vehicle-tax,4670761.0
6,/check-mot-history,4666636.0
8,/log-in-file-self-assessment-tax-return,3968998.0
10,/state-pension-age,3792312.0
11,/check-vehicle-tax,3769755.0
12,/get-information-about-a-company,3632017.0
13,/pay-self-assessment-tax-bill,3631506.0


In [18]:
### top base_paths to content_ids
top['content_id'] = top['Page'].map(base_cid)
top_50_ids = top.sort_values("Page Views", ascending=False)['content_id'][0:50].values

In [None]:
top

In [19]:
cid_pairs = []
basepath_pairs = []
for v,w in itertools.product(top_50_ids, labelled_cid):
    if v!=w:
        cid_pairs.append((v,w))
        basepath_pairs.append((cid_base[v], cid_base[w]))
len(cid_pairs), len(basepath_pairs)

(10301850, 10301850)

### 3.2 Set up test set for prediction, top pages against the entirety of `labelled`

In [20]:
predict_test = pd.DataFrame({'src_cid':[s for s,_ in cid_pairs], 
                             'dest_cid':[t for _,t in cid_pairs],
                             'src_node':[s for s,_ in basepath_pairs], 
                             'dest_node':[t for _,t in basepath_pairs],
                              })

In [21]:
predict_test.head()

Unnamed: 0,src_cid,dest_cid,src_node,dest_node
0,b220a437-0d51-4390-9993-63345d0c83ad,a4481bd0-8986-4a85-a445-44f10f715349,/sign-in-universal-credit,/employment-tribunal-decisions/mr-m-gurney-v-b...
1,b220a437-0d51-4390-9993-63345d0c83ad,953e4007-b6ce-4d8d-aa20-05a34c3e3bf4,/sign-in-universal-credit,/government/news/moj-unit-in-crackdown-on-rogu...
2,b220a437-0d51-4390-9993-63345d0c83ad,44997587-823b-4663-be55-d5636e406828,/sign-in-universal-credit,/aaib-reports/piper-pa-38-112-tomahawk-g-bjur-...
3,b220a437-0d51-4390-9993-63345d0c83ad,c4b30e72-0311-466f-87a5-35900fb69dda,/sign-in-universal-credit,/guidance/veterinary-medicines-summary-of-prod...
4,b220a437-0d51-4390-9993-63345d0c83ad,4ced5048-608f-4729-96a2-9b1cb12cea19,/sign-in-universal-credit,/government/statistics/uk-house-price-index-oc...


#### *TODO:* Base node_ids on base_paths, not content_id

In [22]:
predict_test.columns

Index(['src_cid', 'dest_cid', 'src_node', 'dest_node'], dtype='object')

In [23]:
node_id = dict(zip(edges.src_node, edges.source))
for node,nid in zip(edges.dest_node, edges.target):
    if node not in node_id.keys():
        node_id[node] = nid

In [24]:
len(node_id)

66619

In [25]:
max_node_id = max(edges.source.max(),edges.target.max()) + 1
for node in set(list(predict_test.src_node) + list(predict_test.dest_node.values)):
    if node not in node_id.keys():
        node_id[node] = max_node_id
        max_node_id+=1

In [26]:
max_node_id

255643

In [27]:
predict_test['source'] = predict_test['src_node'].map(node_id)
predict_test['target'] = predict_test['dest_node'].map(node_id)

In [49]:
predict_test['label'] = "go_to"

### 3.3 Compute embedding vectors 

In [28]:
cid_texts = list(zip(labelled.content_id, labelled.combined_text))

In [51]:
len(cid_texts)

302474

In [52]:
labelled.shape

(302474, 19)

In [30]:
len(cid_texts)

302474

In [40]:
cids = [cid for cid, _ in cid_texts]
texts = [" ".join([token.strip() for token in text.split(" ")[0:300]]) for _, text in cid_texts]

In [41]:
texts[0]

'reims cessna fa152 g-bgxz 27 may 1988   reims cessna fa152 g bgxz download report: reims cessna fa152 g bgxz 09 88.pdf (76.87 kb)'

In [44]:
import tensorflow as tf
import tensorflow_hub as hub
module_url = "https://tfhub.dev/google/universal-sentence-encoder/2"
tf.logging.set_verbosity(tf.logging.DEBUG)
embed = hub.Module(module_url)

with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    embeddings = session.run(embed(texts))

DEBUG:tensorflow:Initialize variable module_1/Embeddings_en/sharded_0:0 from checkpoint b'/var/folders/9s/cvtxyh055w94_xfm_cwjrngh000121/T/tfhub_modules/1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47/variables/variables' with Embeddings_en/sharded_0
DEBUG:tensorflow:Initialize variable module_1/Embeddings_en/sharded_1:0 from checkpoint b'/var/folders/9s/cvtxyh055w94_xfm_cwjrngh000121/T/tfhub_modules/1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47/variables/variables' with Embeddings_en/sharded_1
DEBUG:tensorflow:Initialize variable module_1/Embeddings_en/sharded_10:0 from checkpoint b'/var/folders/9s/cvtxyh055w94_xfm_cwjrngh000121/T/tfhub_modules/1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47/variables/variables' with Embeddings_en/sharded_10
DEBUG:tensorflow:Initialize variable module_1/Embeddings_en/sharded_11:0 from checkpoint b'/var/folders/9s/cvtxyh055w94_xfm_cwjrngh000121/T/tfhub_modules/1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47/variables/variables' with Embeddings_en/sharded_11
DEBUG:tensorflow:Ini

DEBUG:tensorflow:Initialize variable module_1/SNLI/Classifier/tanh_layer_0/weights:0 from checkpoint b'/var/folders/9s/cvtxyh055w94_xfm_cwjrngh000121/T/tfhub_modules/1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47/variables/variables' with SNLI/Classifier/tanh_layer_0/weights
DEBUG:tensorflow:Initialize variable module_1/global_step:0 from checkpoint b'/var/folders/9s/cvtxyh055w94_xfm_cwjrngh000121/T/tfhub_modules/1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47/variables/variables' with global_step
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [45]:
len(embeddings)

302474

### Set `node_data` from vectors
Index matches up to `node_id` in edge list dataframe

In [54]:
cid_embs = dict(zip(cids, embeddings))

In [55]:
nid_cid = dict(zip(predict_test['source'], predict_test['src_cid']))
for nid,cid in zip(predict_test['target'], predict_test['dest_cid']):
    if nid not in nid_cid.keys():
        nid_cid[nid] = cid

In [56]:
rowlist = []
index = []
for nid in set(list(predict_test['source']) + list(predict_test['target'])):
    row_dict = {}
    row_dict["content_id"] = nid_cid[nid]
    for i,emb in enumerate(cid_embs[nid_cid[nid]]):
        row_dict[i] = emb
    rowlist.append(row_dict)  
    index.append(nid)
node_data = pd.DataFrame(rowlist, index=index)

In [57]:
node_data

Unnamed: 0,content_id,0,1,2,3,4,5,6,7,8,...,502,503,504,505,506,507,508,509,510,511
0,80f7a69c-5cdc-4bd5-a16c-15d669ba2cc5,0.014993,0.024639,0.058578,0.008009,0.008096,-0.062776,0.059777,0.005778,-0.008614,...,0.036414,0.065568,0.044270,-0.046852,0.035862,-0.043941,0.009524,-0.037714,-0.001525,-0.063660
2,5ef9b7da-7631-11e4-a3cb-005056011aef,-0.002469,-0.009080,0.014424,-0.018611,0.004600,-0.048021,0.067327,0.030089,-0.014283,...,-0.043220,0.057214,0.015740,-0.032861,0.047954,-0.016275,0.011618,-0.041357,0.011032,-0.018847
7,763fd665-f01d-469a-88ad-ccc434e5ac6d,-0.042398,-0.042146,0.045832,0.051937,0.016045,-0.027442,-0.004913,-0.051645,-0.065304,...,0.001117,-0.012335,-0.066261,0.019525,0.064946,-0.060854,-0.000948,-0.055518,-0.010972,-0.059757
9,2e7ccd7e-7294-43cf-9576-145e06dd9b3d,0.007414,-0.020822,0.054993,0.052245,0.019539,-0.061916,0.062500,-0.034365,-0.004120,...,0.042527,0.059163,-0.029500,-0.045457,0.045245,-0.041334,0.021714,-0.027587,0.014631,-0.061495
12,30ac0189-560d-43e4-9f19-7a10a7b4d280,-0.054434,0.029956,0.041951,-0.020830,-0.015764,-0.016971,0.058100,0.057174,0.042593,...,0.018830,0.058056,0.057924,-0.037436,0.057559,0.040844,-0.002594,-0.057416,0.006976,-0.049634
14,1acc164f-e3cd-4d18-96e0-8c2c892e20b0,0.026161,0.053684,0.028029,-0.020362,-0.042610,-0.007072,0.055109,0.011830,-0.045824,...,0.019708,0.064259,0.054968,-0.011921,0.024722,-0.030965,-0.001833,-0.042196,-0.011000,-0.060460
18,5d8c6b11-7631-11e4-a3cb-005056011aef,0.014118,0.036567,0.041868,0.050089,-0.057097,-0.020924,0.068061,-0.023364,-0.035928,...,0.053999,0.061361,0.019427,-0.050095,0.053190,-0.027469,-0.005423,-0.021927,0.033389,-0.066678
19,5efa14bb-7631-11e4-a3cb-005056011aef,0.017697,-0.022237,0.040489,0.054718,-0.052823,-0.001287,-0.037791,-0.037548,-0.001121,...,0.058050,0.062515,-0.036483,0.039584,0.025898,-0.038180,-0.025607,-0.013492,-0.007378,-0.023212
23,874a177d-a5a7-41d9-8302-c087d13be958,-0.013949,-0.060453,0.036274,0.002869,-0.018805,-0.069322,-0.024640,-0.029759,-0.023275,...,0.008346,0.071064,-0.039858,-0.004848,0.013203,-0.056589,0.011447,-0.031708,-0.010632,-0.016936
24,9f499637-069e-4810-ac65-2430782a5ec0,-0.020951,0.003924,0.041680,0.066609,0.035343,-0.052870,-0.025021,0.061031,0.000071,...,0.041768,0.056079,0.051827,-0.033013,0.051987,-0.035633,0.027365,0.026279,-0.038545,0.026687


## 4. Save to file

In [58]:
predict_test.shape, node_data.shape

((10301850, 7), (206038, 513))

In [59]:
predict_test.head()

Unnamed: 0,src_cid,dest_cid,src_node,dest_node,source,target,label
0,b220a437-0d51-4390-9993-63345d0c83ad,a4481bd0-8986-4a85-a445-44f10f715349,/sign-in-universal-credit,/employment-tribunal-decisions/mr-m-gurney-v-b...,5947,255142,go_to
1,b220a437-0d51-4390-9993-63345d0c83ad,953e4007-b6ce-4d8d-aa20-05a34c3e3bf4,/sign-in-universal-credit,/government/news/moj-unit-in-crackdown-on-rogu...,5947,121560,go_to
2,b220a437-0d51-4390-9993-63345d0c83ad,44997587-823b-4663-be55-d5636e406828,/sign-in-universal-credit,/aaib-reports/piper-pa-38-112-tomahawk-g-bjur-...,5947,137263,go_to
3,b220a437-0d51-4390-9993-63345d0c83ad,c4b30e72-0311-466f-87a5-35900fb69dda,/sign-in-universal-credit,/guidance/veterinary-medicines-summary-of-prod...,5947,138723,go_to
4,b220a437-0d51-4390-9993-63345d0c83ad,4ced5048-608f-4729-96a2-9b1cb12cea19,/sign-in-universal-credit,/government/statistics/uk-house-price-index-oc...,5947,229385,go_to


In [63]:
predict_test.shape

((10301850, 7), 14821226)

In [64]:
predict_test.target.max()

255642

In [66]:
PREDICT_DIR = os.path.join(DATA_DIR, "predict_network")
predict_file = os.path.join(PREDICT_DIR, "predict_top50_vs_all.csv.gz")
node_data_file = os.path.join(PREDICT_DIR, "predict_top50_vs_all_node_data_embs_300.csv.gz")

In [None]:
predict_test.to_csv(predict_file, index=False, compression="gzip")
node_data.to_csv(node_data_file, compression="gzip")

In [None]:
# # Double check
# n2 = pd.read_csv(node_data_file, index_col=0)
# n2.head()