### **TODO** required graphSAGE inputs

    <train_prefix>-G.json -- A networkx-specified json file describing the input graph. Nodes have 'val' and 'test' attributes specifying if they are a part of the validation and test sets, respectively.
    <train_prefix>-id_map.json -- A json-stored dictionary mapping the graph node ids to consecutive integers.
    <train_prefix>-class_map.json -- A json-stored dictionary mapping the graph node ids to classes.
    <train_prefix>-feats.npy [optional] --- A numpy-stored array of node features; ordering given by id_map.json. Can be omitted and only identity features will be used.
    <train_prefix>-walks.txt [optional] --- A text file specifying random walk co-occurrences (one pair per line) (*only for unsupervised version of graphsage)

In [66]:
import networkx as nx
import pandas as pd
import json
import os
from networkx.readwrite import json_graph
import numpy as np
from datetime import datetime
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder
from ast import literal_eval
from gensim.models import Word2Vec

### Directories

In [2]:
DATA_DIR = os.getenv("DATA_DIR")
train_network = os.path.join(DATA_DIR, "train_network")
models = os.path.join(os.path.dirname(DATA_DIR), "models")
content_api = os.path.join(DATA_DIR, "content_api")
os.listdir(train_network)

['merged_edges_struct_funct.csv.gz', 'merged_embs_struct_funct.csv.gz']

In [3]:
edgefile = os.path.join(train_network, "merged_edges_struct_funct.csv.gz")
edges = pd.read_csv(edgefile, compression="gzip")

In [4]:
emb_file = os.path.join(train_network, 'merged_embs_struct_funct.csv.gz')
# embeddings = pd.read_csv(emb_file, compression="gzip", index_col=0)

In [5]:
edges.head()

Unnamed: 0,dest_cid,dest_node,source,src_cid,src_node,target,weight,edge-id
0,1e333395-5dd5-4452-96a3-fbe939928761,/visa-fees,33265,5ef7560d-7631-11e4-a3cb-005056011aef,/government/publications/guidance-for-dependan...,58314,66.0,33265-58314
1,aa055bd6-fde1-4471-8ac2-ac86241b4898,/find-a-visa-application-centre,58314,1e333395-5dd5-4452-96a3-fbe939928761,/visa-fees,12343,1164.0,58314-12343
2,305acc88-488e-44ec-8e5a-1a5de7819ba9,/uk-family-visa,23720,d612c61e-22f4-4922-8bb2-b04b9202126e,/entering-staying-uk/family-visas,25053,377.0,23720-25053
3,305acc88-488e-44ec-8e5a-1a5de7819ba9,/uk-family-visa/partner-spouse,25053,305acc88-488e-44ec-8e5a-1a5de7819ba9,/uk-family-visa,14044,26227.0,25053-14044
4,5ef421be-7631-11e4-a3cb-005056011aef,/government/publications/application-for-uk-vi...,14044,305acc88-488e-44ec-8e5a-1a5de7819ba9,/uk-family-visa/partner-spouse,33797,2733.0,14044-33797


## Set up content.json extract for related links prediction filtering 
ie to excude embedded links

In [6]:
content_json_file = os.path.join(content_api, "11-02-19", "content_json_extended.csv.gz")
content = pd.read_csv(content_json_file, compression="gzip")

In [7]:
content.columns

Index(['base_path', 'body', 'children', 'collection_links', 'content_id',
       'description', 'document_collections', 'document_type',
       'embedded_links', 'field_of_operation', 'finder', 'first_published_at',
       'lead_organisations', 'locale', 'mainstream_browse_pages', 'ministers',
       'ordered_related_items_overrides', 'organisations',
       'pages_part_of_step_nav', 'pages_related_to_step_nav', 'parent',
       'part_of_step_navs', 'people', 'policy_areas',
       'primary_publishing_organisation', 'publishing_app', 'related_guides',
       'related_links', 'related_mainstream', 'related_mainstream_content',
       'related_policies', 'related_statistical_data_sets',
       'related_to_step_navs', 'roles', 'sections', 'speaker', 'title',
       'topical_events', 'topics'],
      dtype='object')

In [8]:
content_base_cid = dict(zip(content.base_path, content.content_id))

In [9]:
content.embedded_links.iloc[0]

"['/government/uploads/system/uploads/attachment_data/file/631646/List_of_Psychologists_and_Psychiatrists_2017.pdf']"

In [13]:
content['embedded_links'] = content['embedded_links'].map(literal_eval)

In [14]:
content['related_links'] = content['related_links'].map(literal_eval)

In [15]:
content['embedded_cids'] = content['embedded_links'].map(lambda x: [content_base_cid[l] for l in x 
                                                                    if l in content_base_cid.keys()])
content['related_cids'] = content['related_links'].map(lambda x: [content_base_cid[l] for l in x 
                                                                    if l in content_base_cid.keys()])

In [16]:
cid_link_cids = dict(zip(content.content_id, content.embedded_cids))

### Initialize `base_path`,  `content_id` and `feature` dictionaries

In [17]:
def build_dict(k1,v1,k2,v2):
    agg_dict = dict(zip(k1,v1))
    for key,value in zip(k2,v2):
        if key not in agg_dict.keys():
            agg_dict[key] = value
    return agg_dict   

In [18]:
nid_cid = build_dict(edges.source, edges.src_cid, edges.target, edges.dest_cid)
nid_url = build_dict(edges.source, edges.src_node, edges.target, edges.dest_node)
url_nid = build_dict(edges.src_node, edges.source, edges.dest_node, edges.target)

In [20]:
# cols = list(embeddings.columns[1:].values)
# nid_emb = dict(zip(embeddings.index, embeddings[cols].values))
# nid_emb[33265][0:2]

## Add nodes from `content.json` to edgelist

In [21]:
edges.columns

Index(['dest_cid', 'dest_node', 'source', 'src_cid', 'src_node', 'target',
       'weight', 'edge-id'],
      dtype='object')

In [22]:
content.columns

Index(['base_path', 'body', 'children', 'collection_links', 'content_id',
       'description', 'document_collections', 'document_type',
       'embedded_links', 'field_of_operation', 'finder', 'first_published_at',
       'lead_organisations', 'locale', 'mainstream_browse_pages', 'ministers',
       'ordered_related_items_overrides', 'organisations',
       'pages_part_of_step_nav', 'pages_related_to_step_nav', 'parent',
       'part_of_step_navs', 'people', 'policy_areas',
       'primary_publishing_organisation', 'publishing_app', 'related_guides',
       'related_links', 'related_mainstream', 'related_mainstream_content',
       'related_policies', 'related_statistical_data_sets',
       'related_to_step_navs', 'roles', 'sections', 'speaker', 'title',
       'topical_events', 'topics', 'embedded_cids', 'related_cids'],
      dtype='object')

In [23]:
max_node = max(nid_cid.keys())+1
max_node

107864

In [24]:
def get_nid(base_path):
    if base_path in url_nid.keys():
        return url_nid[base_path]
    else:
        max_node = max(nid_url.keys())+1
        url_nid[base_path] = max_node
        nid_url[max_node] = base_path
        return url_nid[base_path]

In [25]:
link_list = ['related_links', 'embedded_links', 'collection_links',\
             'pages_part_of_step_nav', 'pages_related_to_step_nav', 'related_guides', 'related_mainstream']

In [26]:
for item in link_list[2:]:
    print(item)
    content[item] = content[item].map(literal_eval)

collection_links
pages_part_of_step_nav
pages_related_to_step_nav
related_guides
related_mainstream


In [27]:
content[content.content_id.isna()].shape

(0, 41)

In [28]:
rowlist = []
for tup in content.itertuples(index=False):
    for l in link_list:
        tup_ind = content.columns.get_loc(l)
        for link in tup[tup_ind]:
            if link in content_base_cid.keys():
                row = {}
                row['src_cid'] = tup.content_id
                row['dest_cid'] = content_base_cid[link]
                row['dest_node'] = link
                row['src_node'] = tup.base_path
                row['source'] = get_nid(tup.base_path)
                row['target'] = get_nid(link)
                row['weight'] = 0
                row['edge-id'] = ""
                rowlist.append(row)

In [29]:
content_edges = pd.DataFrame(rowlist)

In [30]:
edges[edges.source==30443].src_node.iloc[0]

'/company-voluntary-arrangements'

In [31]:
max(url_nid.values())

118158

In [32]:
merged_edges = pd.concat([edges,content_edges],sort=True)

In [33]:
merged_edges.head(2)

Unnamed: 0,dest_cid,dest_node,edge-id,source,src_cid,src_node,target,weight
0,1e333395-5dd5-4452-96a3-fbe939928761,/visa-fees,33265-58314,33265,5ef7560d-7631-11e4-a3cb-005056011aef,/government/publications/guidance-for-dependan...,58314,66.0
1,aa055bd6-fde1-4471-8ac2-ac86241b4898,/find-a-visa-application-centre,58314-12343,58314,1e333395-5dd5-4452-96a3-fbe939928761,/visa-fees,12343,1164.0


In [34]:
nid_cid = build_dict(merged_edges.source, merged_edges.src_cid, merged_edges.target, merged_edges.dest_cid)
nid_url = build_dict(merged_edges.source, merged_edges.src_node, merged_edges.target, merged_edges.dest_node)
url_nid = build_dict(merged_edges.src_node, merged_edges.source, merged_edges.dest_node, merged_edges.target)

In [35]:
nid_cid[33265]

'5ef7560d-7631-11e4-a3cb-005056011aef'

In [36]:
edges[edges.source==33265].iloc[0]

dest_cid                  1e333395-5dd5-4452-96a3-fbe939928761
dest_node                                           /visa-fees
source                                                   33265
src_cid                   5ef7560d-7631-11e4-a3cb-005056011aef
src_node     /government/publications/guidance-for-dependan...
target                                                   58314
weight                                                      66
edge-id                                            33265-58314
Name: 0, dtype: object

## Set up digraph and add node attributes 
#### attributes are more relevant to future graphSAGE work

In [37]:
graph = nx.DiGraph()
for src,dest in zip(merged_edges.source, merged_edges.target):
    graph.add_edge(src, dest)
    graph[src][dest].update({'test_removed': False, 'train_removed': False})

In [38]:
# >>> attrs = {0: {'attr1': 20, 'attr2': 'nothing'}, 1: {'attr2': 3}}
# >>> nx.set_node_attributes(G, attrs)
# "feature":nid_emb[nid],
attrs = {nid:{"cid":nid_cid[nid], "url":nid_url[nid], 
              "test":False, "val":False,
               "label":[] 
              } for nid in graph.nodes()}
nx.set_node_attributes(graph, attrs)

In [39]:
graph.nodes[0].keys()

dict_keys(['cid', 'url', 'test', 'val', 'label'])

## Node2Vec

In [40]:
# Precompute probabilities and generate walks
node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=300, workers=1)  

Computing transition probabilities: 100%|██████████| 118159/118159 [01:57<00:00, 1008.29it/s]
Generating walks (CPU: 1): 100%|██████████| 300/300 [2:05:05<00:00, 24.59s/it]  


### Compute node embeddings

In [41]:
print(datetime.now().strftime("%H:%M:%S"), "Fitting model...")
# Embed nodes
model = node2vec.fit(window=10, min_count=1, batch_words=4)  
# Any keywords acceptable by gensim.Word2Vec can be passed, `dimensions` and `workers` are 
# automatically passed (from the Node2Vec constructor)
print(datetime.now().strftime("%H:%M:%S"), "Finished fitting model...")
# Look for most similar nodes
model.wv.most_similar('2', topn=10)  # Output node names are always strings

01:07:25 Fitting model...
10:36:00 Finished fitting model...


  if np.issubdtype(vec.dtype, np.int):


[('38738', 0.9659421443939209),
 ('11131', 0.9386969208717346),
 ('16608', 0.9371576905250549),
 ('31388', 0.9371330738067627),
 ('9904', 0.9369335174560547),
 ('22485', 0.9360983371734619),
 ('65693', 0.9349657893180847),
 ('48555', 0.9337107539176941),
 ('5506', 0.9334402680397034),
 ('23939', 0.933233380317688)]

### check some results

In [44]:
str_node = "/government/publications/below-the-radar-low-level-disruption-in-the-countrys-classrooms"
# "/sold-bought-vehicle"
target = str(edges[edges.src_node == str_node].source.values[0])
target

IndexError: index 0 is out of bounds for axis 0 with size 0

In [None]:
count = 0
cids = []
for nid, prob in model.wv.most_similar(target, topn=50):
    if nid_cid[int(target)] != nid_cid[int(nid)] and "/browse" not in nid_url[int(nid)]\
    and nid_cid[int(nid)] not in cids:
        print(nid_url[int(nid)])
        count+=1
        cids.append(nid_cid[int(nid)])
    if count == 10:
        break

## "Predict" for top 50 pages

In [None]:
top = pd.read_csv(os.path.join(DATA_DIR, "top-pages-govuk-feb19.tsv"), sep='\t', usecols=['Page', 'Page Views'])
top['Page'] = top['Page'].map(lambda x: x.replace("https://www.integration.publishing.service.gov.uk",""))
top['Page Views'] = top['Page Views'].map(lambda x: x.replace(',', ''))
top['Page Views'] = top['Page Views'].astype("float")
# top['content_id'] = top['Page'].map(base_cid)

In [None]:
top.head()

### Set up content item titles for human assessment

In [None]:
labelled_file = os.path.join(DATA_DIR, "content_api", "labelled.csv.gz")
labelled = pd.read_csv(labelled_file, compression="gzip")
base_cid = dict(zip(labelled.base_path, labelled.content_id))
cid_title = dict(zip(labelled.content_id, labelled.title))
top["content_id"] = top["Page"].map(base_cid)
top.dropna(subset=["content_id"], inplace=True)

In [50]:
def generate_vectors(vector_list):
    for nid, prob in vector_list:
        yield nid, prob

In [51]:
vecs = generate_vectors(model.wv.most_similar("0", topn=1000))

In [52]:
nid, prob = next(vecs)
nid, prob

('22461', 0.8243222236633301)

## Compute related links for a set of pages

In [53]:
cids_edgeset = set(list(edges.src_cid)+list(edges.dest_cid))

In [67]:
EMBEDDING_MODEL_FILENAME = os.path.join(models, "n2v.model")

In [68]:
model =  Word2Vec.load(EMBEDDING_MODEL_FILENAME)

In [71]:
nid_cid = build_dict(edges.source, edges.src_cid, edges.target, edges.dest_cid)
nid_url = build_dict(edges.source, edges.src_node, edges.target, edges.dest_node)
url_nid = build_dict(edges.src_node, edges.source, edges.dest_node, edges.target)

In [72]:
def compute_top_n(df_path, n):
    pages_links = []
    missing = []
    for page in df_path.values:
        if page in url_nid.keys():
            target = str(url_nid[page])
            count = 0
            cids = []
            vecs = generate_vectors(model.wv.most_similar(target, topn=1000))
            while count <= n:
                nid, prob = next(vecs)
                if nid_cid[int(target)] != nid_cid[int(nid)]\
                and all(t not in nid_url[int(nid)] for t in ["/topic","/browse"])\
                and nid_cid[int(nid)] not in cids\
                and nid_cid[int(nid)] not in cid_link_cids[nid_cid[int(target)]]:
                    
                    cids.append(nid_cid[int(nid)])
                    page_link = {"nid":int(target),
                                 "cid":nid_cid[int(target)],
                                 "base_path": page, 
                                 "link":nid_url[int(nid)], 
                                 "link_cid":nid_cid[int(nid)],
                                 "probability":round(prob,3)}
                    pages_links.append(page_link)
                    count+=1
        else:
            missing.append(page)
#             print("Page {} is missing from training set".format(page))
            
    return pd.DataFrame(pages_links), missing

In [73]:
top_50_links, misses = compute_top_n(content.base_path, 10)

In [74]:
top_50_links.shape, len(misses), len(set(content.base_path.values))-len(misses)

((611138, 6), 181676, 55558)

In [75]:
top_50_links[top_50_links.base_path == "/universal-credit"]

Unnamed: 0,base_path,cid,link,link_cid,nid,probability
249095,/universal-credit,f790dc71-386e-4440-9689-31f94e7ac64d,/income-support,f331c8e2-3163-4230-8471-2b6987cc1eaf,26857,0.831
249096,/universal-credit,f790dc71-386e-4440-9689-31f94e7ac64d,/child-tax-credit,a14072ba-e2d1-47fa-93d4-e9cb023a3450,26857,0.799
249097,/universal-credit,f790dc71-386e-4440-9689-31f94e7ac64d,/contact-jobcentre-plus/existing-benefit-claims,7f6e97b2-fc2e-4e21-b854-88fcb84d866d,26857,0.786
249098,/universal-credit,f790dc71-386e-4440-9689-31f94e7ac64d,/tax-credits-calculator,7a61b192-8edf-4204-8ead-705e5f115c81,26857,0.779
249099,/universal-credit,f790dc71-386e-4440-9689-31f94e7ac64d,/how-to-have-your-benefits-paid,f33cbf6f-f27c-4335-ab6f-b5e396f02d56,26857,0.776
249100,/universal-credit,f790dc71-386e-4440-9689-31f94e7ac64d,/working-tax-credit,bced106c-3ee7-434c-89dc-0007163b5425,26857,0.771
249101,/universal-credit,f790dc71-386e-4440-9689-31f94e7ac64d,/jobseekers-allowance,122fdb0f-471c-4dcc-986b-8c3691ceacf8,26857,0.757
249102,/universal-credit,f790dc71-386e-4440-9689-31f94e7ac64d,/housing-benefit,a324ac01-b2da-4bd0-ad41-b33cb37754e8,26857,0.754
249103,/universal-credit,f790dc71-386e-4440-9689-31f94e7ac64d,/manage-your-tax-credits,e2048d9b-fae4-45fd-810c-16e5fdb866cb,26857,0.739
249104,/universal-credit,f790dc71-386e-4440-9689-31f94e7ac64d,/guidance/new-style-employment-and-support-all...,ef2842ca-f87b-43d4-9e8e-52cf7f64399d,26857,0.736


In [57]:
top_50_links['title'] = top_50_links['cid'].map(cid_title)
top_50_links['link_title'] = top_50_links['link_cid'].map(cid_title)


NameError: name 'cid_title' is not defined

In [76]:
top_50_links.to_csv(os.path.join(DATA_DIR, "results", "n2v_struct_funct_results.csv"), index=False)


### Compute edge embeddings

In [None]:
# Embed edges using Hadamard method
edges_embs = HadamardEmbedder(keyed_vectors=model.wv)
# Lookup embeddings
edges_embs[('1', '2')]

In [None]:
# Get all edges in a separate KeyedVectors instance - use with caution 5817375180 edge features...
edges_kv = edges_embs.as_keyed_vectors()
# Look for most similar edges - this time tuples must be sorted and as str
edges_kv.most_similar(str(('1', '2')))

## Save out resulting node/edge embeddings and model.

In [42]:
EMBEDDING_FILENAME = os.path.join(models, "n2v_node_embeddings_all_content")
EMBEDDING_MODEL_FILENAME = os.path.join(models, "n2v_all_content.model")
EDGES_EMBEDDING_FILENAME = os.path.join(models, "n2v_edge_embeddings_all_content")

In [43]:
# Save embeddings for later use
model.wv.save_word2vec_format(EMBEDDING_FILENAME)
# Save model for later use
model.save(EMBEDDING_MODEL_FILENAME)