### **TODO** required graphSAGE inputs

    <train_prefix>-G.json -- A networkx-specified json file describing the input graph. Nodes have 'val' and 'test' attributes specifying if they are a part of the validation and test sets, respectively.
    <train_prefix>-id_map.json -- A json-stored dictionary mapping the graph node ids to consecutive integers.
    <train_prefix>-class_map.json -- A json-stored dictionary mapping the graph node ids to classes.
    <train_prefix>-feats.npy [optional] --- A numpy-stored array of node features; ordering given by id_map.json. Can be omitted and only identity features will be used.
    <train_prefix>-walks.txt [optional] --- A text file specifying random walk co-occurrences (one pair per line); only needed for the unsupervised version of GraphSAGE.

In [18]:
import networkx as nx
import pandas as pd
import json
import os
from networkx.readwrite import json_graph
import numpy as np
from datetime import datetime
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder

### Directories

In [97]:
# The root data directory is taken from the environment so that no local
# path is hard-coded in the notebook.
DATA_DIR = os.getenv("DATA_DIR")
if DATA_DIR is None:
    # os.getenv returns None for an unset variable; stop here with a clear
    # message instead of an opaque TypeError from os.path.join below.
    raise RuntimeError("The DATA_DIR environment variable must be set before running this notebook")
# Input network files live under DATA_DIR/train_network.
train_network = os.path.join(DATA_DIR, "train_network")
# Model artefacts go into a "models" directory next to DATA_DIR.
models = os.path.join(os.path.dirname(DATA_DIR), "models")
os.listdir(train_network)

['merged_edges_struct_funct.csv.gz', 'merged_embs_struct_funct.csv.gz']

In [6]:
# Load the merged edge list (gzip-compressed CSV) from the training-network directory.
edgefile = os.path.join(train_network, "merged_edges_struct_funct.csv.gz")
edges = pd.read_csv(filepath_or_buffer=edgefile, compression="gzip")

In [7]:
# Load the merged embeddings (gzip-compressed CSV); the first CSV column becomes the index.
emb_file = os.path.join(train_network, 'merged_embs_struct_funct.csv.gz')
embeddings = pd.read_csv(filepath_or_buffer=emb_file, compression="gzip", index_col=0)

In [8]:
# Preview the first rows of the edge list to check the column layout.
edges.head()

Unnamed: 0,dest_cid,dest_node,source,src_cid,src_node,target,weight,edge-id
0,1e333395-5dd5-4452-96a3-fbe939928761,/visa-fees,33265,5ef7560d-7631-11e4-a3cb-005056011aef,/government/publications/guidance-for-dependan...,58314,66.0,33265-58314
1,aa055bd6-fde1-4471-8ac2-ac86241b4898,/find-a-visa-application-centre,58314,1e333395-5dd5-4452-96a3-fbe939928761,/visa-fees,12343,1164.0,58314-12343
2,305acc88-488e-44ec-8e5a-1a5de7819ba9,/uk-family-visa,23720,d612c61e-22f4-4922-8bb2-b04b9202126e,/entering-staying-uk/family-visas,25053,377.0,23720-25053
3,305acc88-488e-44ec-8e5a-1a5de7819ba9,/uk-family-visa/partner-spouse,25053,305acc88-488e-44ec-8e5a-1a5de7819ba9,/uk-family-visa,14044,26227.0,25053-14044
4,5ef421be-7631-11e4-a3cb-005056011aef,/government/publications/application-for-uk-vi...,14044,305acc88-488e-44ec-8e5a-1a5de7819ba9,/uk-family-visa/partner-spouse,33797,2733.0,14044-33797


### Initialize `base_path`, `content_id` and `feature` dictionaries

In [9]:
def build_dict(k1, v1, k2, v2):
    """Merge two key/value pairings into one dictionary.

    Pairs from ``(k1, v1)`` are inserted first; when a key repeats within
    ``k1`` the last value wins. Pairs from ``(k2, v2)`` are then added only
    for keys that are not present yet, so existing entries are never
    overwritten and the first occurrence of a new key wins.

    Returns the merged ``dict``.
    """
    merged = {}
    for key, value in zip(k1, v1):
        merged[key] = value
    for key, value in zip(k2, v2):
        # setdefault leaves any value that is already stored untouched.
        merged.setdefault(key, value)
    return merged

In [10]:
# Node-id lookups assembled from both endpoints of every edge row:
#   nid_cid maps node id -> content id, nid_url maps node id -> page path.
nid_cid = build_dict(k1=edges.source, v1=edges.src_cid, k2=edges.target, v2=edges.dest_cid)
nid_url = build_dict(k1=edges.source, v1=edges.src_node, k2=edges.target, v2=edges.dest_node)

In [11]:
# Every column except the first is used as a feature dimension
# (presumably the first column is not part of the vector — TODO confirm).
cols = embeddings.columns[1:].tolist()
# Map each index value of `embeddings` to its row of feature values.
nid_emb = {nid: vector for nid, vector in zip(embeddings.index, embeddings[cols].values)}
nid_emb[33265][:2]

array([-0.04767632, -0.02763887])

## Set up digraph and add node attributes 
#### Attributes are more relevant to future GraphSAGE work

In [12]:
# Directed graph with one edge per (source, target) row of `edges`; both
# bookkeeping flags start out as False on every edge.
# NOTE(review): the `weight` column of `edges` is not attached to the graph
# here — confirm that an unweighted graph is intended.
graph = nx.DiGraph()
for src, dest in zip(edges.source, edges.target):
    graph.add_edge(src, dest, test_removed=False, train_removed=False)

In [13]:
# nx.set_node_attributes takes a dict of dicts keyed by node, for example
#   {0: {'attr1': 20, 'attr2': 'nothing'}, 1: {'attr2': 3}}
# Each node gets its content id, url and feature vector, plus "test"/"val"
# flags set to False and a fresh empty "label" list.
attrs = {
    nid: {
        "cid": nid_cid[nid],
        "url": nid_url[nid],
        "test": False,
        "val": False,
        "feature": nid_emb[nid],
        "label": [],
    }
    for nid in graph
}
nx.set_node_attributes(graph, attrs)

In [16]:
# Inspect which attribute names are stored on node 0.
graph.nodes[0].keys()

dict_keys(['cid', 'url', 'test', 'val', 'feature', 'label'])

## Node2Vec

In [19]:
# Precompute transition probabilities and generate the random walks.
# This is the slow step: the recorded run below took about 2 minutes for the
# probabilities and about 1.5 hours for the walks with a single worker.
node2vec = Node2Vec(
    graph,
    dimensions=64,
    walk_length=30,
    num_walks=200,
    workers=1,
)

Computing transition probabilities: 100%|██████████| 107864/107864 [02:13<00:00, 810.46it/s]
Generating walks (CPU: 1): 100%|██████████| 200/200 [1:28:02<00:00, 23.85s/it]


### Compute node embeddings

In [21]:
# Timestamps are printed before and after fitting so the training time is
# visible in the cell output.
print(datetime.now().strftime("%H:%M:%S"), "Fitting model...")
# Embed nodes
# NOTE(review): batch_words=4 is a very small batch size and may contribute
# to the long fit time recorded below — confirm it is intentional.
model = node2vec.fit(window=10, min_count=1, batch_words=4)
# Any keywords acceptable by gensim.Word2Vec can be passed, `dimensions` and `workers` are
# automatically passed (from the Node2Vec constructor)
print(datetime.now().strftime("%H:%M:%S"), "Finished fitting model...")
# Look for most similar nodes
model.wv.most_similar('2', topn=10)  # Output node names are always strings

17:00:18 Fitting model...
18:42:56 Finished fitting model...


  if np.issubdtype(vec.dtype, np.int):


[('38738', 0.9719204902648926),
 ('20257', 0.9458978176116943),
 ('8027', 0.9433640837669373),
 ('48555', 0.9422976970672607),
 ('63512', 0.942084789276123),
 ('64787', 0.9377864599227905),
 ('5506', 0.9350687265396118),
 ('18689', 0.9344114661216736),
 ('4854', 0.9334792494773865),
 ('43991', 0.9326239228248596)]

### Map `base_paths` to `node_ids` for later

In [62]:
# Reverse lookup: page path -> node id, built from both endpoints of every edge row.
base_nid = build_dict(edges.src_node, edges.source, edges.dest_node, edges.target)

### Check some results

In [60]:
# Node id of the first edge row whose source page is "/sold-bought-vehicle",
# converted to a string because it is passed to model.wv.most_similar.
target = str(edges.loc[edges.src_node == "/sold-bought-vehicle", "source"].values[0])
target

'61415'

In [None]:
# Print up to 10 pages similar to `target`, drawn from its 50 nearest
# neighbours. A candidate is skipped when it shares the target's content id,
# when its url contains "/browse", or when its content id was already printed.
count = 0
cids = []
target_cid = nid_cid[int(target)]
for nid, prob in model.wv.most_similar(target, topn=50):
    link_nid = int(nid)
    link_cid = nid_cid[link_nid]
    if link_cid == target_cid or "/browse" in nid_url[link_nid] or link_cid in cids:
        continue
    print(nid_url[link_nid])
    count += 1
    cids.append(link_cid)
    if count == 10:
        break

## "Predict" for top 50 pages

In [55]:
# Load the top-pages export, keeping only the page URL and its view count.
top = pd.read_csv(os.path.join(DATA_DIR, "top-pages-govuk-feb19.tsv"), sep='\t', usecols=['Page', 'Page Views'])
# Strip the host prefix so `Page` holds paths only. The vectorised,
# literal (non-regex) replacement passes missing values through instead of
# raising inside a per-row lambda.
top['Page'] = top['Page'].str.replace("https://www.integration.publishing.service.gov.uk", "", regex=False)
# Remove thousands separators, then convert the counts to float.
top['Page Views'] = top['Page Views'].str.replace(',', '', regex=False).astype("float")

In [56]:
# Preview the cleaned top-pages table.
top.head()

Unnamed: 0,Page,Page Views
0,/,10219580.0
1,/search,9619994.0
2,/sign-in-universal-credit,8221833.0
3,/log-in-register-hmrc-online-services,6564918.0
4,/sold-bought-vehicle,4722768.0


In [70]:
# Load the labelled content table (gzip-compressed CSV).
labelled_file = os.path.join(DATA_DIR, "content_api", "labelled.csv.gz")
labelled = pd.read_csv(labelled_file, compression="gzip")
# Lookups: base_path -> content_id and content_id -> title.
# With dict(zip(...)), the last row wins when a key repeats.
base_cid = dict(zip(labelled.base_path, labelled.content_id))
cid_title = dict(zip(labelled.content_id, labelled.title))
# Attach a content id to each top page; pages with no match get NaN.
top["content_id"] = top["Page"].map(base_cid)
# NOTE(review): this drops unmatched pages from `top` in place, so the
# preview shown by an earlier cell no longer reflects the current frame.
top.dropna(subset=["content_id"], inplace=True)

In [85]:
def compute_top_50(df, topn=50, max_links=10):
    """Collect embedding-based link suggestions for every page in ``df``.

    For each value of ``df.Page`` that has a node id in ``base_nid``, the
    ``topn`` most similar nodes are fetched from ``model.wv`` and filtered.
    A candidate is kept only if its content id differs from the page's own,
    its url does not contain "/browse", and its content id has not already
    been kept for this page. At most ``max_links`` candidates are kept per
    page. Pages without a node id are skipped silently.

    Relies on the notebook-level ``base_nid``, ``nid_cid``, ``nid_url`` and
    ``model`` objects.

    Parameters
    ----------
    df : DataFrame-like
        Must expose a ``Page`` column of page paths.
    topn : int, default 50
        Number of nearest neighbours requested per page.
    max_links : int, default 10
        Maximum number of suggestions kept per page.

    Returns
    -------
    list of dict
        One record per suggestion with keys ``nid``, ``cid``, ``base_path``,
        ``link``, ``link_cid`` and ``probability`` (similarity rounded to
        3 decimals).
    """
    pages_links = []
    if max_links <= 0:
        return pages_links
    for page in df.Page.values:
        if page not in base_nid:
            continue
        # model.wv keys are strings, while the lookup dicts use integer ids.
        target = str(base_nid[page])
        target_nid = int(target)
        target_cid = nid_cid[target_nid]
        # A set gives O(1) duplicate checks on content ids kept for this page.
        seen_cids = set()
        for nid, prob in model.wv.most_similar(target, topn=topn):
            link_nid = int(nid)
            link_cid = nid_cid[link_nid]
            if link_cid == target_cid:
                continue
            link_url = nid_url[link_nid]
            if "/browse" in link_url or link_cid in seen_cids:
                continue
            seen_cids.add(link_cid)
            pages_links.append({"nid": target_nid,
                                "cid": target_cid,
                                "base_path": page,
                                "link": link_url,
                                "link_cid": link_cid,
                                "probability": round(prob, 3)})
            if len(seen_cids) >= max_links:
                break
    return pages_links
# One row per (page, suggested link) record returned by compute_top_50.
top_50_links = pd.DataFrame(compute_top_50(top))

In [90]:
# Add human-readable titles for both the source page and the suggested link.
top_50_links = top_50_links.assign(
    title=top_50_links['cid'].map(cid_title),
    link_title=top_50_links['link_cid'].map(cid_title),
)

In [92]:
# Write the suggestions, without the DataFrame index, under DATA_DIR/results.
results_file = os.path.join(DATA_DIR, "results", "n2v_struct_funct_results_top50.csv")
top_50_links.to_csv(results_file, index=False)

### Compute edge embeddings

In [None]:
# Embed edges using Hadamard method, built on top of the trained node vectors (model.wv)
edges_embs = HadamardEmbedder(keyed_vectors=model.wv)

In [None]:
# Look for embeddings on the fly - here we pass normal tuples
# (node ids are given as strings)
edges_embs[('1', '2')]

In [None]:
# Get all edges in a separate KeyedVectors instance - use with caution, this could be huge for big networks
edges_kv = edges_embs.as_keyed_vectors()
# Look for most similar edges - this time tuples must be sorted and passed as str
edges_kv.most_similar(str(('1', '2')))

## Save out resulting node/edge embeddings and model.

In [98]:
# Output locations inside the `models` directory.
EMBEDDING_FILENAME = os.path.join(models, "n2v_node_embeddings")
EMBEDDING_MODEL_FILENAME = os.path.join(models, "n2v.model")
# NOTE(review): this path is defined but not used by any visible cell — confirm whether edge embeddings should be saved.
EDGES_EMBEDDING_FILENAME = os.path.join(models, "n2v_edge_embeddings")

In [99]:
# Save the node embeddings in word2vec format for later use
model.wv.save_word2vec_format(EMBEDDING_FILENAME)
# Save the full model for later use
model.save(EMBEDDING_MODEL_FILENAME)