In [1]:
import networkx as nx
import pandas as pd
import json
import os
from networkx.readwrite import json_graph
import numpy as np
from datetime import datetime
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder
from ast import literal_eval
from gensim.models import Word2Vec

### Directories

In [2]:
# Every input and output path is derived from the DATA_DIR environment variable.
# Stop with an actionable message when it is unset, instead of letting
# os.path.join fail on None with an opaque TypeError further down.
DATA_DIR = os.getenv("DATA_DIR")
if DATA_DIR is None:
    raise EnvironmentError("The DATA_DIR environment variable must be set before running this notebook.")
train_network = os.path.join(DATA_DIR, "train_network")
# Models live in a "models" directory under os.path.dirname(DATA_DIR), not under DATA_DIR itself.
models = os.path.join(os.path.dirname(DATA_DIR), "models")
content_api = os.path.join(DATA_DIR, "content_api")
# Show which training-network files are available.
os.listdir(train_network)

['merged_edges_struct_funct.csv.gz', 'merged_embs_struct_funct.csv.gz']

In [3]:
# Load the gzip-compressed edge list from the training-network directory.
edgefile = os.path.join(train_network, "merged_edges_struct_funct.csv.gz")
edges = pd.read_csv(edgefile, compression="gzip")

In [4]:
# emb_file = os.path.join(train_network, 'merged_embs_struct_funct.csv.gz')
# embeddings = pd.read_csv(emb_file, compression="gzip", index_col=0)

In [5]:
# Preview the first rows of the edge list.
edges.head()

Unnamed: 0,dest_cid,dest_node,source,src_cid,src_node,target,weight,edge-id
0,1e333395-5dd5-4452-96a3-fbe939928761,/visa-fees,33265,5ef7560d-7631-11e4-a3cb-005056011aef,/government/publications/guidance-for-dependan...,58314,66.0,33265-58314
1,aa055bd6-fde1-4471-8ac2-ac86241b4898,/find-a-visa-application-centre,58314,1e333395-5dd5-4452-96a3-fbe939928761,/visa-fees,12343,1164.0,58314-12343
2,305acc88-488e-44ec-8e5a-1a5de7819ba9,/uk-family-visa,23720,d612c61e-22f4-4922-8bb2-b04b9202126e,/entering-staying-uk/family-visas,25053,377.0,23720-25053
3,305acc88-488e-44ec-8e5a-1a5de7819ba9,/uk-family-visa/partner-spouse,25053,305acc88-488e-44ec-8e5a-1a5de7819ba9,/uk-family-visa,14044,26227.0,25053-14044
4,5ef421be-7631-11e4-a3cb-005056011aef,/government/publications/application-for-uk-vi...,14044,305acc88-488e-44ec-8e5a-1a5de7819ba9,/uk-family-visa/partner-spouse,33797,2733.0,14044-33797


### Initialize `base_path`,  `content_id` and `feature` dictionaries

In [6]:
def build_dict(k1,v1,k2,v2):
    """Merge two parallel key/value sequences into a single dict.

    The pairs from ``k1``/``v1`` are loaded first (for a key repeated within
    ``k1`` the last value wins). Pairs from ``k2``/``v2`` are then added only
    for keys that are not present yet, so existing entries are never
    overwritten.

    Returns:
        dict: keys from ``k1`` followed by the keys that only occur in ``k2``.
    """
    merged = dict(zip(k1, v1))
    for extra_key, extra_value in zip(k2, v2):
        # setdefault leaves an already-known key untouched.
        merged.setdefault(extra_key, extra_value)
    return merged

In [7]:
# Lookup tables derived from the edge list; each one covers nodes that appear
# as a source and nodes that appear only as a destination:
#   nid_cid: node id  -> content id (src_cid / dest_cid columns)
#   nid_url: node id  -> page path  (src_node / dest_node columns)
#   url_nid: page path -> node id
nid_cid = build_dict(edges.source, edges.src_cid, edges.target, edges.dest_cid)
nid_url = build_dict(edges.source, edges.src_node, edges.target, edges.dest_node)
url_nid = build_dict(edges.src_node, edges.source, edges.dest_node, edges.target)

In [None]:
# cols = list(embeddings.columns[1:].values)
# nid_emb = dict(zip(embeddings.index, embeddings[cols].values))
# nid_emb[33265][0:2]

## Set up digraph and add node attributes 
#### Node attributes are more relevant to future GraphSAGE work

In [9]:
# Directed graph with one edge per (source, target) row of the edge list.
# Only the topology is stored: no edge attributes are attached (the
# 'test_removed' / 'train_removed' flags are not set).
graph = nx.DiGraph()
graph.add_edges_from(zip(edges.source, edges.target))

In [13]:
# nx.set_node_attributes expects a {node: {attribute_name: value}} mapping.
# Each node gets its content id ("cid") and page path ("url"); the other
# candidate attributes ("feature", "test", "val", "label") are not set here.
attrs = {nid: dict(cid=nid_cid[nid], url=nid_url[nid]) for nid in graph}
nx.set_node_attributes(graph, attrs)

In [15]:
# Spot-check the attributes stored on a single node.
graph.nodes[58314]

{'cid': '1e333395-5dd5-4452-96a3-fbe939928761', 'url': '/visa-fees'}

In [16]:
# Attribute names available on that node.
graph.nodes[58314].keys()

dict_keys(['cid', 'url'])

In [20]:
# Membership test against the edge view: (33265, -1) is not an edge of the graph.
(33265,-1) in graph.edges

False

## Node2Vec

In [None]:
# Precompute probabilities and generate walks
# (64-dimensional embeddings, walk_length=30, num_walks=300, single worker).
# NOTE(review): no random seed is passed, so the walks presumably differ between runs — confirm.
node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=300, workers=1)  

### Compute node embeddings

In [None]:
# Print wall-clock timestamps before and after fitting so the duration can be read off the output.
print(datetime.now().strftime("%H:%M:%S"), "Fitting model...")
# Embed nodes
model = node2vec.fit(window=10, min_count=1, batch_words=4)  
# Any keywords acceptable by gensim.Word2Vec can be passed, `dimensions` and `workers` are
# automatically passed (from the Node2Vec constructor)
print(datetime.now().strftime("%H:%M:%S"), "Finished fitting model...")
# Look for most similar nodes
# NOTE(review): '2' is assumed to be an existing node id — confirm, otherwise this lookup fails.
model.wv.most_similar('2', topn=10)  # Output node names are always strings

## Save out resulting node/edge embeddings and model.

In [23]:
# Output locations inside the `models` directory for the node embeddings,
# the trained model and the edge embeddings.
EMBEDDING_FILENAME = os.path.join(models, "n2v_node_embeddings")
EMBEDDING_MODEL_FILENAME = os.path.join(models, "n2v.model")
EDGES_EMBEDDING_FILENAME = os.path.join(models, "n2v_edge_embeddings")

In [None]:
# Make sure the output directory exists first, so the fitted model is not
# lost to a missing-directory error at save time.
os.makedirs(models, exist_ok=True)
# Save embeddings for later use (word2vec text format)
model.wv.save_word2vec_format(EMBEDDING_FILENAME)
# Save model for later use
model.save(EMBEDDING_MODEL_FILENAME)

## Check some results, load saved trained model from above

In [25]:
# Reload the model from EMBEDDING_MODEL_FILENAME so the checks below can run without refitting.
model =  Word2Vec.load(EMBEDDING_MODEL_FILENAME)

In [27]:
# Page whose nearest neighbours are inspected below. Another page to try:
# "/government/publications/below-the-radar-low-level-disruption-in-the-countrys-classrooms"
str_node = "/sold-bought-vehicle"
# The model is queried with node ids as strings. Resolve the id through the
# url_nid lookup (as compute_top_n does) rather than filtering `edges` on
# src_node, which raises IndexError for a page that only occurs as a destination.
target = str(url_nid[str_node])
target

'61415'

In [28]:
# Print up to 10 of the 50 most similar pages, skipping candidates that share
# the target's content id, live under "/browse", or repeat a content id that
# has already been printed.
count = 0
cids = []
for nid, prob in model.wv.most_similar(target, topn=50):
    if nid_cid[int(nid)] == nid_cid[int(target)]:
        continue
    if "/browse" in nid_url[int(nid)] or nid_cid[int(nid)] in cids:
        continue
    print(nid_url[int(nid)])
    cids.append(nid_cid[int(nid)])
    count += 1
    if count == 10:
        break

/make-a-sorn
/vehicle-tax-refund
/vehicle-log-book
/car-tax-disc-without-v11-reminder
/responsibilities-selling-vehicle
/sorn-statutory-off-road-notification
/contact-the-dvla
/government/publications/application-for-a-vehicle-registration-certificate
/check-vehicle-tax
/written-off-vehicle


  if np.issubdtype(vec.dtype, np.int):


## "Predict" for top 50 pages

In [29]:
# Most-viewed pages export: keep only the page URL and its view count.
top = pd.read_csv(os.path.join(DATA_DIR, "top-pages-govuk-feb19.tsv"), sep='\t', usecols=['Page', 'Page Views'])
# Strip the host so 'Page' holds a site-relative path. Vectorized .str.replace
# passes missing values through as NaN instead of raising inside a per-row lambda.
# regex=False keeps the dots in the host literal.
top['Page'] = top['Page'].str.replace("https://www.integration.publishing.service.gov.uk", "", regex=False)
# Remove comma separators from the view counts, then cast to float.
top['Page Views'] = top['Page Views'].str.replace(',', '', regex=False).astype("float")
# The content_id column is attached in a later cell, once base_cid has been built.

In [30]:
# Preview the cleaned top-pages table.
top.head()

Unnamed: 0,Page,Page Views
0,/,10219580.0
1,/search,9619994.0
2,/sign-in-universal-credit,8221833.0
3,/log-in-register-hmrc-online-services,6564918.0
4,/sold-bought-vehicle,4722768.0


### Set up content item titles for human assessment

In [31]:
# Labelled content items: used to translate base paths into content ids and
# content ids into titles.
labelled_file = os.path.join(content_api, "labelled.csv.gz")
labelled = pd.read_csv(labelled_file, compression="gzip")
base_cid = dict(zip(labelled["base_path"], labelled["content_id"]))
cid_title = dict(zip(labelled["content_id"], labelled["title"]))
# Attach a content id to every top page and drop pages without a match.
top["content_id"] = top["Page"].map(base_cid)
top.dropna(subset=["content_id"], inplace=True)

  interactivity=interactivity, compiler=compiler, result=result)


In [32]:
def generate_vectors(vector_list):
    """Lazily yield ``(nid, prob)`` tuples from an iterable of two-item pairs.

    Each item of ``vector_list`` is unpacked into exactly two values, which
    are re-emitted as a tuple one at a time.
    """
    yield from ((node_id, score) for node_id, score in vector_list)

In [33]:
# Wrap the 1000 nearest neighbours of node "0" in a generator so they can be consumed one at a time.
vecs = generate_vectors(model.wv.most_similar("0", topn=1000))

In [34]:
# Pull the first (node id, similarity) pair from the generator and display it.
nid, prob = next(vecs)
nid, prob

('22461', 0.8493591547012329)

## Compute related links for a set of pages

In [None]:
# Every content id that occurs in the edge list, as a source or as a destination.
cids_edgeset = set(edges.src_cid).union(edges.dest_cid)

In [None]:
# NOTE(review): these three lookups are rebuilt with exactly the same arguments as in the
# earlier "Initialize ... dictionaries" cell; presumably kept so this section can be run on
# its own — confirm, otherwise this cell is redundant.
nid_cid = build_dict(edges.source, edges.src_cid, edges.target, edges.dest_cid)
nid_url = build_dict(edges.source, edges.src_node, edges.target, edges.dest_node)
url_nid = build_dict(edges.src_node, edges.source, edges.dest_node, edges.target)

In [None]:
def compute_top_n(df_path, n, max_candidates=1000):
    """Suggest up to ``n`` related links for every page in ``df_path``.

    For each page the ``max_candidates`` most similar nodes are taken from
    ``model.wv.most_similar`` and walked in order. A candidate is kept unless:

    * it has the same content id as the page itself,
    * its path contains "/topic" or "/browse",
    * its content id was already suggested for this page, or
    * its content id is already listed for the page in ``cid_link_cids``.

    Args:
        df_path: pandas Series of page paths (its ``.values`` are iterated).
        n: maximum number of links to return per page.
        max_candidates: how many nearest neighbours to consider per page.

    Returns:
        tuple: ``(links, missing)`` where ``links`` is a DataFrame with the
        columns nid, cid, base_path, link, link_cid and probability, and
        ``missing`` is the list of pages that have no entry in ``url_nid``.

    Relies on the notebook-level names ``url_nid``, ``nid_cid``, ``nid_url``,
    ``model`` and ``cid_link_cids``.
    """
    columns = ["nid", "cid", "base_path", "link", "link_cid", "probability"]
    excluded_sections = ("/topic", "/browse")
    pages_links = []
    missing = []
    for page in df_path.values:
        if page not in url_nid:
            missing.append(page)
            continue

        target = str(url_nid[page])
        target_nid = int(target)
        target_cid = nid_cid[target_nid]
        # NOTE(review): cid_link_cids is not defined in the visible notebook; presumably it maps
        # a content id to the content ids it already links to — confirm. Pages without an
        # entry are treated as having no existing links instead of raising KeyError.
        already_linked = cid_link_cids.get(target_cid, ())
        seen_cids = set()

        # A plain for-loop stops cleanly when the candidates run out (the
        # previous next()-driven while-loop raised StopIteration), and the
        # `>= n` bound yields exactly n links instead of n + 1.
        for nid, prob in model.wv.most_similar(target, topn=max_candidates):
            if len(seen_cids) >= n:
                break
            link_nid = int(nid)
            link_cid = nid_cid[link_nid]
            link_url = nid_url[link_nid]
            if (link_cid == target_cid
                    or any(section in link_url for section in excluded_sections)
                    or link_cid in seen_cids
                    or link_cid in already_linked):
                continue

            seen_cids.add(link_cid)
            pages_links.append({"nid": target_nid,
                                "cid": target_cid,
                                "base_path": page,
                                "link": link_url,
                                "link_cid": link_cid,
                                "probability": round(prob, 3)})

    # Explicit columns keep the schema stable even when no link was found.
    return pd.DataFrame(pages_links, columns=columns), missing

In [None]:
# NOTE(review): `content` is not defined in any cell shown in this notebook; it needs a
# `base_path` column — confirm where it is created before running on a fresh kernel.
top_50_links, misses = compute_top_n(content.base_path, 10)

In [None]:
# Result shape, number of pages missing from url_nid, and distinct input pages minus the misses.
top_50_links.shape, len(misses), len(set(content.base_path.values))-len(misses)

In [None]:
# Inspect the links suggested for a single page.
top_50_links[top_50_links.base_path == "/universal-credit"]

In [None]:
# Add titles for the page and for the suggested link via the cid_title lookup.
top_50_links['title'] = top_50_links['cid'].map(cid_title)
top_50_links['link_title'] = top_50_links['link_cid'].map(cid_title)


In [None]:
# Write the suggested links to DATA_DIR/results, creating the directory when
# it does not exist yet so the export cannot fail on a missing folder.
results_dir = os.path.join(DATA_DIR, "results")
os.makedirs(results_dir, exist_ok=True)
top_50_links.to_csv(os.path.join(results_dir, "n2v_struct_funct_results.csv"), index=False)

### Compute edge embeddings

In [None]:
# Embed edges using Hadamard method
edges_embs = HadamardEmbedder(keyed_vectors=model.wv)
# Lookup embeddings
# NOTE(review): '1' and '2' look like placeholder node ids — confirm both exist in the model.
edges_embs[('1', '2')]

In [None]:
# Get all edges in a separate KeyedVectors instance.
# Use with caution: this can be very large (original note: 5817375180 edge features).
edges_kv = edges_embs.as_keyed_vectors()
# Look for most similar edges - this time tuples must be sorted and as str
edges_kv.most_similar(str(('1', '2')))