In [2]:
import networkx as nx
import pandas as pd
import json
import os
from networkx.readwrite import json_graph
import numpy as np
from datetime import datetime
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder
from ast import literal_eval
from gensim.models import Word2Vec

In [9]:
### Directories

# Every path used by this notebook hangs off the DATA_DIR environment variable.
DATA_DIR = os.getenv("DATA_DIR")
if DATA_DIR is None:
    # Fail here with an actionable message; otherwise os.path.join(None, ...)
    # raises an opaque TypeError on the next line.
    raise RuntimeError("The DATA_DIR environment variable must be set before running this notebook")

train_network = os.path.join(DATA_DIR, "train_network")
# `models` is resolved relative to os.path.dirname(DATA_DIR). Note that dirname of a path
# with a trailing slash is that same directory, not its parent.
models = os.path.join(os.path.dirname(DATA_DIR), "models")
content_api = os.path.join(DATA_DIR, "content_api")
# The listing is discarded (it is not the cell's last expression); the call only has the
# effect of raising early if the training-network directory cannot be listed.
os.listdir(train_network)

# Tab-separated, gzip-compressed edge list.
edgefile = os.path.join(train_network, "merged_edges_struct_funct_w_brexit.csv.gz")
edges = pd.read_csv(edgefile, compression="gzip", sep='\t')

In [10]:
# Preview the first rows of the edge list to check the columns that were loaded.
edges.head()

Unnamed: 0,dest_cid,dest_node,edge-id,source,src_cid,src_node,target,weight
0,b171ada1-44a8-4221-a312-b2cbaab00249,/guidance/passport-rules-for-travel-to-europe-...,27924-27924,27924,b171ada1-44a8-4221-a312-b2cbaab00249,/guidance/passport-rules-for-travel-to-europe-...,27924,62934.0
1,92f7d5cd-2618-4306-ba2e-5874e2302061,/guidance/uk-nationals-travelling-to-eu-essent...,18961-18961,18961,92f7d5cd-2618-4306-ba2e-5874e2302061,/guidance/uk-nationals-travelling-to-eu-essent...,18961,29876.0
2,9f1621c2-1d3c-4a91-ad71-5366d3270dae,/guidance/prepare-to-drive-in-the-eu-after-brexit,42091-42091,42091,9f1621c2-1d3c-4a91-ad71-5366d3270dae,/guidance/prepare-to-drive-in-the-eu-after-brexit,42091,31930.0
3,91504a10-f697-42a3-a779-f238b4955ea9,/settled-status-eu-citizens-families,55850-55850,55850,91504a10-f697-42a3-a779-f238b4955ea9,/settled-status-eu-citizens-families,55850,34379.0
4,e675301d-b593-4736-b216-bb9369edf04d,/eu-eea,35999-35999,35999,e675301d-b593-4736-b216-bb9369edf04d,/eu-eea,35999,11399.0


In [11]:
### Initialize `base_path`, `content_id` and `feature` dictionaries

In [12]:
def build_dict(k1, v1, k2, v2):
    """Combine two parallel key/value sequences into a single dictionary.

    Pairs from (k1, v1) are inserted first; a key repeated within k1 keeps the
    value of its last occurrence. Pairs from (k2, v2) only fill in keys that are
    still absent, so for a key repeated within k2 the first occurrence is kept.

    Returns the merged dictionary.
    """
    merged = {}
    for key, value in zip(k1, v1):
        merged[key] = value
    for key, value in zip(k2, v2):
        # Never overwrite an entry that is already present.
        merged.setdefault(key, value)
    return merged

In [13]:
# Lookup tables built from both ends of every edge: node id -> content id,
# node id -> page path, and page path -> node id.
nid_cid = build_dict(edges["source"], edges["src_cid"], edges["target"], edges["dest_cid"])
nid_url = build_dict(edges["source"], edges["src_node"], edges["target"], edges["dest_node"])
url_nid = build_dict(edges["src_node"], edges["source"], edges["dest_node"], edges["target"])

In [14]:
# Directed graph with one edge per (source, target) row of the edge list.
graph = nx.DiGraph()
for src, dest in zip(edges["source"], edges["target"]):
    graph.add_edge(src, dest)

# Attach the content id and page path to every node.
attrs = {
    nid: dict(cid=nid_cid[nid], url=nid_url[nid])
    for nid in graph.nodes
}
nx.set_node_attributes(graph, attrs)

In [15]:
# Spot-check the attributes stored on one node (58314 is '/visa-fees' in the recorded output).
graph.nodes[58314]

{'cid': '1e333395-5dd5-4452-96a3-fbe939928761', 'url': '/visa-fees'}

In [16]:
# Confirm which attribute names were set on the node.
graph.nodes[58314].keys()

dict_keys(['cid', 'url'])

In [17]:
## Node2Vec

In [None]:
# Precompute probabilities and generate walks
# Recorded run: the progress bar total was 144496, transition probabilities took about
# 10 minutes, and walk generation had reached 31/200 after 33 minutes with workers=1.
# NOTE(review): no random seed is passed here, so the walks presumably differ between
# runs — confirm whether reproducibility matters for the results saved later.
print(datetime.now().strftime("%H:%M:%S"), "Computing probs and walks...")
node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=1)  
print(datetime.now().strftime("%H:%M:%S"), "End")

Computing transition probabilities:   0%|          | 0/144496 [00:00<?, ?it/s]

23:32:29 Computing probs and walks...


Computing transition probabilities: 100%|██████████| 144496/144496 [10:02<00:00, 239.63it/s]
Generating walks (CPU: 1):  16%|█▌        | 31/200 [33:30<3:11:45, 68.08s/it]

In [None]:
### Compute node embeddings

In [None]:
# Fit the embedding model on the generated walks; later cells query it through `model.wv`.
# NOTE(review): the keyword arguments are presumably forwarded to gensim's Word2Vec — confirm.
print(datetime.now().strftime("%H:%M:%S"), "Fitting model...")
model = node2vec.fit(window=10, min_count=1, batch_words=4)  
print(datetime.now().strftime("%H:%M:%S"), "Finished fitting model...")

In [None]:
# Look for most similar nodes
# The query key is a node id written as a string; integer ids are converted with str()/int()
# elsewhere in this notebook for the same reason.
model.wv.most_similar('2', topn=10)  # Output node names are always strings

In [None]:
# Example page, identified by its path.
str_node = "/sold-bought-vehicle"
# Alternative example paths (presumably ones tried earlier):
#   "/sold-bought-vehicle"
#   "/government/publications/below-the-radar-low-level-disruption-in-the-countrys-classrooms"
# Node id (as a string) of the first edge whose src_node equals the path.
# Raises IndexError if the path never appears in the src_node column.
target = str(edges[edges.src_node == str_node].source.values[0])
target

In [None]:
# Print the paths of up to 10 of the 50 nearest neighbours of `target`, skipping
# neighbours that share the target's content id, "/browse" pages, and content ids
# that were already printed.
cids = []
count = 0
for nid, prob in model.wv.most_similar(target, topn=50):
    if nid_cid[int(target)] == nid_cid[int(nid)]:
        continue
    if "/browse" in nid_url[int(nid)]:
        continue
    if nid_cid[int(nid)] in cids:
        continue
    print(nid_url[int(nid)])
    cids.append(nid_cid[int(nid)])
    count += 1
    # The counter only changes right above, so this is the only place the limit can be hit.
    if count == 10:
        break

In [None]:
## "Predict" for relevant brexit pages

# Load the tab-separated page list; only the 'Page' and 'Page Views' columns are read.
top = pd.read_csv(os.path.join(DATA_DIR, "top-pages-govuk-feb19.tsv"), sep='\t', usecols=['Page', 'Page Views'])

# Preview the first rows.
top.head()

In [None]:
# NOTE(review): this whole cell is disabled, yet it is the only visible place where
# `cid_title` is defined, and `cid_title` is used when titles are mapped onto the
# results further down. Either restore it or define `cid_title` elsewhere.
# ### Set up content item titles for human assessment
# labelled_file = os.path.join(DATA_DIR, "content_api", "labelled.csv.gz")
# labelled = pd.read_csv(labelled_file, compression="gzip")
# base_cid = dict(zip(labelled.base_path, labelled.content_id))
# cid_title = dict(zip(labelled.content_id, labelled.title))
# top["content_id"] = top["Page"].map(base_cid)
# top.dropna(subset=["content_id"], inplace=True)

In [None]:
def generate_vectors(vector_list):
    """Yield each two-element entry of `vector_list` as a tuple, one at a time.

    Every entry is unpacked into exactly two values, so an entry of any other
    length raises ValueError when it is reached.
    """
    for entry in vector_list:
        node_id, score = entry
        yield node_id, score

In [None]:
# Wrap the 1000 most-similar results for node "0" in a generator.
vecs = generate_vectors(model.wv.most_similar("0", topn=1000))

In [None]:
# Pull the first (node id, value) pair from the generator and display it.
# NOTE(review): the value is named `prob`, but most_similar presumably returns a
# similarity score rather than a probability — confirm against the gensim docs.
nid, prob = next(vecs)
nid, prob

In [None]:
## Compute related links for a set of pages

In [None]:
# Every content id that appears at either end of an edge.
cids_edgeset = set(edges["src_cid"]).union(edges["dest_cid"])

# Rebuild the lookup tables (same calls as in the earlier dictionary cell).
nid_cid = build_dict(edges["source"], edges["src_cid"], edges["target"], edges["dest_cid"])
nid_url = build_dict(edges["source"], edges["src_node"], edges["target"], edges["dest_node"])
url_nid = build_dict(edges["src_node"], edges["source"], edges["dest_node"], edges["target"])

In [None]:
def compute_top_n(df_path, n):
    """Collect up to `n` recommended links for every page in `df_path`.

    For each page path that has a node id in `url_nid`, the 1000 nearest
    neighbours returned by `model.wv.most_similar` are scanned in order and a
    neighbour is kept unless:

    * it has the same content id as the page itself,
    * its path contains "/topic" or "/browse",
    * its content id was already kept for this page, or
    * its content id is listed in `cid_link_cids` for the page's content id.

    Parameters
    ----------
    df_path : object exposing `.values` (e.g. a pandas Series)
        Page paths to compute links for.
    n : int
        Maximum number of links to keep per page. Fewer are returned when the
        1000 candidates are exhausted first.

    Returns
    -------
    (pandas.DataFrame, list)
        One row per kept link (columns: nid, cid, base_path, link, link_cid,
        probability) and the list of page paths that have no node id.

    Notes
    -----
    Reads the notebook globals `url_nid`, `nid_cid`, `nid_url`, `model` and
    `cid_link_cids`. NOTE(review): `cid_link_cids` is not defined in the visible
    part of the notebook; presumably it maps a content id to the content ids it
    already links to — confirm, and make sure it is defined before this runs.
    """
    excluded_sections = ("/topic", "/browse")
    pages_links = []
    missing = []
    for page in df_path.values:
        if page not in url_nid:
            missing.append(page)
            continue

        target = str(url_nid[page])  # model keys are strings
        target_nid = int(target)
        target_cid = nid_cid[target_nid]
        kept_cids = set()
        # Iterating the candidate list directly stops cleanly when it runs out,
        # instead of letting a bare next() raise StopIteration out of this function.
        for nid, prob in model.wv.most_similar(target, topn=1000):
            # Stop at exactly n links (the former `count <= n` test collected n + 1).
            if len(kept_cids) >= n:
                break
            link_nid = int(nid)
            link_cid = nid_cid[link_nid]
            link_url = nid_url[link_nid]
            if (link_cid == target_cid
                    or any(section in link_url for section in excluded_sections)
                    or link_cid in kept_cids
                    or link_cid in cid_link_cids[target_cid]):
                continue

            kept_cids.add(link_cid)
            pages_links.append({"nid": target_nid,
                                "cid": target_cid,
                                "base_path": page,
                                "link": link_url,
                                "link_cid": link_cid,
                                "probability": round(prob, 3)})

    return pd.DataFrame(pages_links), missing

In [None]:
# Compute recommended links for every page path in `content.base_path`.
# NOTE(review): `content` is not defined in any visible cell, and `cid_title` (used below)
# only appears in a commented-out cell — both must exist in the kernel for this to run.
# NOTE(review): despite the name `top_50_links`, the call asks for n=10.
top_50_links, misses = compute_top_n(content.base_path, 10)

# NOTE(review): this expression and the filter below are evaluated but never shown,
# because only the last expression of a cell is displayed.
top_50_links.shape, len(misses), len(set(content.base_path.values))-len(misses)

top_50_links[top_50_links.base_path == "/universal-credit"]

# Add human-readable titles for the page and for the recommended link.
top_50_links['title'] = top_50_links['cid'].map(cid_title)
top_50_links['link_title'] = top_50_links['link_cid'].map(cid_title)


# Write the results without the index; the "results" folder under DATA_DIR must already exist.
top_50_links.to_csv(os.path.join(DATA_DIR, "results",\
                                 "n2v_struct_funct_results_bre.csv"), index=False)

In [None]:
## Save out resulting node/edge embeddings and model.

In [None]:
# Output locations inside the `models` directory.
EMBEDDING_FILENAME = os.path.join(models, "n2v_node_embeddings")
EMBEDDING_MODEL_FILENAME = os.path.join(models, "n2v.model")
# NOTE(review): not referenced by any visible cell.
EDGES_EMBEDDING_FILENAME = os.path.join(models, "n2v_edge_embeddings")

In [None]:
# Load a previously saved model from disk, replacing any `model` already in the kernel.
# NOTE(review): on a top-to-bottom run this executes before the save cell, so the model
# fitted earlier is overwritten by the copy on disk (or this fails if no file exists yet).
# Confirm whether this cell is meant to run only when training is skipped.
model =  Word2Vec.load(EMBEDDING_MODEL_FILENAME)

In [None]:
# Save embeddings for later use
# Writes the node vectors held in `model.wv` to EMBEDDING_FILENAME in word2vec format.
model.wv.save_word2vec_format(EMBEDDING_FILENAME)
# Save model for later use
# Writes the full model to EMBEDDING_MODEL_FILENAME, the same path the load cell reads.
model.save(EMBEDDING_MODEL_FILENAME)

## Check some results, load saved trained model from above