In [1]:
import networkx as nx
import pandas as pd
import json
import os
from networkx.readwrite import json_graph
import numpy as np
from datetime import datetime
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder
from ast import literal_eval
from gensim.models import Word2Vec

In [2]:
### Directories

In [3]:
DATA_DIR = os.getenv("DATA_DIR")
train_network = os.path.join(DATA_DIR, "train_network")
models = os.path.join(os.path.dirname(DATA_DIR), "models")
content_api = os.path.join(DATA_DIR, "content_api")
os.listdir(train_network)

edgefile = os.path.join(train_network, "merged_edges_struct_funct.csv.gz")
edges = pd.read_csv(edgefile, compression="gzip")

In [4]:
### Initialize `base_path`,  `content_id` and `feature` dictionaries

def build_dict(k1,v1,k2,v2):
    agg_dict = dict(zip(k1,v1))
    for key,value in zip(k2,v2):
        if key not in agg_dict.keys():
            agg_dict[key] = value
    return agg_dict   

In [5]:
## Save out resulting node/edge embeddings and model.
EMBEDDING_MODEL_FILENAME = os.path.join(models, "n2v.model")

In [6]:
## Check some results, load saved trained model from above

model =  Word2Vec.load(EMBEDDING_MODEL_FILENAME)

In [7]:
str_node = "/sold-bought-vehicle"
# "/sold-bought-vehicle"/government/publications/below-the-radar-low-level-disruption-in-the-countrys-classrooms
target = str(edges[edges.src_node == str_node].source.values[0])
target

'61415'

In [8]:
count = 0
cids = []
for nid, prob in model.wv.most_similar(target, topn=50):
    if nid_cid[int(target)] != nid_cid[int(nid)] and "/browse" not in nid_url[int(nid)]\
    and nid_cid[int(nid)] not in cids:
        print(nid_url[int(nid)])
        count+=1
        cids.append(nid_cid[int(nid)])
    if count == 10:
        break

/make-a-sorn
/vehicle-tax-refund
/vehicle-log-book
/car-tax-disc-without-v11-reminder
/responsibilities-selling-vehicle
/sorn-statutory-off-road-notification
/contact-the-dvla
/government/publications/application-for-a-vehicle-registration-certificate
/check-vehicle-tax
/written-off-vehicle


  if np.issubdtype(vec.dtype, np.int):


## "Predict" for top 50 pages

In [40]:
## Brexit docs
bq = os.path.join(DATA_DIR, "bq_journey_extract")
bfile = os.path.join(bq, "Brexit-related documents - Brexit docs.tsv")
df1 = pd.read_csv(bfile, sep="\t")
df1.columns

Index(['1917', 'Unnamed: 1', 'Last run on: ', '2019-03-31 09:42:51',
       'Unnamed: 4'],
      dtype='object')

In [41]:
df1.drop(['Last run on: ', '2019-03-31 09:42:51',
       'Unnamed: 4'], axis=1, inplace=True)
df1.rename(columns={"1917":"URL","Unnamed: 1":"Pageviews"}, inplace=True)
df1.drop([0,1], inplace=True)
df1 = df1[~df1.URL.str.contains("/print/")]
df1['Pageviews'] = df1['Pageviews'].map(lambda x: x.replace(',', '') 
                                        if isinstance(x,str) else np.nan)
df1['Pageviews'] = df1['Pageviews'].astype("float")
df1.shape

(1562, 2)

In [44]:
df1.Pageviews.describe()

count      1381.000000
mean       2206.220854
std       16089.879061
min           2.000000
25%           6.000000
50%          34.000000
75%         266.000000
max      341728.000000
Name: Pageviews, dtype: float64

In [45]:
df1 = df1[df1.Pageviews>1]
df1.shape

(1381, 2)

In [46]:
df1.head()

Unnamed: 0,URL,Pageviews
2,/guidance/international-driving-permits-for-uk...,341728.0
3,/staying-uk-eu-citizen,311799.0
4,/settled-status-eu-citizens-families,185997.0
5,/prepare-eu-exit,138898.0
6,/settled-status-eu-citizens-families/applying-...,129524.0


In [None]:
### Generate vectors

In [None]:
def generate_vectors(vector_list):
    for nid, prob in vector_list:
        yield nid, prob

In [47]:
## Test funct
vecs = generate_vectors(model.wv.most_similar("0", topn=1000))
nid, prob = next(vecs)
nid, prob

  if np.issubdtype(vec.dtype, np.int):


('22461', 0.8493591547012329)

In [48]:
## Compute related links for a set of pages
nid_cid = build_dict(edges.source, edges.src_cid, edges.target, edges.dest_cid)
nid_url = build_dict(edges.source, edges.src_node, edges.target, edges.dest_node)
url_nid = build_dict(edges.src_node, edges.source, edges.dest_node, edges.target)

In [82]:
ignore_urls = ["/topic","/browse", "/help", "/cookies", "/email", "/y/", "/no/"]
def compute_top_n(df_path, n):
    pages_links = []
    missing = []
    for page in df_path.values:
        if page in url_nid.keys():
            target = str(url_nid[page])
            count = 0
            cids = []
            vecs = generate_vectors(model.wv.most_similar(target, topn=2000))
            while count <= n:
                nid, prob = next(vecs)
                if nid_cid[int(target)] != nid_cid[int(nid)]\
                and all(cond not in nid_url[int(nid)] for cond in ignore_urls)\
                and all(not nid_url[int(nid)].endswith(cond) for cond in ["/y", "/no"])\
                and nid_cid[int(nid)] not in cids:
                
#                 and nid_cid[int(nid)] not in cid_link_cids[nid_cid[int(target)]]:
                    
                    cids.append(nid_cid[int(nid)])
                    page_link = {"nid":int(target),
                                 "cid":nid_cid[int(target)],
                                 "base_path": page, 
                                 "link":nid_url[int(nid)], 
                                 "link_cid":nid_cid[int(nid)],
                                 "probability":round(prob,3)}
                    pages_links.append(page_link)
                    count+=1
        else:
            missing.append(page)
#             print("Page {} is missing from training set".format(page))
            
    return pd.DataFrame(pages_links), missing

In [88]:
top_250 = df1.sort_values("Pageviews", ascending=False).URL
results, misses = compute_top_n(top_250, 10)
results.shape, len(misses), len(set(top_250))-len(misses)

((6127, 6), 824, 557)

In [93]:
top_250.shape

(1381,)

In [89]:
results

Unnamed: 0,base_path,cid,link,link_cid,nid,probability
0,/guidance/international-driving-permits-for-uk...,87402d3f-e799-4d32-bfed-b0880f7c3b4e,/guidance/prepare-to-drive-in-the-eu-after-brexit,9f1621c2-1d3c-4a91-ad71-5366d3270dae,23749,0.965
1,/guidance/international-driving-permits-for-uk...,87402d3f-e799-4d32-bfed-b0880f7c3b4e,/government/publications/driving-in-the-eu-if-...,be15abf2-86fc-4bfc-9f76-edaac6346799,23749,0.878
2,/guidance/international-driving-permits-for-uk...,87402d3f-e799-4d32-bfed-b0880f7c3b4e,/guidance/passenger-travel-to-the-eu-by-air-ra...,dc917169-0dbb-48ea-b8be-afd8573cf6e7,23749,0.860
3,/guidance/international-driving-permits-for-uk...,87402d3f-e799-4d32-bfed-b0880f7c3b4e,/government/publications/prepare-to-drive-in-t...,c1a32196-47f3-47bb-a199-f03d49db6d92,23749,0.846
4,/guidance/international-driving-permits-for-uk...,87402d3f-e799-4d32-bfed-b0880f7c3b4e,/guidance/trailer-registration,f8ce6166-5069-4a43-a92d-5628027106ec,23749,0.838
5,/guidance/international-driving-permits-for-uk...,87402d3f-e799-4d32-bfed-b0880f7c3b4e,/prepare-eu-exit/transport,2d4ac9b6-d783-47d6-a3ce-4c51b0e697f9,23749,0.805
6,/guidance/international-driving-permits-for-uk...,87402d3f-e799-4d32-bfed-b0880f7c3b4e,/driving-abroad/international-driving-permit,e4d06cb9-9e2e-4e82-b802-0aad013ae16c,23749,0.781
7,/guidance/international-driving-permits-for-uk...,87402d3f-e799-4d32-bfed-b0880f7c3b4e,/prepare-eu-exit/going-and-being-abroad,d37a4cad-42ee-4a82-bb91-7de603848c72,23749,0.774
8,/guidance/international-driving-permits-for-uk...,87402d3f-e799-4d32-bfed-b0880f7c3b4e,/vehicle-insurance/driving-abroad,5305660e-bfba-4c41-b170-96dcfcf88099,23749,0.753
9,/guidance/international-driving-permits-for-uk...,87402d3f-e799-4d32-bfed-b0880f7c3b4e,/taking-vehicles-out-of-uk/for-less-than-12-mo...,406a1b8d-acaa-46fe-9296-f5a05508a303,23749,0.742


In [90]:
labelled_file = os.path.join(DATA_DIR, "content_api", "labelled.csv.gz")
labelled = pd.read_csv(labelled_file, 
                       compression="gzip", 
                       usecols=["base_path", "content_id", "title"])
base_cid = dict(zip(labelled.base_path, labelled.content_id))
cid_title = dict(zip(labelled.content_id, labelled.title))

In [91]:
results['title'] = results['cid'].map(cid_title)
results['link_title'] = results['link_cid'].map(cid_title)

In [92]:
results.to_csv(os.path.join(DATA_DIR, "results",\
                                 "n2v_brexit_results_orig_n2v.csv"), index=False)