This notebook is similar to, but less detailed than, the networkx tutorial. In addition, it includes code for community detection, which works on an undirected graph. It also demonstrates the GOV.UK search API, which can be used to filter pages for a specific taxon and so generate a page list.

In [15]:
import gzip
import json
import operator
import os
import urllib.request
from collections import Counter

import networkx as nx
import numpy as np
import pandas as pd

## Load in data

In [2]:
# Directory that holds the node/edge files, located under the directory named
# by the DATA_DIR environment variable.
data_dir = os.getenv("DATA_DIR")
if data_dir is None:
    # Fail fast with an actionable message; os.path.join(None, ...) would
    # otherwise raise a TypeError that does not mention DATA_DIR at all.
    raise EnvironmentError(
        "The DATA_DIR environment variable is not set; "
        "point it at the directory that contains 'network_data'."
    )
output_dir = os.path.join(data_dir, "network_data")

In [3]:
# Full paths of the gzip-compressed node and edge files inside output_dir.
nodes_path, edges_path = (
    os.path.join(output_dir, file_name)
    for file_name in (
        "for_networkx_tutorial_nodes.csv.gz",
        "for_networkx_tutorial_edges.csv.gz",
    )
)

In [4]:
# Disabled sanity check of the raw nodes file: it prints the first line, then
# every later line that does not split into exactly two tab-separated fields.
# with gzip.open(nodes_path,"rt") as file:
#     print(file.readline())
#     for i,line in enumerate(file):
#         if len(line.replace("\n","").split("\t")) != 2:
#             print(line)

In [5]:
# Edge table, read from the tab-separated, gzip-compressed edges file.
df_edges = pd.read_csv(
    edges_path,
    compression="gzip",
    sep="\t",
)

In [6]:
# Node table, read from the tab-separated, gzip-compressed nodes file.
df_nodes = pd.read_csv(
    nodes_path,
    compression="gzip",
    sep="\t",
)

In [7]:
# Show the first five node rows as a quick look at the loaded table.
df_nodes.head(n=5)

Unnamed: 0,Node,Node_id
0,/,62
1,/ /t _blank,26028
2,/ vehicle tax,41504
3,/ vehicle-tax,21925
4,/-tax,43496


## Initialize graph

In [8]:
# Empty directed graph; the following cell fills it with one weighted edge per row of df_edges.
graph = nx.DiGraph()

In [9]:
# Add every row of df_edges as a directed edge whose "weight" attribute is the row's Weight.
graph.add_weighted_edges_from(
    df_edges[["Source_node", "Destination_node", "Weight"]].itertuples(index=False, name=None),
    weight="weight",
)

In [14]:
# Print the networkx summary of the full graph, leaving out its first line.
summary_lines = iter(nx.info(graph).split("\n"))
next(summary_lines)  # discard the first line; split() always yields at least one item
for summary_line in summary_lines:
    print(summary_line)

Type: DiGraph
Number of nodes: 58743
Number of edges: 154219
Average in degree:   2.6253
Average out degree:   2.6253


In [20]:
# Preview the first ten edges together with their attribute dicts.
# Pairing the edge view with range(10) stops after ten edges, so the whole edge
# list is never copied just to look at a handful of entries. The former bare
# `list(graph.edges())[0:10]` statement was dropped: it was not the last
# expression of the cell, so its value was built and then thrown away.
for edge_number, (source, dest, data) in zip(range(10), graph.edges(data=True)):
    print(source, dest, data)

/sure-start-maternity-grant/how-to-claim /government/publications/sure-start-maternity-grant-claim-form {'weight': 2177.0}
/sure-start-maternity-grant/how-to-claim /sure-start-maternity-grant/eligibility {'weight': 355.0}
/sure-start-maternity-grant/how-to-claim /sure-start-maternity-grant/how-to-claim {'weight': 267.0}
/sure-start-maternity-grant/how-to-claim /sure-start-maternity-grant {'weight': 159.0}
/sure-start-maternity-grant/how-to-claim /sure-start-maternity-grant/what-youll-get {'weight': 37.0}
/sure-start-maternity-grant/how-to-claim /browse/childcare-parenting/pregnancy-birth {'weight': 2.0}
/sure-start-maternity-grant/how-to-claim /child-tax-credit {'weight': 2.0}
/sure-start-maternity-grant/how-to-claim /government/publications/new-dwp-postal-addresses {'weight': 6.0}
/sure-start-maternity-grant/how-to-claim /sign-in-universal-credit {'weight': 5.0}
/sure-start-maternity-grant/how-to-claim /benefits-calculators {'weight': 3.0}


In [57]:
### Filter graph
# Keep only the edges whose source or destination URL contains "brexit".
# (An earlier variant also kept edges with Weight > 100; that condition is disabled.)
graph_frequent = nx.DiGraph()
edge_columns = zip(df_edges["Source_node"], df_edges["Destination_node"], df_edges["Weight"])
for source_url, destination_url, edge_weight in edge_columns:
    if 'brexit' in source_url or 'brexit' in destination_url:
        graph_frequent.add_edge(source_url, destination_url, weight=edge_weight)

In [58]:
# Print the networkx summary of the filtered graph, leaving out its first line.
summary_lines = iter(nx.info(graph_frequent).split("\n"))
next(summary_lines)  # discard the first line; split() always yields at least one item
for summary_line in summary_lines:
    print(summary_line)

Type: DiGraph
Number of nodes: 413
Number of edges: 720
Average in degree:   1.7433
Average out degree:   1.7433


## Centralities

In [59]:
# Compute three centrality measures on the filtered graph; each line is timed with %time.
%time degree_centralities = nx.degree_centrality(graph_frequent)
# NOTE(review): the two calls below hand the "weight" edge attribute to networkx.
# Its documentation describes that weight as a distance for shortest paths, whereas the
# weights loaded here look like counts (larger = more used) - confirm this is intended.
%time betweenness_centralities = nx.betweenness_centrality(graph_frequent, weight="weight")
%time load_centralities = nx.load_centrality(graph_frequent, weight="weight")

CPU times: user 658 µs, sys: 34 µs, total: 692 µs
Wall time: 697 µs
CPU times: user 401 ms, sys: 3.29 ms, total: 404 ms
Wall time: 403 ms
CPU times: user 202 ms, sys: 513 µs, total: 203 ms
Wall time: 203 ms


In [60]:
def centralities_print(centralities, i, keyword="brexit"):
    """Print the highest-scoring node, then the rank of every node matching `keyword`.

    Parameters
    ----------
    centralities : dict
        Maps each node (a string) to its centrality score.
    i : int
        Currently unused. It is kept so existing calls keep working; the
        disabled code it belonged to used it as a "top i" cut-off.
    keyword : str, optional
        Only nodes whose key contains this substring are listed.
        Defaults to "brexit", which was previously hard-coded.

    Returns
    -------
    None
        Everything is written to standard output. Ranks are 1-based and are
        taken from the full descending ordering, not just the matching nodes.
    """
    if not centralities:
        # max()/indexing on an empty mapping would raise; report it instead.
        print("No centralities to report.")
        return

    # One stable descending sort gives both the maximum and the ranking;
    # among equal scores the original dict order is kept.
    ranked = sorted(centralities.items(), key=operator.itemgetter(1), reverse=True)
    max_key, max_value = ranked[0]
    print("Maximum:", max_key, ":", max_value, "\n")

    for rank, (key, value) in enumerate(ranked, start=1):
        if keyword in key:
            print(rank, key, ":", value)

In [56]:
# Report where the "brexit" pages rank by degree centrality.
centralities_print(centralities=degree_centralities, i=20)

Maximum: / : 0.062458361092604935 

11 /guidance/passport-rules-for-travel-to-europe-after-brexit : 0.011325782811459028
21 /government/brexit : 0.007994670219853431
83 /guidance/prepare-to-drive-in-the-eu-after-brexit : 0.003997335109926716
152 /government/news/new-law-proposed-to-safeguard-uk-citizens-healthcare-abroad-after-brexit : 0.002831445702864757
190 /government/publications/travelling-to-the-eu-with-a-uk-passport-if-theres-no-brexit-deal/travelling-to-the-eu-with-a-uk-passport-if-theres-no-brexit-deal : 0.002498334443704197
199 /government/news/ip-and-brexit-the-facts : 0.002498334443704197
227 /world/brexit-ireland : 0.002331778814123917
230 /government/publications/trading-with-the-eu-if-theres-no-brexit-deal/trading-with-the-eu-if-theres-no-brexit-deal : 0.002331778814123917
337 /government/publications/meeting-climate-change-requirements-if-theres-no-brexit-deal/meeting-climate-change-requirements-if-theres-no-brexit-deal : 0.0018321119253830779
357 /government/publicati

## Out edges

In [63]:
# List the pages reached from `target` in the full graph, heaviest edge first.
target = '/guidance/passport-rules-for-travel-to-europe-after-brexit'
edge_weights = {
    destination: attributes['weight']
    for _source, destination, attributes in graph.out_edges([target], data=True)
}

# Sort ascending by weight, then walk the result backwards (this also reverses ties).
ascending_by_weight = sorted(edge_weights.items(), key=operator.itemgetter(1))
for destination, weight in ascending_by_weight[::-1]:
    print("%s: %s" % (destination, weight))

/guidance/passport-rules-for-travel-to-europe-after-brexit: 1382.0
/government/publications/travelling-to-the-eu-with-a-uk-passport-if-theres-no-brexit-deal: 747.0
/government/brexit: 173.0
/apply-renew-passport: 120.0
/foreign-travel-advice: 93.0
/government/publications/travelling-to-the-eu-with-a-uk-passport-if-theres-no-brexit-deal/travelling-to-the-eu-with-a-uk-passport-if-theres-no-brexit-deal: 51.0
/government/news/blue-uk-passport-to-return-after-eu-exit: 42.0
/guidance/advice-for-british-nationals-travelling-and-living-in-europe: 34.0
/renew-adult-passport/renew: 31.0
/going-and-being-abroad/passports: 28.0
/going-and-being-abroad/travel-abroad: 26.0
/foreign-travel-advice/spain/entry-requirements: 23.0
/print/guidance/passport-rules-for-travel-to-europe-after-brexit: 20.0
/foreign-travel-advice/france/entry-requirements: 20.0
/: 19.0
/foreign-travel-advice/sweden/entry-requirements: 8.0
/get-a-passport-urgently: 8.0
/foreign-travel-advice/netherlands/entry-requirements: 6.0
/

## Search API

In [None]:
# GOV.UK search API query filtered on the "brexit" policy, with count=1000.
url_policy = 'https://www.gov.uk/api/search.json?filter_policies=brexit&count=1000'
# Same API filtered on a taxon id, with count=1000.
# NOTE(review): which taxon this UUID identifies is not visible here - confirm.
url_taxon = 'https://www.gov.uk/api/search.json?filter_taxons=d6c2de5d-ef90-45d1-82d4-5f2438369eea&count=1000'

def search_api(url):
    """Fetch a search API URL and return the `link` of every result.

    Parameters
    ----------
    url : str
        Full URL to request. The response body must be UTF-8 encoded JSON
        with a top-level "results" list whose items each have a "link" key.

    Returns
    -------
    list
        The "link" values, in the order the results were returned.

    Side effects
    ------------
    Prints how many links were found for `url`.
    """
    # The context manager closes the HTTP response as soon as the body has been
    # read, including when reading or decoding fails.
    with urllib.request.urlopen(url) as conn:
        results = conn.read().decode('utf-8')
    json_dict = json.loads(results)
    links = [x['link'] for x in json_dict['results']]
    print("Found",len(links),"pages in",url)
    return links

links_policy = search_api(url_policy)
links_taxon = search_api(url_taxon)

# Append the taxon links that are not already present, keeping first-seen order.
# Membership is checked against a set, so the merge stays linear; testing
# `x not in links_policy` on the list rescans it for every taxon link.
# Assumes the links are hashable (presumably strings) - TODO confirm.
seen_links = set(links_policy)
for link in links_taxon:
    if link not in seen_links:
        seen_links.add(link)
        links_policy.append(link)
print("Total pages:",len(links_policy))

## Community detection

In [None]:
undir = graph.to_undirected()
%time louvain_comm = community.best_partition(undir)
print(len(set(louvain_comm.values())))

len_counter = Counter()
comm_nodes = {}
len_comms = {}
for comm in set(louvain_comm.values()) :
    list_nodes = [nodes for nodes in louvain_comm.keys()
                                if louvain_comm[nodes] == comm]
    comm_nodes[comm] = list_nodes
    length = len(list_nodes)
    len_counter[length]+=1
    if length in len_comms:
        len_comms[length].append(comm)
    else:
        len_comms[length] = [comm]
print(len_counter.most_common(5))