In [None]:
### TODO: probably merge a month's worth of data; nonsense/404 URLs are surprisingly frequent

In [1]:
import pandas as pd
import os

### Load in edge data

In [24]:
# Root of the data tree, read from the DATA_DIR environment variable.
DATA_DIR = os.getenv("DATA_DIR")
if DATA_DIR is None:
    # Without this check os.path.join(None, ...) fails with an opaque TypeError.
    raise RuntimeError("The DATA_DIR environment variable is not set; export it before running this notebook.")
# Sub-directory of DATA_DIR that the edge files below are read from and written to.
processed_network = os.path.join(DATA_DIR, "processed_network")
# List the files in that directory whose name contains "edges".
[f for f in os.listdir(processed_network) if "edges" in f]

['feb_01_10_doo_nos_edges.csv.gz',
 'feb_11_18_nos_edges.csv.gz',
 'edges_graphsage_test_feb_01_18_doo.csv.gz',
 'for_networkx_tutorial_edges.csv.gz',
 'feb_11_18_doo_nos_edges.csv.gz']

In [77]:
# Read the tab-separated, gzipped "graphsage_test" file and show its dimensions.
gt_path = os.path.join(processed_network, "graphsage_test.csv.gz")
gt = pd.read_csv(gt_path, sep="\t", compression="gzip")
gt.shape

(454260, 3)

In [78]:
# Full paths of the two edge files that get stacked into a single frame.
processed_list = [
    os.path.join(processed_network, fname)
    for fname in ("feb_11_18_doo_nos_edges.csv.gz", "feb_11_18_nos_edges.csv.gz")
]
# Each file is a tab-separated, gzipped CSV; rows are concatenated in list order.
feb_doo = pd.concat(
    pd.read_csv(path, sep="\t", compression="gzip") for path in processed_list
)
feb_doo.shape

(3073339, 5)

In [79]:
# Peek at the first five rows of the concatenated edge data.
feb_doo.iloc[:5]

Unnamed: 0,Source_node,Source_id,Destination_node,Destination_id,Weight
0,/,0,/,0,92079
1,/jobsearch,1,/jobsearch,1,50413
2,/government/organisations/hm-revenue-customs/c...,2,/government/organisations/hm-revenue-customs/c...,2,5614
3,/check-mot-history,3,/check-mot-history,3,193937
4,/government/organisations/hm-revenue-customs/c...,4,/government/organisations/hm-revenue-customs/c...,4,4369


In [80]:
# Drop rows whose source and destination are the same node (self-loops).
is_self_loop = feb_doo.Source_node == feb_doo.Destination_node
feb_doo = feb_doo[~is_self_loop]
feb_doo.shape

(2926418, 5)

In [81]:
# Smallest edge weight remaining after the self-loop filter.
feb_doo["Weight"].min()

1

In [82]:
# Lookup from (source, target) pair to weight, built from the `gt` frame.
# If a pair occurs more than once in `gt`, the last occurrence wins.
edge_weight = {
    (src, dst): w for src, dst, w in zip(gt.source, gt.target, gt.weight)
}

In [83]:
# Spot check: is this particular (source, target) pair a key of the lookup?
(
    '/government/publications/guidance-for-dependants-of-uk-visa-applicants-tiers-1-2-4-5',
    '/visa-fees',
) in edge_weight

True

In [84]:
# Merge the `feb_doo` edge weights into the `gt` edge weights.
# The lookup is rebuilt from `gt` on every execution so that the cell is
# idempotent: accumulating into an `edge_weight` left over from an earlier run
# of this cell would add the `feb_doo` weights a second time.
edge_weight = dict(zip(zip(gt.source, gt.target), gt.weight))
for src, dest, weight in zip(feb_doo.Source_node, feb_doo.Destination_node, feb_doo.Weight):
    edge = (src, dest)
    # Pairs that are not in `gt` start from 0, i.e. they end up with the sum of
    # their `feb_doo` weights; pairs already present have the weight added.
    edge_weight[edge] = edge_weight.get(edge, 0) + weight
# Number of distinct (source, destination) pairs after merging.
len(edge_weight)

2753533

In [85]:
# Flatten the {(src, dest): weight} lookup into one record per edge.
rowlist = [
    {"src_node": pair[0], "dest_node": pair[1], "weight": total}
    for pair, total in edge_weight.items()
]
# The column order is fixed explicitly rather than left to dict ordering.
edges_merged = pd.DataFrame(rowlist, columns=["src_node", "dest_node", "weight"])
edges_merged.head()

Unnamed: 0,src_node,dest_node,weight
0,/government/publications/guidance-for-dependan...,/visa-fees,66
1,/visa-fees,/find-a-visa-application-centre,1164
2,/entering-staying-uk/family-visas,/uk-family-visa,377
3,/uk-family-visa,/uk-family-visa/partner-spouse,26227
4,/uk-family-visa/partner-spouse,/government/publications/application-for-uk-vi...,2733


In [86]:
# Dimensions of the merged edges, next to those of the edges with weight exactly 2.
edges_merged.shape, edges_merged.loc[edges_merged.weight.eq(2)].shape

((2753533, 3), (245797, 3))

In [87]:
# Summary statistics of the weights, rendered as fixed-point strings
# (avoids scientific notation in the displayed output).
edges_merged.weight.describe().map(lambda stat: format(stat, 'f'))

count    2753533.000000
mean          40.581202
std         6529.100705
min            1.000000
25%            1.000000
50%            1.000000
75%            2.000000
max      8308677.000000
Name: weight, dtype: object

In [93]:
# How many merged edges have a weight of at least 15?
edges_merged.loc[edges_merged.weight.ge(15)].shape

(217703, 3)

### Preliminary filtered dataset

In [96]:
# Independent (deep) copy of the merged edges whose weight is at least 15.
heavy_enough = edges_merged.weight >= 15
edges_fm = edges_merged.loc[heavy_enough].copy()

In [95]:
# All merged edges that start at the "/send-rent" node.
edges_merged.loc[edges_merged.src_node.eq("/send-rent")]

Unnamed: 0,src_node,dest_node,weight
178171,/send-rent,/send-rent-lease-details,32


### Check whether the URLs here exist in any of `content`, `content_api_extract` or `labelled`

In [40]:
# Show the entries directly under DATA_DIR.
os.listdir(DATA_DIR)

['not_found_urls.csv',
 '.DS_Store',
 'predict_network',
 'content_ids.csv',
 'content_api',
 'train_network',
 'processed_network',
 'content_json.csv.gz',
 'top-pages-govuk-feb19.tsv',
 'processed',
 'raw',
 'content_api_links.csv.gz']

In [41]:
# Content API directory under DATA_DIR, and its "07-02-19" sub-directory.
content_api = os.path.join(DATA_DIR, "content_api")
content_api_feb = os.path.join(DATA_DIR, "content_api", "07-02-19")
# Display the contents of both directories side by side.
tuple(os.listdir(folder) for folder in (content_api, content_api_feb))

(['.DS_Store', 'content.json.gz', '21-02-19', 'labelled.csv.gz', '07-02-19'],
 ['content_json.csv.gz', 'content_api_links.csv.gz'])

In [42]:
# Load the gzipped "labelled" CSV from the Content API directory.
# NOTE(review): the saved output of this cell ends with a warning fragment,
# presumably pandas' mixed-type DtypeWarning -- confirm. low_memory=False makes
# pandas infer each column's dtype from the whole file instead of chunk by chunk.
labelled = pd.read_csv(
    os.path.join(content_api, "labelled.csv.gz"),
    compression="gzip",
    low_memory=False,
)

  interactivity=interactivity, compiler=compiler, result=result)


In [43]:
# Every distinct node that appears at either end of a filtered edge.
edges = set([*edges_fm.src_node, *edges_fm.dest_node])
# Ten sample members and the total count.
list(edges)[:10], len(edges)

(['/government/publications/department-for-environment-food-and-rural-affairs-single-departmental-plan/department-for-environment-food-and-rural-affairs-single-departmental-plan',
  '/calculate-employee-redundancy-pay/y/2019-03-31/33',
  '/government/publications?departments[]=department-for-exiting-the-european-union&page=2',
  '/state-pension-age/y/age/1962-12-21',
  '/student-finance-calculator/y/2018-2019/uk-full-time/9250.0/at-home/31000.0/no/none-of-the-above',
  '/government/publications/secure-by-design/code-of-practice-for-consumer-iot-security',
  '/calculate-your-child-maintenance/y/pay/1_child/none/430.0',
  '/state-pension-age/y/age/1952-08-15',
  '/calculate-your-holiday-entitlement/y/hours-worked-per-week/starting/2019-01-02/2018-04-01',
  '/calculate-your-child-maintenance/y/pay/1_child/none/935.5/0'],
 112341)

In [74]:
# Number of edge nodes with no matching `base_path` in the labelled data.
not_labelled = len(edges - set(labelled.base_path))

In [75]:
# Share of edge nodes absent from the labelled data, as a percentage to 3 d.p.
pct_not_labelled = round((not_labelled*100)/len(edges), 3)
"{}% of pages in edges from BQ are not in labelled dataset".format(pct_not_labelled)

'78.613% of pages in edges from BQ are not in labelled dataset'

The above figure makes sense because BQ urls are usually not stripped down

In [46]:
# Load the gzipped Content API extract from the "07-02-19" directory.
api_extract_path = os.path.join(content_api_feb, "content_json.csv.gz")
api_extract = pd.read_csv(api_extract_path, compression="gzip")

In [47]:
# First two rows of the Content API extract.
api_extract.iloc[:2]

Unnamed: 0,base_path,content_id,description,details,document_type,first_published_at,links,locale,publishing_app,redirects,rendering_app,title,url
0,/government/publications/guidance-for-dependan...,5ef7560d-7631-11e4-a3cb-005056011aef,Full guidance on the policy for applications b...,"{'body': '<div class=""govspeak""><p>This guidan...",guidance,2013-11-12T00:00:00.000+00:00,{'organisations': [{'analytics_identifier': 'O...,en,whitehall,,government-frontend,Guidance for dependants of UK visa applicants ...,/government/publications/guidance-for-dependan...
1,/visa-fees,1e333395-5dd5-4452-96a3-fbe939928761,Check how much your visa application costs in ...,{'introductory_paragraph': '<p>Use this tool t...,transaction,2014-01-22T14:39:37.000+00:00,{'mainstream_browse_pages': [{'api_path': '/ap...,en,publisher,,frontend,Visa fees,/visa-fees


In [48]:
# Edge nodes that do not match any `url` in the Content API extract.
missing = list(edges - set(api_extract.url))
len(missing)

33057

In [76]:
# Share of edge nodes absent from the Content API extract, as a percentage to 3 d.p.
pct_missing = round((len(missing)*100)/len(edges), 3)
"{}% of pages in edges from BQ are not in Content API extract dataset".format(pct_missing)

'29.092% of pages in edges from BQ are not in Content API extract dataset'

In [50]:
# First ten of the missing nodes.
missing[:10]

['/government/publications/department-for-environment-food-and-rural-affairs-single-departmental-plan/department-for-environment-food-and-rural-affairs-single-departmental-plan',
 '/search-register-planning-decisions/tameside',
 '/state-pension-age/y/age/1963-01-24/male',
 '/government/publications/visas-and-travel-requirements',
 '/calculate-your-child-maintenance/y/pay/1_child/none/935.5/0',
 '/government/publications/exporting-to-singapore',
 '/world/organisations/british-embassy-dakar/office/visa-section',
 '/print/foreign-travel-advice/france/print',
 '/pay-leave-for-parents/y/yes/2019-05-22/employee/self-employed/no/yes/yes',
 '/guidance/finish-setting-up-your-agent-services-account-if-you-are-overseas']

In [51]:
### Remove "print" path segments (e.g. /print/foo/bar/print -> /foo/bar) and de-duplicate
# Only whole segments are dropped. A plain str.replace("/print", "") would also
# cut into paths whose segment merely starts with "print"
# (e.g. "/print-a-form" would become "-a-form").
# NOTE(review): the cleaned paths are not re-checked against the Content API
# extract, so some of them may no longer be missing -- confirm whether that matters.
missing = list({
    "/".join(segment for segment in m.split("/") if segment != "print")
    for m in missing
})
len(missing)

32682

### Save out missing links and extract their text with a `content_api_extract` run

In [53]:
# One record per missing node, under a single "Node" column.
miss = pd.DataFrame([dict(Node=path) for path in missing])
# Write the frame (without its index) as a gzipped CSV next to the edge files.
missing_path = os.path.join(processed_network, "missing_feb_01_18_v2.csv.gz")
miss.to_csv(missing_path, compression="gzip", index=False)

In [63]:
# Missing nodes whose path contains "send-rent".
list(filter(lambda path: "send-rent" in path, missing))

['/send-rent-',
 '/send-rent-lease-details',
 '/send-rent',
 '/send-rent-release-details',
 '/send-rent-lease.details',
 '/send-rent-lease',
 '/send-rent-leases-details',
 '/send-rent-lease-detaisl',
 '/send-rentlease-details',
 '/send-rent-leasedetails',
 '/send-rent-lease-detals',
 '/send-rent-lease-deatails',
 '/send-rent-lease-deatils',
 '/send-rent-details',
 '/send-rent-lease-detail',
 '/send-rent-lease details',
 '/send-rent-lease-deails']

### Save out `edges_fm`

In [97]:
# Persist the filtered edges (without the index) as a gzipped CSV.
edges_fm_path = os.path.join(processed_network, "edges_graphsagetest_feb_01_18_doo_min15weight.csv.gz")
edges_fm.to_csv(edges_fm_path, compression="gzip", index=False)