In [1]:
import datetime

In [2]:
import os 
import pprint
import gzip
import ijson
from collections import OrderedDict
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize
import json
import requests
import urllib.request
from bs4 import BeautifulSoup
from lxml import html

## Directories

In [3]:
# Root data directory, taken from the DATA_DIR environment variable.
# os.getenv returns None when the variable is unset; the os.path.join calls
# below would then fail, so export DATA_DIR before starting the kernel.
DATA_DIR = os.getenv("DATA_DIR")

In [4]:
# Sub-folder of DATA_DIR that holds the processed network node/edge files.
processed_network = os.path.join(DATA_DIR, "processed_network")

In [5]:
# Show which files are available in the processed network folder.
os.listdir(processed_network)

['taxon_28_31_01_sample30_cs_nodes.csv.gz',
 'taxon_28_31_01_sample30_nodes.csv.gz',
 'taxon_28_31_01_sample30_cs_edges.csv.gz',
 'taxon_28_31_01_edges_doo.csv.gz',
 'taxon_28_31_01_nodes_doo.csv.gz',
 'taxon_28_31_01_sample30_edges.csv.gz',
 'for_networkx_tutorial_nodes.csv.gz',
 'for_networkx_tutorial_edges.csv.gz',
 'sampled_clean_nodes.csv.gz']

In [62]:
# Path of the gzipped node list that drives the crawl further down.
nodefile = os.path.join(processed_network, "sampled_clean_nodes.csv.gz")

In [63]:
# The file is parsed as tab-separated (sep="\t") even though its extension is .csv.
nodes = pd.read_csv(nodefile, compression="gzip", sep= "\t")

In [64]:
# Preview the first rows of the node list.
nodes.head()

Unnamed: 0,Node
0,/government/publications/guidance-for-dependan...
1,/visa-fees
2,/find-a-visa-application-centre
3,/entering-staying-uk/family-visas
4,/uk-family-visa


In [66]:
# (rows, columns) of the node list.
nodes.shape

(126424, 1)

### Link extraction function

In [67]:
def get_links(url):
    """Return the site-relative links found in a fragment of HTML.

    Despite its name, ``url`` is the HTML markup to parse (it is handed
    straight to BeautifulSoup); nothing is downloaded here.

    Parameters
    ----------
    url : str
        HTML text to scan for ``<a href="...">`` anchors.

    Returns
    -------
    list of str
        The ``href`` values that start with "/", in document order and with
        duplicates kept. Missing or empty input (``None`` / ``""``) yields
        an empty list.
    """
    # Nothing to parse: avoid handing None (or an empty value) to the parser.
    if not url:
        return []
    soup = BeautifulSoup(url, "html5lib")
    # href=True keeps only anchors that actually carry an href attribute,
    # so every collected value can safely be tested with startswith.
    hrefs = (anchor.get('href') for anchor in soup.find_all('a', href=True))
    return [href for href in hrefs if href.startswith("/")]

## Nested link extraction for in-text links

In [68]:
# Keys whose values nested_extract reads from a nested section.
look = ['title', 'body']

# Keys nested_extract reads from each entry of 'child_sections'.
child_keys = ['title', 'description']

# Keys of a `details` mapping that get_text scans for links; any key that is
# not listed here is skipped.
filtered = [
    'body',
    'brand',
    'documents',
    'final_outcome_detail',
    'final_outcome_documents',
    'government',
    'headers',
    'introduction',
    'introductory_paragraph',
    'licence_overview',
    'licence_short_description',
    'logo',
    'metadata',
    'more_information',
    'need_to_know',
    'other_ways_to_apply',
    'summary',
    'ways_to_respond',
    'what_you_need_to_know',
    'will_continue_on',
    'parts',
    'collection_groups',
]

def is_html(raw_text):
    """Tell whether ``raw_text`` parses into markup that contains nested elements.

    The value is coerced with ``str`` and parsed by ``lxml.html``; the answer
    is True when the parsed root has at least one descendant element.
    """
    parsed_root = html.fromstring(str(raw_text))
    first_descendant = parsed_root.find('.//*')
    return first_descendant is not None

def is_json(raw_text):
    """Report whether ``raw_text`` can be flattened by ``json_normalize``.

    get_text uses this to tell JSON-like entries (handled by nested_extract)
    apart from plain strings of markup.

    Parameters
    ----------
    raw_text : object
        Candidate value, typically a dict or a string.

    Returns
    -------
    bool
        True when ``json_normalize`` accepts the value, False when it rejects it.
    """
    try:
        json_normalize(raw_text).columns.tolist()
    except (AttributeError, NotImplementedError, TypeError):
        # AttributeError: raised for plain strings by the pandas versions that
        #   iterate the input looking for dict values.
        # NotImplementedError: raised by newer pandas for strings and scalars.
        # TypeError: non-iterable input such as None or a number.
        return False
    return True

def get_text(x):
    """Collect the distinct site-relative links embedded in a ``details`` mapping.

    Only keys listed in ``filtered`` are inspected. A string value is scanned
    directly with get_links. A list value is walked entry by entry: JSON-like
    entries go through nested_extract, markup entries through get_links.

    Parameters
    ----------
    x : mapping
        The mapping to scan (the notebook passes ``content_item['details']``).

    Returns
    -------
    list of str
        The de-duplicated links, sorted so that the result is the same on
        every run (a bare ``list(set(...))`` has an arbitrary order).
    """
    links = []
    # Round-trip through JSON so the rest of the function works on plain
    # JSON types held in OrderedDicts.
    string_json = json.dumps(OrderedDict(x))
    order_json = json.loads(string_json, object_pairs_hook=OrderedDict)
    for key in sorted(order_json):
        if key not in filtered:
            continue
        raw_text = order_json[key]
        if isinstance(raw_text, str) and len(raw_text) > 1:
            links.extend(get_links(raw_text))
        elif isinstance(raw_text, list) and len(raw_text) > 0:
            for sub_text in raw_text:
                if is_json(sub_text):
                    links.extend(nested_extract(sub_text))
                elif is_html(sub_text):
                    links.extend(get_links(sub_text))
    return sorted(set(links))


def _links_from_value(value):
    """Return get_links(value) for a non-empty string, otherwise an empty list."""
    if isinstance(value, str) and value:
        return get_links(value)
    return []


def nested_extract(x):
    """Extract site-relative links from one nested section of a ``details`` entry.

    Links are taken from the section's own ``look`` keys ('title', 'body')
    and from the ``child_keys`` ('title', 'description') of every entry in
    its 'child_sections' list, when those are present.

    Fixes over the previous version:
    * ``('body' or 'title') in keys`` only ever tested for 'body', and both
      keys were then indexed unconditionally, so a section with just one of
      them was either skipped or raised KeyError. Each key is now read only
      if it exists.
    * For child sections the key *name* was parsed instead of the child's
      value, so no link was ever found. ``child[key]`` is parsed now.
    * Child sections are processed even when the section also has a
      'title'/'body', so a titled group of child sections is not skipped.

    Parameters
    ----------
    x : mapping
        The nested section (anything ``OrderedDict`` accepts).

    Returns
    -------
    list of str
        Links in encounter order; duplicates are kept (get_text de-duplicates).
    """
    links = []
    # Same JSON round-trip as get_text: work on plain JSON types.
    string_json2 = json.dumps(OrderedDict(x))
    order_json2 = json.loads(string_json2, object_pairs_hook=OrderedDict)

    for item in look:
        links.extend(_links_from_value(order_json2.get(item)))

    for child in order_json2.get('child_sections') or []:
        if not isinstance(child, dict):
            continue
        for key in child_keys:
            links.extend(_links_from_value(child.get(key)))
    return links

## Extract links (in-text and related items) from `details` and `links` respectively

#### Apart from in-text and related links, there's the `documents` entry in `links` which contains the links to the contents of a collection page

In [78]:
# Fetch one content item from the GOV.UK Content API to inspect its top-level keys.
url = "https://www.gov.uk/api/content/entering-staying-uk/family-visas"
response = requests.get(url)
print(type(response))
# Fail loudly on an HTTP error instead of trying to decode an error page.
response.raise_for_status()
# Decode the body that was already downloaded rather than fetching the same
# url a second time through urllib (which also left its connection unclosed).
d = response.json()
print(d.keys())

<class 'requests.models.Response'>
dict_keys(['analytics_identifier', 'base_path', 'content_id', 'document_type', 'first_published_at', 'locale', 'phase', 'public_updated_at', 'publishing_app', 'publishing_scheduled_at', 'rendering_app', 'scheduled_publishing_delay_seconds', 'schema_name', 'title', 'updated_at', 'withdrawn_notice', 'publishing_request_id', 'links', 'description', 'details'])


In [None]:
# # nodes[nodes.Node=="/world/living-in-venezuela"]
# # 35911
# # Final index: 126423
# nodes.shape
# (35911*100)/126424 = 28.405207871923054% covered
# # nodes[nodes.Node=="/world/living-in-venezuela"]
# # start index for pt2: 35911, end (not including): 100000

In [171]:
# Node at position 35911, the value recorded in the notes above as the start index for pt2.
nodes.iloc[35911]

Node    /uk-benefits-abroad/y/already_abroad/child_ben...
Name: 35911, dtype: object

In [173]:
# NOTE(review): presumably the pt2 start index plus the last row label of json_df (34776),
# i.e. the position the interrupted crawl had reached — confirm.
35911+34776

70687

In [172]:
# Node at the combined position (70687), which is where the crawl loop's slice starts.
nodes.iloc[35911+34776]

Node    /calculate-your-holiday-entitlement/y/days-wor...
Name: 70687, dtype: object

In [161]:
def save_all_to_file(json_dict, page_links, related_page_links, collection_links, pre_fix):
    """Write the crawl results to two gzipped CSV files under DATA_DIR.

    ``<pre_fix>content_json.csv.gz`` holds one row per fetched content item
    and ``<pre_fix>content_api_links.csv.gz`` one row per page with its
    embedded, related and collection links plus their counts.

    Parameters
    ----------
    json_dict : dict
        Maps a requested path to the content item (a dict) fetched for it.
    page_links : dict
        Maps a page path to the list of links embedded in its text.
    related_page_links : dict
        Maps a page path to its list of related links.
    collection_links : dict
        Maps a page path to its list of collection ("documents") links.
    pre_fix : str
        Prefix prepended to both output file names.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The content-item frame and the links frame, as written to disk.
    """
    print("Number of pages for links:", len(page_links))
    print("Number of pages for json:", len(json_dict))

    # Copy each content item before tagging it with its url, so the dicts
    # held by the caller are not modified.
    rows_json = [dict(value, url=key) for key, value in json_dict.items()]

    dropped_columns = ['analytics_identifier','phase', 'public_updated_at',
                       'publishing_request_id', 'publishing_scheduled_at',
                       'scheduled_publishing_delay_seconds', 'schema_name',
                       'updated_at', 'withdrawn_notice']
    # errors="ignore": a batch in which none of the items carries one of these
    # fields (or an empty batch) must not abort the save.
    json_df = pd.DataFrame(rows_json).drop(dropped_columns, axis=1, errors="ignore")

    json_df.to_csv(os.path.join(DATA_DIR,pre_fix+"content_json.csv.gz"), compression="gzip", index=False)

    link_columns = ['url', 'embedded_links', 'related_links','collection_links']
    # .get(..., []) tolerates a page that is missing from one of the link maps.
    rows_links = [{"url":key,
                   "embedded_links":value,
                   "related_links":related_page_links.get(key, []),
                   "collection_links":collection_links.get(key, [])} for key,value in page_links.items()]
    # Passing columns= fixes the column order and keeps the frame well-formed
    # even when there are no rows.
    df_rel = pd.DataFrame(rows_links, columns=link_columns)

    df_rel['num_rel'] = df_rel['related_links'].map(len)
    df_rel['num_emb'] = df_rel['embedded_links'].map(len)
    df_rel['num_coll'] = df_rel['collection_links'].map(len)

    df_rel.to_csv(os.path.join(DATA_DIR,pre_fix+"content_api_links.csv.gz"), index=False, compression="gzip")

    return json_df, df_rel

In [164]:
# Results of this crawl run.
page_links = {}          # base_path -> links embedded in the page text
related_page_links = {}  # base_path -> 'ordered_related_items' links
collection_links = {}    # base_path -> 'documents' links
not_found= []            # urls that could not be fetched or decoded
json_dict = {}           # requested node path -> full content item

# Positions in `nodes` covered by this run (end is exclusive); change these
# to resume the crawl from another position.
# 35911 100000
START_INDEX = 70687
END_INDEX = 100000

for i,tup in enumerate(nodes[START_INDEX:END_INDEX].itertuples()):
    content_item = None
    # Build the url before the try block so that a failure is always recorded
    # against this node, never against the url of a previous iteration.
    url = f"https://www.gov.uk/api/content{tup.Node}"
    try:
        # The with-block closes the connection once the body has been read.
        with urllib.request.urlopen(url) as api_response:
            content_item = json.loads(api_response.read())
        content_item['url'] = tup.Node
        json_dict[tup.Node] = content_item

    except Exception:
        # Best effort: remember the url and move on to the next node. Resetting
        # content_item stops a half-processed item from being used below.
        content_item = None
        not_found.append(url)

    if content_item is not None:
        # A content item without 'details' or 'links' must not abort a
        # multi-hour crawl; treat the missing part as empty.
        item_links = content_item.get('links') or {}
        links = get_text(content_item.get('details') or {})

        related_links = [related_item['base_path'] for related_item in item_links.get('ordered_related_items', [])
                             if 'base_path' in related_item.keys()]

        coll_links = [document['base_path'] for document in item_links.get('documents', [])
                             if 'base_path' in document.keys()]

        # Several node paths can resolve to the same base_path, so these three
        # dicts can end up with fewer entries than json_dict.
        base_path = content_item.get('base_path', tup.Node)
        related_page_links[base_path] = related_links
        collection_links[base_path] = coll_links
        page_links[base_path] = links

    if i%10000==0:
        print(datetime.datetime.now().strftime("%H:%M:%S"),i)

18:44:58 0
19:33:52 10000
20:35:08 20000
21:39:27 30000


KeyboardInterrupt: 

In [165]:
# Print how many entries each crawl result holds, one count per line.
for collected in (json_dict, page_links, related_page_links, collection_links):
    print(len(collected))

34777
14650
14650
14650


## Save to file

In [166]:
# Persist this run; "pt2" is prepended to both output file names.
json_df, df_rel = save_all_to_file(json_dict, page_links, related_page_links, collection_links, "pt2")

Number of pages for links: 14650
Number of pages for json: 34777


In [168]:
# Last row of the saved content-item frame.
json_df.iloc[-1]

base_path                           /calculate-your-child-maintenance/y
content_id                         064f2c7e-e838-4e45-8ab0-95613eb9d409
description                                                        None
details               {'external_related_links': [{'title': 'Child M...
document_type                                              smart_answer
first_published_at                        2017-07-11T13:56:53.000+00:00
links                 {'available_translations': [{'title': 'Child m...
locale                                                               en
publishing_app                                             smartanswers
redirects                                                           NaN
rendering_app                                              smartanswers
title                                      Child maintenance calculator
url                   /calculate-your-child-maintenance/y/receive/1_...
Name: 34776, dtype: object

In [169]:
# Last row of the saved links frame.
df_rel.iloc[-1] 

url                 /hmrc-internal-manuals/capital-gains-manual/cg...
embedded_links                                                     []
related_links                                                      []
collection_links                                                   []
num_rel                                                             0
num_emb                                                             0
num_coll                                                            0
Name: 14649, dtype: object

In [170]:
# Find the node row whose path equals the base_path of the last content item shown above.
nodes[nodes.Node=="/calculate-your-child-maintenance/y"]

Unnamed: 0,Node
772,/calculate-your-child-maintenance/y


In [None]:
# NOTE(review): same lookup as the previous cell, left unexecuted — candidate for removal.
nodes[nodes.Node=="/calculate-your-child-maintenance/y"]

In [10]:
# Reload a previously saved content file and check its dimensions.
file_test = "pt1of5_content_json.csv.gz"
# No compression argument is given; pandas infers gzip from the file extension by default.
df_test = pd.read_csv(os.path.join(DATA_DIR, file_test))
df_test.shape

(19167, 13)

In [11]:
# Preview the first rows of the reloaded file.
df_test.head()

Unnamed: 0,base_path,content_id,description,details,document_type,first_published_at,links,locale,publishing_app,redirects,rendering_app,title,url
0,/government/publications/contracts-for-differe...,45cc1112-1291-458e-85fa-4929aaf90761,The draft Allocation Framework sets out the ru...,"{'body': '<div class=""govspeak""><p>This is the...",guidance,2019-01-21T13:56:55.000+00:00,{'document_collections': [{'api_path': '/api/c...,en,whitehall,,government-frontend,Contracts for Difference: Allocation Framework...,/government/publications/contracts-for-differe...
1,/am-i-getting-minimum-wage/y,43cc9c0c-4210-4643-b045-53a388bbc36f,,{'external_related_links': []},smart_answer,2017-07-11T13:56:52.000+00:00,{'available_translations': [{'title': 'Nationa...,en,smartanswers,,smartanswers,National Minimum Wage and Living Wage calculat...,/am-i-getting-minimum-wage/y/current_payment/n...
2,/am-i-getting-minimum-wage/y,43cc9c0c-4210-4643-b045-53a388bbc36f,,{'external_related_links': []},smart_answer,2017-07-11T13:56:52.000+00:00,{'available_translations': [{'title': 'Nationa...,en,smartanswers,,smartanswers,National Minimum Wage and Living Wage calculat...,/am-i-getting-minimum-wage/y/current_payment/n...
3,/am-i-getting-minimum-wage/y,43cc9c0c-4210-4643-b045-53a388bbc36f,,{'external_related_links': []},smart_answer,2017-07-11T13:56:52.000+00:00,{'available_translations': [{'title': 'Nationa...,en,smartanswers,,smartanswers,National Minimum Wage and Living Wage calculat...,/am-i-getting-minimum-wage/y/current_payment/n...
4,/state-pension-age/y,cdca2cab-da55-4184-abb3-af27764dd756,,{'external_related_links': []},smart_answer,2017-07-11T13:56:43.000+00:00,{'available_translations': [{'title': 'Check y...,en,smartanswers,,smartanswers,Check your State Pension age,/state-pension-age/y/age/1954-05-16/female


In [13]:
# Scratch expression: a comprehension over an empty list, which evaluates to [].
[i for i in []]

[]