In [1]:
import os 
import pprint
import gzip
import ijson
from collections import OrderedDict
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize
import json
import requests
import urllib.request
from bs4 import BeautifulSoup
from lxml import html

## Directories

In [18]:
# Processed network files live in the "processed_network" folder under $DATA_DIR.
processed_network = os.path.join(os.environ.get("DATA_DIR"), "processed_network")

In [19]:
# Show which files are present in the processed-network directory.
os.listdir(processed_network)

['taxon_28_31_01_sample30_nodes.csv.gz',
 'taxon_28_31_01_edges_doo.csv.gz',
 'taxon_28_31_01_nodes_doo.csv.gz',
 'taxon_28_31_01_sample30_edges.csv.gz',
 'for_networkx_tutorial_nodes.csv.gz',
 'for_networkx_tutorial_edges.csv.gz']

In [62]:
# Path of the gzipped node list to load.
# NOTE(review): this file name does not appear in the directory listing shown
# earlier in the notebook -- confirm the file exists (the listing may be stale).
nodefile = os.path.join(processed_network, "sampled_clean_nodes.csv.gz")

In [63]:
# The node list is read as a gzip-compressed, tab-separated file.
nodes = pd.read_csv(nodefile, sep="\t", compression="gzip")

In [64]:
# Preview the first rows of the node table.
nodes.head()

Unnamed: 0,Node
0,/government/publications/guidance-for-dependan...
1,/visa-fees
2,/find-a-visa-application-centre
3,/entering-staying-uk/family-visas
4,/uk-family-visa


In [66]:
# (rows, columns) of the node table.
nodes.shape

(126424, 1)

### Link extraction function

In [67]:
def get_links(url):
    """Return the site-relative links found in a piece of HTML markup.

    Despite its name, ``url`` is the HTML text to parse, not an address to
    fetch: it is handed straight to BeautifulSoup.

    Parameters
    ----------
    url : str
        HTML markup to scan for ``<a>`` tags.

    Returns
    -------
    list of str
        The ``href`` value of every ``<a>`` tag whose href starts with "/",
        in document order. Duplicates are kept.
    """
    soup = BeautifulSoup(url, "html5lib")
    # `find_all` is the supported method name; `findAll` is a deprecated alias.
    # href=True keeps only anchors that carry an href attribute, so the
    # startswith() check below never receives None.
    hrefs = (anchor.get('href') for anchor in soup.find_all('a', href=True))
    return [href for href in hrefs if href.startswith("/")]

## Nested link extraction for in-text links

In [68]:
# Keys whose values nested_extract scans for links in a nested section.
look = ['title', 'body']

# Keys that nested_extract iterates over for each entry under 'child_sections'.
child_keys = ['title', 'description']

# Keys that get_text scans for links; all other keys are skipped.
filtered = [
    'body',
    'brand',
    'documents',
    'final_outcome_detail',
    'final_outcome_documents',
    'government',
    'headers',
    'introduction',
    'introductory_paragraph',
    'licence_overview',
    'licence_short_description',
    'logo',
    'metadata',
    'more_information',
    'need_to_know',
    'other_ways_to_apply',
    'summary',
    'ways_to_respond',
    'what_you_need_to_know',
    'will_continue_on',
    'parts',
    'collection_groups',
]

def is_html(raw_text):
    """Tell whether ``raw_text`` parses to markup containing nested elements.

    Parameters
    ----------
    raw_text : object
        Value to test; it is converted with ``str()`` before parsing.

    Returns
    -------
    bool
        True when the element returned by ``lxml.html.fromstring`` has at
        least one descendant element, False otherwise (including for empty
        or whitespace-only text).
    """
    text = str(raw_text)
    # lxml rejects empty documents with a ParserError; blank text cannot
    # contain markup anyway, so answer that case without parsing.
    if not text.strip():
        return False
    return html.fromstring(text).find('.//*') is not None

def is_json(raw_text):
    """Tell whether ``raw_text`` is something ``json_normalize`` can flatten.

    Parameters
    ----------
    raw_text : object
        Candidate value, typically one element of a list inside a content
        item (a dict for a nested section, a str for an HTML snippet).

    Returns
    -------
    bool
        True when ``json_normalize`` accepts the value, False when it
        rejects it.
    """
    try:
        json_normalize(raw_text).columns.tolist()
    except (AttributeError, NotImplementedError, TypeError):
        # Depending on the pandas version, unsupported inputs such as a plain
        # string are rejected with AttributeError (older releases) or
        # NotImplementedError (newer releases); non-iterables can raise
        # TypeError. All of them mean "not a JSON-like record".
        return False
    return True

def get_text(x):
    """Collect the distinct internal links found in selected fields of ``x``.

    Only keys listed in ``filtered`` are inspected. A string value longer than
    one character is scanned directly with ``get_links``; a non-empty list is
    scanned element by element, using ``nested_extract`` for JSON-like
    elements and ``get_links`` for HTML elements.

    Parameters
    ----------
    x : mapping
        Anything accepted by ``OrderedDict(x)`` whose content is JSON
        serialisable.

    Returns
    -------
    list of str
        The de-duplicated links, in no particular order.
    """
    # Round-trip through JSON so nested objects come back as OrderedDicts.
    normalised = json.loads(json.dumps(OrderedDict(x)), object_pairs_hook=OrderedDict)

    found = []
    for field in sorted(normalised):
        if field not in filtered:
            continue
        value = normalised[field]
        if isinstance(value, str) and len(value) > 1:
            found += get_links(value)
        elif isinstance(value, list) and value:
            for entry in value:
                if is_json(entry):
                    found += nested_extract(entry)
                elif is_html(entry):
                    found += get_links(entry)
    return list(set(found))


def nested_extract(x):
    """Extract internal links from one nested JSON section.

    Links are taken from the section's own ``look`` fields ('title', 'body')
    when present, and from the ``child_keys`` fields of every entry under
    'child_sections' when present.

    Parameters
    ----------
    x : mapping
        Anything accepted by ``OrderedDict(x)`` whose content is JSON
        serialisable.

    Returns
    -------
    list of str
        Links found, duplicates included; empty when nothing matches.
    """
    section = json.loads(json.dumps(OrderedDict(x)), object_pairs_hook=OrderedDict)
    links = []

    # The previous condition `('body' or 'title') in keys` only ever tested
    # 'body', and then indexed both keys unconditionally (KeyError when one
    # was missing). Each key is now looked up independently.
    for item in look:
        value = section.get(item)
        if isinstance(value, str) and value:
            links.extend(get_links(value))

    # Checked independently of the fields above, so a section that has both a
    # title and child sections still has its children scanned.
    for child in section.get('child_sections') or []:
        if not isinstance(child, dict):
            continue
        for key in child_keys:
            # Parse the child's value; parsing the key name itself (as before)
            # could never yield a link.
            value = child.get(key)
            if isinstance(value, str) and value:
                links.extend(get_links(value))
    return links

## Extract links (in-text and related items) from `details` and `links` respectively

#### Apart from in-text and related links, there's the `documents` entry in `links` which contains the links to the contents of a collection page

In [78]:
# Fetch one content item from the GOV.UK content API and show its top-level keys.
url = "https://www.gov.uk/api/content/entering-staying-uk/family-visas"
# Alternative example: https://www.gov.uk/api/content/find-a-visa-application-centre
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of trying to decode an error page.
response.raise_for_status()
print(type(response))
# Decode the body that was already downloaded rather than requesting the same
# URL a second time through urllib (which also left the connection unclosed).
d = response.json()
print(d.keys())

<class 'requests.models.Response'>
dict_keys(['analytics_identifier', 'base_path', 'content_id', 'document_type', 'first_published_at', 'locale', 'phase', 'public_updated_at', 'publishing_app', 'publishing_scheduled_at', 'rendering_app', 'scheduled_publishing_delay_seconds', 'schema_name', 'title', 'updated_at', 'withdrawn_notice', 'publishing_request_id', 'links', 'description', 'details'])


In [87]:
import datetime

In [None]:
# Crawl the content API for every node and record three kinds of outgoing links.
page_links = {}          # base_path -> in-text links extracted from `details`
related_page_links = {}  # base_path -> base paths listed in `ordered_related_items`
collection_links = {}    # base_path -> base paths listed in `documents`
not_found = []           # API URLs that could not be fetched or decoded
json_dict = {}           # node path -> raw content item returned by the API

for i, tup in enumerate(nodes.itertuples()):
    # Build the URL outside the try block so the failure log always refers to
    # the node currently being processed.
    url = "https://www.gov.uk/api/content" + str(tup.Node)
    content_item = None
    try:
        # The context manager closes the HTTP response even when decoding fails.
        with urllib.request.urlopen(url) as api_response:
            content_item = json.loads(api_response.read())
    except Exception:
        # Deliberately best effort: one unreachable or malformed page must not
        # abort a crawl over the whole node list. Only the fetch/decode step is
        # guarded, so programming errors elsewhere in the loop still surface.
        not_found.append(url)

    if content_item is not None:
        # Previously keyed by the undefined name `node`; the resulting NameError
        # was swallowed, so nothing was stored and every URL was logged as
        # not found.
        json_dict[tup.Node] = content_item

        base_path = content_item.get('base_path', tup.Node)
        item_links = content_item.get('links') or {}

        links = get_text(content_item.get('details') or {})
        related_links = [related_item['base_path']
                         for related_item in item_links.get('ordered_related_items', [])
                         if 'base_path' in related_item.keys()]
        coll_links = [document['base_path']
                      for document in item_links.get('documents', [])
                      if 'base_path' in document.keys()]

        related_page_links[base_path] = related_links
        collection_links[base_path] = coll_links
        page_links[base_path] = links

    # Progress marker every 10,000 nodes.
    if i % 10000 == 0:
        print(datetime.datetime.now().strftime("%H:%M:%S"), i)

13:32:59 0


In [None]:
# Number of content items currently held in json_dict.
len(json_dict)