In [9]:
import os
import pandas as pd
import gzip
import ijson
import itertools
import datetime
import json
import os
import re
import urllib.request
from collections import OrderedDict
from bs4 import BeautifulSoup
from lxml import html
from pandas.io.json import json_normalize

In [2]:
# Root of the local data store, supplied through the DATA_DIR environment variable.
DATA_DIR = os.getenv("DATA_DIR")
if DATA_DIR is None:
    # Fail with an explicit message instead of the opaque TypeError that
    # os.path.join raises when it is handed None.
    raise EnvironmentError("The DATA_DIR environment variable must be set before running this notebook.")

# Directory holding the content API extract, and the gzipped dump inside it.
content_api = os.path.join(DATA_DIR, "content_api")
content_file = os.path.join(content_api, "content.json.gz")

In [10]:
def get_links(url):
    """
    Collect site-relative links from a fragment of HTML.

    :param url: HTML markup to scan. Despite the parameter name, the value is
        parsed as markup by BeautifulSoup; nothing is fetched.
    :return: list of the ``href`` values of ``<a>`` tags that start with "/".
        An empty list is returned when the markup cannot be processed.
    """
    try:
        soup = BeautifulSoup(url, "html5lib")
        hrefs = [anchor.get('href') for anchor in soup.find_all('a', href=True)]
    except Exception as exc:
        # Best-effort extraction: input that cannot be parsed yields no links,
        # but report what went wrong so failures can be diagnosed.
        print("error: could not extract links ({}: {})".format(type(exc).__name__, exc))
        return []
    # Keep only links relative to the site root.
    return [href for href in hrefs if href.startswith("/")]


# Keys that nested_extract reads from a nested record.
look = [
    'title',
    'body',
]

# Keys that nested_extract iterates over for each entry of 'child_sections'.
child_keys = [
    'title',
    'description',
]

# Keys of a `details` mapping that get_text inspects; all other keys are ignored.
filtered = [
    'body',
    'brand',
    'documents',
    'final_outcome_detail',
    'final_outcome_documents',
    'government',
    'headers',
    'introduction',
    'introductory_paragraph',
    'licence_overview',
    'licence_short_description',
    'logo',
    'metadata',
    'more_information',
    'need_to_know',
    'other_ways_to_apply',
    'summary',
    'ways_to_respond',
    'what_you_need_to_know',
    'will_continue_on',
    'parts',
    'collection_groups',
]


def is_html(raw_text):
    """
    Report whether ``raw_text`` parses as HTML that contains nested markup.

    The value is converted with ``str()``, parsed with ``lxml.html`` and judged
    by whether the parsed root element has at least one descendant element.

    :param raw_text: any value; it is stringified before parsing.
    :return: True if the parsed root has a descendant element; False otherwise,
        including for blank text and for text lxml rejects as an empty document.
    """
    text = str(raw_text)
    if not text.strip():
        # lxml raises ParserError ("Document is empty") on blank input.
        return False

    from lxml import etree  # local import: only needed for the exception type

    try:
        root = html.fromstring(text)
    except etree.ParserError:
        # e.g. input holding nothing lxml can build a document from
        return False
    return root.find('.//*') is not None


def is_json(raw_text):
    """
    Report whether ``raw_text`` is a JSON-style record (or list of records).

    A plain structural check is used: building a DataFrame just to classify a
    value is costly per item, and values such as empty strings, ``None`` or
    numbers would raise instead of being classified.

    :param raw_text: any value taken from a parsed JSON document.
    :return: True for a dict, or for a list whose elements are all dicts (an
        empty list included); False for strings and every other type.
    """
    if isinstance(raw_text, dict):
        return True
    if isinstance(raw_text, list):
        return all(isinstance(record, dict) for record in raw_text)
    return False


def get_text(x):
    """
    Gather the distinct site-relative links embedded in a ``details`` mapping.

    Only keys listed in ``filtered`` are inspected:

    * a string value longer than one character is scanned with ``get_links``;
    * a list value is scanned element by element, using ``nested_extract`` for
      elements accepted by ``is_json`` and ``get_links`` for elements accepted
      by ``is_html``;
    * values of any other type are skipped.

    :param x: mapping (or iterable of key/value pairs) holding the details of
        one content item.
    :return: sorted list of unique links, so that repeated runs over the same
        input produce identical output.
    """
    links = set()
    for key, raw_text in dict(x).items():
        if key not in filtered:
            continue
        if isinstance(raw_text, str) and len(raw_text) > 1:
            links.update(get_links(raw_text))
        elif isinstance(raw_text, list):
            for sub_text in raw_text:
                if is_json(sub_text):
                    links.update(nested_extract(sub_text))
                elif is_html(sub_text):
                    links.update(get_links(sub_text))
    # A set has no stable iteration order across runs; sort for reproducibility.
    return sorted(links)


def nested_extract(x):
    """
    Pull site-relative links out of one nested record of a ``details`` list.

    Every key named in ``look`` that the record actually holds is scanned, and
    so are the ``child_keys`` fields of each entry under ``'child_sections'``.

    :param x: mapping (or iterable of key/value pairs) for one nested record.
    :return: list of links found by ``get_links``; duplicates are not removed.
    """
    record = dict(x)
    links = []

    # Check each key on its own: a record may carry only some of them, and
    # `('body' or 'title') in keys` would test 'body' alone.
    for item in look:
        value = record.get(item)
        if isinstance(value, str):
            links.extend(get_links(value))

    # Handled independently of the keys above so that a record holding both a
    # title and child sections still has its children scanned.
    for child in record.get('child_sections') or []:
        if not isinstance(child, dict):
            continue
        for key in child_keys:
            # Scan the field's value, not the name of the key.
            value = child.get(key)
            if isinstance(value, str):
                links.extend(get_links(value))
    return links

In [11]:
def extract_link_types(content_item):
    """
    Split the links of one content item into three groups.

    :param content_item: content item dict with 'details' and 'links' entries,
        or None.
    :return: tuple ``(links, related_links, coll_links)`` where ``links`` comes
        from ``get_text`` applied to the item's details, ``related_links`` holds
        the 'base_path' of each 'ordered_related_items' entry that has one, and
        ``coll_links`` holds the same for 'documents'. Three empty lists are
        returned when ``content_item`` is None.
    """
    if content_item is None:
        return [], [], []

    def base_paths(link_type):
        # 'base_path' of every entry filed under `link_type`, if that type exists.
        item_links = content_item['links']
        if link_type not in item_links.keys():
            return []
        return [entry['base_path'] for entry in item_links[link_type]
                if 'base_path' in entry.keys()]

    embedded = get_text(content_item['details'])
    related = base_paths('ordered_related_items')
    collection = base_paths('documents')
    return embedded, related, collection

In [None]:
# Stream the content dump one item at a time and write a reduced record per
# line (JSON lines, gzip-compressed) to `destination`.
destination = os.path.join(content_api, "content_reduced.json.gz")


def _clock():
    """Return the current wall-clock time formatted as HH:MM:SS."""
    return datetime.datetime.now().strftime("%H:%M:%S")


print("Start:", _clock())
with gzip.open(content_file, "rt") as reader, gzip.open(destination, 'wb') as writer:
    for i, content_item in enumerate(ijson.items(reader, prefix='item')):
        # Fields copied through unchanged, in the order they are emitted.
        row = {field: content_item[field]
               for field in ('base_path', 'content_id', 'title', 'description', 'details')}
        # NOTE(review): the 'embdedded_links' spelling is part of the emitted
        # records, so it is kept exactly as written.
        (row['embdedded_links'],
         row['related_links'],
         row['coll_links']) = extract_link_types(content_item)
        writer.write((json.dumps(row) + "\n").encode())
        # Progress marker every 10000 items.
        if i % 10000 == 0:
            print("i:", i, _clock())
print("End:", _clock())

Start: 21:24:42
i: 0 21:24:42
i: 10000 21:27:03
i: 20000 21:29:19
i: 30000 21:31:35
i: 40000 21:33:57
i: 50000 21:36:16
i: 60000 21:38:34


In [None]:
# Read the reduced file back as raw text, one string per line.
row_list = []
with gzip.open(destination, 'rt') as reader:
    row_list.extend(reader.readlines())

In [None]:
# Decode every JSON line into a dict and build one DataFrame row per record.
df1 = pd.DataFrame(list(map(json.loads, row_list)))

In [None]:
# Sanity check: (rows, columns) of the loaded frame.
df1.shape

In [None]:
# Preview the first two records.
df1.head(2)

In [None]:
# Write the frame as gzip-compressed CSV without the index column.
# NOTE(review): `destination` is the same path the JSON-lines records were
# written to ("content_reduced.json.gz"), so this call replaces that file with
# CSV content while keeping the ".json.gz" name. Re-running the cell that reads
# `destination` line by line with json.loads would then be parsing CSV rather
# than JSON. Confirm the overwrite is intended.
df1.to_csv(destination, compression="gzip", index=False)