In [67]:
import numpy as np
import pandas as pd
from lxml import etree

In [68]:
# Get data file locations

CONTENT_INPUT = '../../data/raw_content.json'
CONTENT_OUTPUT = '../../data/clean_content.csv'

In [69]:
#download the taxon data from content store for all links which are taxons
content = pd.read_json(
    CONTENT_INPUT, 
    orient='table', 
    typ='frame', 
    dtype=True, 
    convert_axes=True, 
    convert_dates=True, 
    keep_default_dates=True, 
    numpy=False, 
    precise_float=False, 
    date_unit=None
)


In [70]:
#content.head()

In [71]:
#content['details'][0]

In [72]:
#content['details'][50].get('body')

In [73]:
content = content.assign(body = [d.get('body') for d in content.details])

In [74]:
# Clean the html

def extract_text(body):
    
    r = None
    #if lxml.html.fromstring(html).find('.//*') is not None:
    # This is horribly hacky. Previously this was failing on single new line
    # characters.
    if body and body != '\n':
        tree = etree.HTML(body)
        r = tree.xpath('//text()')
        r = ' '.join(r)
        r = r.strip().replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
        r = r.replace('\n', ' ').replace(',', ' ')
        r = r.lower()
        r = ' '.join(r.split())
    if not r:
        r = ' '
    return r

In [76]:
content = content.assign(body = content['body'].apply(extract_text))
content = content.assign(description = content['description'].apply(extract_text))
content = content.assign(title = content['title'].apply(extract_text))

In [77]:
content['combined_text'] = content['title'] + ' ' + content['description'] + ' ' + content['body']

In [79]:
content['taxons'] = content['taxons'].where((pd.notnull(content['taxons'])), None)
#content['taxons'] = content['taxons'].fillna(value=None)

In [81]:
content.columns

Index(['base_path', 'content_id', 'description', 'details', 'document_type',
       'first_published_at', 'locale', 'primary_publishing_organisation',
       'publishing_app', 'taxons', 'title', 'body', 'combined_text'],
      dtype='object')

In [83]:
content_columns = content.drop(['taxons'], axis=1).columns.values

In [84]:
content_wide = pd.concat([content.drop('taxons', axis=1), content['taxons'].apply(pd.Series)], axis=1)

In [86]:
content_long = pd.melt(content_wide, id_vars=content_columns, value_name='taxon')

In [88]:
content_long.columns

Index(['base_path', 'content_id', 'description', 'details', 'document_type',
       'first_published_at', 'locale', 'primary_publishing_organisation',
       'publishing_app', 'title', 'body', 'combined_text', 'variable',
       'taxon'],
      dtype='object')

In [94]:
content_long['taxon'][4]

{'content_id': 'ffeb8bc6-2cd3-4a73-ab76-072b9a1f95fb',
 'title': 'Statutory leave and time off'}

In [122]:
content_wide[0:10]

Unnamed: 0,base_path,content_id,description,details,document_type,first_published_at,locale,primary_publishing_organisation,publishing_app,title,...,53,54,55,56,57,58,59,60,61,62
0,/government/organisations/companies-house,c36bd301-d0c5-4492-86ad-ee7843b8383b,,{'brand': 'department-for-business-innovation-...,organisation,2016-02-29T09:24:10.000+00:00,en,,whitehall,companies house,...,,,,,,,,,,
1,/government/organisations/hm-revenue-customs,6667cce2-e809-4e21-ae09-cb0bdc1ddda3,,"{'brand': 'hm-revenue-customs', 'logo': {'form...",organisation,2016-02-29T09:24:10.000+00:00,en,,whitehall,hm revenue & customs,...,,,,,,,,,,
2,/government/organisations/driver-and-vehicle-l...,70580624-93b5-4aed-823b-76042486c769,,"{'brand': 'department-for-transport', 'logo': ...",organisation,2016-02-29T09:24:10.000+00:00,en,,whitehall,driver and vehicle licensing agency,...,,,,,,,,,,
3,/government/publications,b13317e9-3753-47b2-95da-c173071e621d,find publications from across government inclu...,{},finder,2016-11-14T16:28:53.000+00:00,en,,whitehall,all publications,...,,,,,,,,,,
4,/bank-holidays,58f79dbd-e57f-4ab2-ae96-96df5767d1b2,,{},calendar,2016-02-29T09:24:10.000+00:00,en,,calendars,uk bank holidays,...,,,,,,,,,,
5,/check-uk-visa,dc1a1744-4089-43b3-b2e3-4e397b6b15b1,you may need a visa to come to the uk to visit...,{'introductory_paragraph': '<p>You may need a ...,transaction,2016-02-29T09:24:10.000+00:00,en,,smartanswers,check if you need a uk visa,...,,,,,,,,,,
6,/state-pension-age,5491c439-1c83-4044-80d3-32cc3613b739,work out your state pension age and pension cr...,{'introductory_paragraph': '<p>Your State Pens...,transaction,2016-02-29T09:24:10.000+00:00,en,,smartanswers,check your state pension age,...,,,,,,,,,,
7,/government/organisations/land-registry,5c54ae52-341b-499e-a6dd-67f04633b8cf,,{'brand': 'department-for-business-innovation-...,organisation,2016-02-29T09:24:10.000+00:00,en,,whitehall,hm land registry,...,,,,,,,,,,
8,/government/organisations/uk-visas-and-immigra...,04148522-b0c1-4137-b687-5f3c3bdd561a,,"{'brand': 'home-office', 'logo': {'formatted_t...",organisation,2016-02-29T09:24:10.000+00:00,en,,whitehall,uk visas and immigration,...,,,,,,,,,,
9,/government/announcements,88936763-df8a-441f-8b96-9ea0dc0758a1,find news articles speeches and statements fro...,{},finder,2016-11-14T16:28:53.000+00:00,en,,whitehall,government announcements,...,,,,,,,,,,


In [123]:
mask = content_long['taxon'].isnull()
content_long = content_long.drop([mask], axis=0)

TypeError: 'Series' objects are mutable, thus they cannot be hashed

In [109]:
content_long.shape

(12883815, 14)