In [1]:
import numpy as np
import pandas as pd
from lxml import etree

In [2]:
# Data file locations, relative to the notebook's working directory.
# CONTENT_INPUT is the raw JSON export that gets loaded into `content`.
# CONTENT_OUTPUT is presumably where the cleaned result is written -- no cell
# in this part of the notebook writes it, TODO confirm.

CONTENT_INPUT = '../../data/raw_content.json'
CONTENT_OUTPUT = '../../data/clean_content.csv'

In [3]:
# Load the raw content items from the local JSON export (orient='table',
# i.e. the file carries its own table schema).
#
# Only `orient` is passed explicitly. Every other argument that used to be
# spelled out here was a pandas default, and several of them are no longer
# accepted: `dtype=True` / `convert_axes=True` raise ValueError when combined
# with orient='table', and the `numpy` argument has been removed from
# `read_json` in current pandas releases.
content = pd.read_json(CONTENT_INPUT, orient='table')


In [4]:
#content.head()

In [5]:
#content['details'][0]

In [6]:
#content['details'][50].get('body')

In [7]:
# Lift the 'body' entry out of each item's `details` dict into its own column
# (None where the dict has no 'body' key).
content = content.assign(
    body=[item_details.get('body') for item_details in content['details']]
)

In [8]:
# Clean the html

def extract_text(body):
    """Strip markup from an HTML fragment and return normalised plain text.

    The text nodes of the parsed fragment are joined with spaces, commas are
    replaced by spaces, the result is lower-cased and every run of whitespace
    is collapsed to a single space.

    Parameters
    ----------
    body : str, bytes or None
        HTML (or plain text) to clean. Anything that is not text -- None,
        NaN, numbers -- is treated as having no content.

    Returns
    -------
    str
        The cleaned text, or a single space ``' '`` when there is nothing to
        extract. The function never returns an empty string or None.
    """
    # Skip the parser for missing and whitespace-only input. This covers the
    # lone '\n' that used to be special-cased, and also the ' ' placeholder
    # this function itself returns, so applying it twice to a column is safe.
    if not isinstance(body, (str, bytes)) or not body.strip():
        return ' '

    tree = etree.HTML(body)
    # Guard against the parser handing back no root element at all.
    if tree is None:
        return ' '

    text = ' '.join(tree.xpath('//text()'))
    text = text.replace(',', ' ').lower()
    # split() with no argument splits on any whitespace run (newlines, tabs,
    # carriage returns, ...) and drops leading/trailing whitespace, so no
    # separate strip()/replace() passes are needed.
    text = ' '.join(text.split())
    return text or ' '

In [9]:
# Run the same HTML-to-text clean-up over each free-text column, replacing
# the column in place within the frame's existing column order.
content = content.assign(**{
    text_column: content[text_column].apply(extract_text)
    for text_column in ('body', 'description', 'title')
})

In [10]:
# One space-separated text field per item: title, then description, then body.
content['combined_text'] = (
    content['title']
    + ' ' + content['description']
    + ' ' + content['body']
)

In [11]:
# Replace missing `taxons` entries with None, keeping every non-null value.
# `fillna(value=None)` is not an option here: pandas reads value=None as
# "no fill value supplied" and raises.
content['taxons'] = content['taxons'].where(content['taxons'].notnull(), None)

In [12]:
# Columns present after cleaning; `taxons` is still a single column here.
content.columns

Index(['base_path', 'content_id', 'description', 'details', 'document_type',
       'first_published_at', 'locale', 'primary_publishing_organisation',
       'publishing_app', 'taxons', 'title', 'body', 'combined_text'],
      dtype='object')

In [13]:
# Names of every column except `taxons`; used as the id columns when melting.
content_columns = content.columns.drop('taxons').values

In [14]:
# Spread each item's `taxons` value across numbered columns (0, 1, 2, ...)
# and put them next to the remaining content columns.
content_wide = pd.concat(
    [
        content.drop(columns='taxons'),
        content['taxons'].apply(pd.Series),
    ],
    axis='columns',
)

In [15]:
# Wide -> long: one row per (content item, taxon column). The former column
# label ends up in `variable`, the cell value in `taxon`.
content_long = content_wide.melt(
    id_vars=content_columns,
    value_name='taxon',
)

In [16]:
# Check the melt result: the content columns plus `variable` and `taxon`.
content_long.columns

Index(['base_path', 'content_id', 'description', 'details', 'document_type',
       'first_published_at', 'locale', 'primary_publishing_organisation',
       'publishing_app', 'title', 'body', 'combined_text', 'variable',
       'taxon'],
      dtype='object')

In [17]:
# Peek at a single taxon entry (row label 4) to see its structure.
content_long.loc[4, 'taxon']

{'content_id': 'ffeb8bc6-2cd3-4a73-ab76-072b9a1f95fb',
 'title': 'Statutory leave and time off'}

In [18]:
# First ten rows of the wide frame, including the numbered taxon columns.
content_wide.head(10)

Unnamed: 0,base_path,content_id,description,details,document_type,first_published_at,locale,primary_publishing_organisation,publishing_app,title,...,53,54,55,56,57,58,59,60,61,62
0,/government/organisations/companies-house,c36bd301-d0c5-4492-86ad-ee7843b8383b,,{'brand': 'department-for-business-innovation-...,organisation,2016-02-29T09:24:10.000+00:00,en,,whitehall,companies house,...,,,,,,,,,,
1,/government/organisations/hm-revenue-customs,6667cce2-e809-4e21-ae09-cb0bdc1ddda3,,"{'brand': 'hm-revenue-customs', 'logo': {'form...",organisation,2016-02-29T09:24:10.000+00:00,en,,whitehall,hm revenue & customs,...,,,,,,,,,,
2,/government/organisations/driver-and-vehicle-l...,70580624-93b5-4aed-823b-76042486c769,,"{'brand': 'department-for-transport', 'logo': ...",organisation,2016-02-29T09:24:10.000+00:00,en,,whitehall,driver and vehicle licensing agency,...,,,,,,,,,,
3,/government/publications,b13317e9-3753-47b2-95da-c173071e621d,find publications from across government inclu...,{},finder,2016-11-14T16:28:53.000+00:00,en,,whitehall,all publications,...,,,,,,,,,,
4,/bank-holidays,58f79dbd-e57f-4ab2-ae96-96df5767d1b2,,{},calendar,2016-02-29T09:24:10.000+00:00,en,,calendars,uk bank holidays,...,,,,,,,,,,
5,/check-uk-visa,dc1a1744-4089-43b3-b2e3-4e397b6b15b1,you may need a visa to come to the uk to visit...,"{'more_information': ' ', 'transaction_start_l...",transaction,2016-02-29T09:24:10.000+00:00,en,,smartanswers,check if you need a uk visa,...,,,,,,,,,,
6,/state-pension-age,5491c439-1c83-4044-80d3-32cc3613b739,work out your state pension age and pension cr...,"{'more_information': ' ', 'transaction_start_l...",transaction,2016-02-29T09:24:10.000+00:00,en,,smartanswers,check your state pension age,...,,,,,,,,,,
7,/government/organisations/land-registry,5c54ae52-341b-499e-a6dd-67f04633b8cf,,{'brand': 'department-for-business-innovation-...,organisation,2016-02-29T09:24:10.000+00:00,en,,whitehall,hm land registry,...,,,,,,,,,,
8,/government/organisations/uk-visas-and-immigra...,04148522-b0c1-4137-b687-5f3c3bdd561a,,"{'brand': 'home-office', 'logo': {'formatted_t...",organisation,2016-02-29T09:24:10.000+00:00,en,,whitehall,uk visas and immigration,...,,,,,,,,,,
9,/government/announcements,88936763-df8a-441f-8b96-9ea0dc0758a1,find news articles speeches and statements fro...,{},finder,2016-11-14T16:28:53.000+00:00,en,,whitehall,government announcements,...,,,,,,,,,,


In [19]:
# Discard the rows where the melted taxon slot is empty.
mask = content_long['taxon'].isnull()
content_long = content_long.loc[~mask]

In [20]:
# Pull the taxon's content_id out of each taxon dict into its own column.
content_long = content_long.assign(
    taxon_id=content_long['taxon'].map(lambda taxon: taxon['content_id'])
)

In [21]:
# The taxon dict is no longer needed once taxon_id has been extracted.
content_long = content_long.drop(columns=['taxon'])

In [22]:
# Number of (content item, taxon) rows.
len(content_long)

336950

In [23]:
# Preview the long frame: one row per content item / taxon_id pair.
content_long.head()

Unnamed: 0,base_path,content_id,description,details,document_type,first_published_at,locale,primary_publishing_organisation,publishing_app,title,body,combined_text,variable,taxon_id
4,/bank-holidays,58f79dbd-e57f-4ab2-ae96-96df5767d1b2,,{},calendar,2016-02-29T09:24:10.000+00:00,en,,calendars,uk bank holidays,,uk bank holidays,0,ffeb8bc6-2cd3-4a73-ab76-072b9a1f95fb
5,/check-uk-visa,dc1a1744-4089-43b3-b2e3-4e397b6b15b1,you may need a visa to come to the uk to visit...,"{'more_information': ' ', 'transaction_start_l...",transaction,2016-02-29T09:24:10.000+00:00,en,,smartanswers,check if you need a uk visa,,check if you need a uk visa you may need a vis...,0,d0e61780-6962-40aa-bb57-298f35187e4f
6,/state-pension-age,5491c439-1c83-4044-80d3-32cc3613b739,work out your state pension age and pension cr...,"{'more_information': ' ', 'transaction_start_l...",transaction,2016-02-29T09:24:10.000+00:00,en,,smartanswers,check your state pension age,,check your state pension age work out your sta...,0,a81a7a45-1f94-4122-9547-8b485e0849cf
10,/calculate-your-holiday-entitlement,deedf6f8-389b-4b34-a5b1-faa9ef909a70,holiday calculator to work out statutory holid...,"{'more_information': ' ', 'transaction_start_l...",transaction,2016-02-29T09:24:10.000+00:00,en,,smartanswers,calculate holiday entitlement,,calculate holiday entitlement holiday calculat...,0,ffeb8bc6-2cd3-4a73-ab76-072b9a1f95fb
12,/calculate-your-child-maintenance,42c2e944-7977-4297-b142-aa9406756dd2,work out the amount of child maintenance if yo...,{'more_information': '<p>You need information ...,transaction,2016-02-29T09:24:10.000+00:00,en,,smartanswers,child maintenance calculator,,child maintenance calculator work out the amou...,0,902af4ff-4a3b-4860-932a-f7d9a47c337e


In [24]:
# Load the cleaned taxon table. The path is held in a named constant,
# matching how the other data file locations in this notebook are declared.
TAXONS_INPUT = '../../data/clean_taxons.csv'
taxons = pd.read_csv(TAXONS_INPUT)


In [27]:
# Keep the taxon's identifiers, its name and its level columns.
taxons = taxons.loc[:, [
    'base_path',
    'content_id',
    'taxon_name',
    'level1taxon',
    'level2taxon',
    'level3taxon',
    'level4taxon',
]]

In [30]:
# Attach the taxon details to every content row. The outer join together with
# the `_merge` indicator column keeps unmatched rows from both sides visible.
content_taxons = content_long.merge(
    taxons,
    how='outer',
    left_on='taxon_id',
    right_on='content_id',
    indicator=True,
)

In [31]:
# Preview the merged frame. Columns present on both sides (base_path,
# content_id) carry the _x (content) and _y (taxon) suffixes.
content_taxons.head()

Unnamed: 0,base_path_x,content_id_x,description,details,document_type,first_published_at,locale,primary_publishing_organisation,publishing_app,title,...,variable,taxon_id,base_path_y,content_id_y,taxon_name,level1taxon,level2taxon,level3taxon,level4taxon,_merge
0,/bank-holidays,58f79dbd-e57f-4ab2-ae96-96df5767d1b2,,{},calendar,2016-02-29T09:24:10.000+00:00,en,,calendars,uk bank holidays,...,0,ffeb8bc6-2cd3-4a73-ab76-072b9a1f95fb,/business/time-off,ffeb8bc6-2cd3-4a73-ab76-072b9a1f95fb,Statutory leave and time off,Business,Employing people,,,both
1,/calculate-your-holiday-entitlement,deedf6f8-389b-4b34-a5b1-faa9ef909a70,holiday calculator to work out statutory holid...,"{'more_information': ' ', 'transaction_start_l...",transaction,2016-02-29T09:24:10.000+00:00,en,,smartanswers,calculate holiday entitlement,...,0,ffeb8bc6-2cd3-4a73-ab76-072b9a1f95fb,/business/time-off,ffeb8bc6-2cd3-4a73-ab76-072b9a1f95fb,Statutory leave and time off,Business,Employing people,,,both
2,/maternity-paternity-calculator,05d5412d-455b-485e-a570-020c9176a46e,calculate an employee’s maternity pay (smp) pa...,{'more_information': '<p>You need:</p> <ul>  ...,transaction,2016-02-29T09:24:10.000+00:00,en,,smartanswers,maternity adoption and paternity calculator fo...,...,0,ffeb8bc6-2cd3-4a73-ab76-072b9a1f95fb,/business/time-off,ffeb8bc6-2cd3-4a73-ab76-072b9a1f95fb,Statutory leave and time off,Business,Employing people,,,both
3,/calculate-statutory-sick-pay,1c676a9e-0424-4ebb-bab8-d8cb8d2fc6f8,statutory sick pay (ssp) calculator - calculat...,"{'more_information': ' ', 'transaction_start_l...",transaction,2016-02-29T09:24:10.000+00:00,en,,smartanswers,calculate your employee's statutory sick pay,...,0,ffeb8bc6-2cd3-4a73-ab76-072b9a1f95fb,/business/time-off,ffeb8bc6-2cd3-4a73-ab76-072b9a1f95fb,Statutory leave and time off,Business,Employing people,,,both
4,/training-study-work-your-rights,f50efa51-dc62-4111-aa59-731af77806aa,your rights to request time off for training o...,{'parts': [{'body': '<p>Staff may have the rig...,guide,2016-02-29T09:24:10.000+00:00,en,,publisher,training and study at work: your rights,...,0,ffeb8bc6-2cd3-4a73-ab76-072b9a1f95fb,/business/time-off,ffeb8bc6-2cd3-4a73-ab76-072b9a1f95fb,Statutory leave and time off,Business,Employing people,,,both


In [32]:
# Shape after the merge: the 14 content_long columns, the 7 taxon columns
# and the `_merge` indicator.
content_taxons.shape

(340457, 22)

In [34]:
# Shape before the merge, for comparison with the merged frame's row count.
content_long.shape

(336950, 14)

In [35]:
# How the outer join matched up:
#   both       - content row whose taxon_id was found in `taxons`
#   left_only  - content row whose taxon_id is missing from `taxons`
#   right_only - taxon with no content row pointing at it
content_taxons['_merge'].value_counts()

both          335931
left_only       2915
right_only      1611
Name: _merge, dtype: int64

In [None]:
# Inspect the content rows whose taxon_id has no counterpart in `taxons`.
# (The cell previously held an empty subscript, which is a SyntaxError.)
content_taxons[content_taxons['_merge'] == 'left_only'].head()