In [1]:
import numpy as np
import nltk
import pandas as pd
from pandas.io.json import json_normalize

import json
from bs4 import BeautifulSoup
from lxml import etree,html

import os
from collections import Counter,OrderedDict

### 1. Read data.

In [2]:
# Root data directory, read from the DATADIR environment variable (None when it is unset).
DATADIR = os.environ.get('DATADIR')

In [3]:
# Locations of the raw input dump and of the cleaned output file, both under DATADIR.
CONTENT_INPUT, CONTENT_OUTPUT = (
    os.path.join(DATADIR, file_name)
    for file_name in ('raw_content.json.gz', 'clean_content.csv')
)

In [4]:
# Load the raw content items from the gzipped, table-oriented JSON dump.
# Only the non-default options are passed: the other keyword arguments that used
# to be spelled out here were all defaults, `numpy` is no longer accepted by
# pandas.read_json, and newer pandas refuses explicit `dtype` / `convert_axes`
# values together with orient='table'.
content = pd.read_json(
    CONTENT_INPUT,
    orient='table',
    compression='gzip',
)

In [5]:
content.head(3)

Unnamed: 0,base_path,content_id,description,details,document_type,first_published_at,locale,primary_publishing_organisation,publishing_app,taxons,title
0,/vehicle-tax,fa748fae-3de4-4266-ae85-0797ada3f40c,Renew or tax your vehicle for the first time u...,{'transaction_start_link': 'https://www.vehicl...,transaction,2016-02-29T09:24:10.000+00:00,en,,publisher,"[{'title': 'Popular services', 'content_id': '...",Tax your vehicle
1,/student-finance-register-login,e57daef4-5eb5-431c-b0ad-14119ab0355f,Your student finance online account - check pa...,{'transaction_start_link': 'https://www.studen...,transaction,2016-02-29T09:24:10.000+00:00,en,,publisher,"[{'title': 'Student grants, bursaries and scho...",Student finance login
2,/government/organisations/companies-house,c36bd301-d0c5-4492-86ad-ee7843b8383b,,{'brand': 'department-for-business-innovation-...,organisation,2016-02-29T09:24:10.000+00:00,en,,whitehall,,Companies House


In [6]:
content.columns

Index(['base_path', 'content_id', 'description', 'details', 'document_type',
       'first_published_at', 'locale', 'primary_publishing_organisation',
       'publishing_app', 'taxons', 'title'],
      dtype='object')

Redirects are dropped because they have no assigned taxons.

In [7]:
### Redirects have no taxons: count the redirect rows whose `taxons` value is missing.
content.loc[content['document_type'] == 'redirect', 'taxons'].isna().sum()

1689

In [8]:
print("DROP REDIRECTS")
print("BEFORE:",content.shape)
# .copy() makes the filtered frame independent of the original, so the column
# assignments in later cells (body, length, ...) do not write into a slice and
# do not trigger SettingWithCopyWarning.
content = content[content.document_type != "redirect"].copy()
print("AFTER:",content.shape)

DROP REDIRECTS
BEFORE: (197773, 11)
AFTER: (196084, 11)


In [9]:
# Show every row whose content_id occurs more than once, grouped by id.
ids = content["content_id"]
content.loc[ids.duplicated(keep=False)].sort_values(by='content_id')

Unnamed: 0,base_path,content_id,description,details,document_type,first_published_at,locale,primary_publishing_organisation,publishing_app,taxons,title
160721,/government/publications/patent-decision-o14895,04c0a94c-5a58-42f3-a791-1f172f637d3a,Decision outcome on patent application no 9104...,{'emphasised_organisations': ['5d6f9583-991f-4...,decision,1995-10-30T00:00:00.000+00:00,en,{'title': 'Intellectual Property Office'},whitehall,"[{'title': 'Business and enterprise', 'content...",Patent decision: O/148/95
159553,/government/publications/patent-decision-o14895,04c0a94c-5a58-42f3-a791-1f172f637d3a,Decision outcome on patent application no 9104...,{'emphasised_organisations': ['5d6f9583-991f-4...,decision,1995-10-30T00:00:00.000+00:00,en,{'title': 'Intellectual Property Office'},whitehall,"[{'title': 'Business and enterprise', 'content...",Patent decision: O/148/95
159355,/government/publications/patent-decision-o02091,04c3c441-c6d1-48aa-9d73-94f7742c7372,Decision outcome on application for the revoca...,{'emphasised_organisations': ['5d6f9583-991f-4...,decision,1991-01-29T00:00:00.000+00:00,en,{'title': 'Intellectual Property Office'},whitehall,"[{'title': 'Business and enterprise', 'content...",Patent decision: O/020/91
160714,/government/publications/patent-decision-o02091,04c3c441-c6d1-48aa-9d73-94f7742c7372,Decision outcome on application for the revoca...,{'emphasised_organisations': ['5d6f9583-991f-4...,decision,1991-01-29T00:00:00.000+00:00,en,{'title': 'Intellectual Property Office'},whitehall,"[{'title': 'Business and enterprise', 'content...",Patent decision: O/020/91
160720,/government/publications/patent-journal-specia...,05082ba4-077e-4bed-ba5d-ad67365dd62b,Publication date 26 April 2017.,{'emphasised_organisations': ['5d6f9583-991f-4...,notice,2017-04-26T03:30:05.000+00:00,en,{'title': 'Intellectual Property Office'},whitehall,,Patent Journal special notices: 6675
159552,/government/publications/patent-journal-specia...,05082ba4-077e-4bed-ba5d-ad67365dd62b,Publication date 26 April 2017.,{'emphasised_organisations': ['5d6f9583-991f-4...,notice,2017-04-26T03:30:05.000+00:00,en,{'title': 'Intellectual Property Office'},whitehall,,Patent Journal special notices: 6675
90930,/guidance/saint-vincent-and-the-grenadines-mig...,050997c8-f7cc-48de-bebb-285067b3b6a0,Advice and guidance on the health needs of mig...,{'emphasised_organisations': ['1343f283-19e9-4...,detailed_guide,2014-07-30T23:00:00.000+00:00,en,{'title': 'Public Health England'},whitehall,"[{'title': 'Public health', 'content_id': 'd6a...",Saint Vincent and the Grenadines: migrant heal...
90918,/guidance/saint-vincent-and-the-grenadines-mig...,050997c8-f7cc-48de-bebb-285067b3b6a0,Advice and guidance on the health needs of mig...,{'emphasised_organisations': ['1343f283-19e9-4...,detailed_guide,2014-07-30T23:00:00.000+00:00,en,{'title': 'Public Health England'},whitehall,"[{'title': 'Public health', 'content_id': 'd6a...",Saint Vincent and the Grenadines: migrant heal...
160715,/government/publications/patent-decision-o19092,0584f7ec-4af6-4e72-bc3f-c9ee7dddae25,Decision outcome on application for revocation...,{'emphasised_organisations': ['5d6f9583-991f-4...,decision,1992-11-19T00:00:00.000+00:00,en,{'title': 'Intellectual Property Office'},whitehall,"[{'title': 'Business and enterprise', 'content...",Patent decision: O/190/92
159356,/government/publications/patent-decision-o19092,0584f7ec-4af6-4e72-bc3f-c9ee7dddae25,Decision outcome on application for revocation...,{'emphasised_organisations': ['5d6f9583-991f-4...,decision,1992-11-19T00:00:00.000+00:00,en,{'title': 'Intellectual Property Office'},whitehall,"[{'title': 'Business and enterprise', 'content...",Patent decision: O/190/92


### 2. Extract page text from nested json.

Some preliminary counting; these cells are exploratory and do not need to be run.

In [10]:
# Three independent, initially empty tallies used by the exploratory counting.
total_tags, total_types, sub_types = Counter(), Counter(), Counter()

In [11]:
# Serialise the `details` value of row 0 to a JSON string, parse it back into
# plain Python objects, and print the top-level keys to see what a details
# record contains.
r = json.dumps(content['details'][0])
loaded_r = json.loads(r)
print(loaded_r.keys())

dict_keys(['transaction_start_link', 'department_analytics_profile', 'will_continue_on', 'introductory_paragraph', 'other_ways_to_apply', 'external_related_links', 'start_button_text'])


#### 2.1 Bool checks for json/html.

Checks for json and html strings.

In [12]:
def is_json(raw_text):
    """Return True when ``raw_text`` is a JSON object, i.e. a dict.

    ``get_text`` uses this check to decide whether a list element can be
    treated as a mapping and queried for its ``'body'`` / ``'child_sections'``
    keys, so "is JSON" here means "is a dict" (``OrderedDict`` included).

    The check is a plain ``isinstance`` test: it builds no DataFrame and cannot
    raise, whatever the input type. Strings, numbers, ``None`` and lists are
    all reported as not JSON.

    Parameters
    ----------
    raw_text : object
        Any value found inside a ``details`` record.

    Returns
    -------
    bool
    """
    return isinstance(raw_text, dict)

def is_html(raw_text):
    """Return True when the string form of ``raw_text`` parses to nested HTML elements.

    The value is converted with ``str()``, parsed with ``lxml.html.fromstring``
    and reported as HTML when the parsed root has at least one descendant
    element.

    NOTE(review): a fragment made of a single tag with no nested tags (for
    example ``<p>text</p>``) appears to be reported as *not* HTML, because lxml
    returns that element itself as the root -- confirm this is intended.

    Parameters
    ----------
    raw_text : object
        Any value; it is stringified before parsing.

    Returns
    -------
    bool
        False for blank input and for input lxml cannot parse.
    """
    markup = str(raw_text)
    # Blank input holds no tags, and lxml raises ParserError ("Document is
    # empty") for it, so answer without calling the parser.
    if not markup.strip():
        return False
    try:
        return html.fromstring(markup).find('.//*') is not None
    except etree.ParserError:
        return False

#### 2.2 Main function for text extraction.

Tags of main interest, manually discovered.

In [13]:
# Wider list of candidate keys considered earlier, kept for reference:
# look = ['title', 'note', 'text', 'preposition', 'change_note', 'child_sections', 'abbreviation', 'headers']
# NOTE(review): `look` is not referenced by any other code visible in this notebook.
look = ['text', 'child_sections', 'headers']
# Keys read from every entry of a 'child_sections' list by get_text.
child_keys = ['title','description']

Heuristics include length of strings and json keys listed above.

In [14]:
def _text_from_list_item(item):
    """Return the text pieces contributed by one element of a list-valued details field."""
    if is_json(item):
        try:
            section = dict(item)
        except (TypeError, ValueError):
            return []
        body = section.get('body')
        if isinstance(body, str):
            cleaned = extract_text(body)
            # Keep only bodies with more than ten space-separated tokens.
            return [cleaned] if len(cleaned.split(" ")) > 10 else []
        if 'child_sections' in section:
            # Child titles/descriptions are appended as they are (not passed
            # through extract_text); missing or non-string fields are skipped.
            return [
                child[field]
                for child in (section['child_sections'] or [])
                for field in child_keys
                if isinstance(child, dict) and isinstance(child.get(field), str)
            ]
        return []
    if is_html(item):
        return [extract_text(item)]
    return []


def get_text(x):
    """Extract the readable text held in one content item's ``details`` record.

    The top-level values of ``x`` are visited in order:

    * a string longer than two characters that contains at least one space is
      cleaned with ``extract_text``;
    * a list is walked element by element: for a dict with a string ``'body'``
      the body is cleaned with ``extract_text`` and kept when it has more than
      ten space-separated tokens; otherwise, for a dict with
      ``'child_sections'``, the ``child_keys`` fields of every child are
      appended; any other element accepted by ``is_html`` is cleaned with
      ``extract_text``.

    Parameters
    ----------
    x : dict or None
        The ``details`` value of a row. Anything that cannot be turned into a
        dict (``None``, ``NaN``, ...) yields an empty string instead of raising.

    Returns
    -------
    str
        The collected pieces joined by single spaces, with leading and
        trailing whitespace removed.
    """
    try:
        details = dict(x)
    except (TypeError, ValueError):
        # Missing details (None / NaN) or a value that is not a mapping.
        return ""

    pieces = []
    for raw_text in details.values():
        if isinstance(raw_text, str):
            # Heuristic: ignore very short strings and strings without a space.
            if len(raw_text) > 2 and len(raw_text.split(" ")) > 1:
                pieces.append(extract_text(raw_text))
        elif isinstance(raw_text, (list, tuple)):
            for sub_text in raw_text:
                pieces.extend(_text_from_list_item(sub_text))
    # Joining once avoids repeated string concatenation.
    return " ".join(pieces).strip()

#### 2.3 Function for html extraction.

In [15]:
# Clean the html

def extract_text(body):
    """Strip markup from ``body`` and return normalised lower-case text.

    The input is parsed as HTML, all text nodes are joined, commas are replaced
    by spaces, the result is lower-cased and every run of whitespace (spaces,
    newlines, tabs) is collapsed to a single space.

    Parameters
    ----------
    body : str or None
        HTML or plain text. Missing values (``None``, ``NaN``) and blank
        strings are treated as empty.

    Returns
    -------
    str
        The cleaned text, or a single space ``' '`` when there is nothing to
        extract, so the result can always be concatenated.
    """
    # Non-string values (None, NaN) and whitespace-only strings have no text;
    # handling them here replaces the earlier length-based workaround for
    # single newline characters.
    if not isinstance(body, (str, bytes)) or not body.strip():
        return ' '
    try:
        tree = etree.HTML(body)
    except (ValueError, etree.LxmlError):
        print("exception @ extract:",type(body),body)
        return ' '
    if tree is None:
        # The parser produced no document for this input.
        return ' '
    text = ' '.join(tree.xpath('//text()')).replace(',', ' ').lower()
    # split() + join collapses newlines, carriage returns, tabs and repeated
    # spaces in a single pass.
    return ' '.join(text.split()) or ' '

#### 2.4 Map details to dataset `body` column. Takes a while.

In [16]:
# Build the `body` column by running get_text over every row's details record.
content['body'] = content['details'].apply(get_text)

In [17]:
content.head(3)

Unnamed: 0,base_path,content_id,description,details,document_type,first_published_at,locale,primary_publishing_organisation,publishing_app,taxons,title,body
0,/vehicle-tax,fa748fae-3de4-4266-ae85-0797ada3f40c,Renew or tax your vehicle for the first time u...,{'transaction_start_link': 'https://www.vehicl...,transaction,2016-02-29T09:24:10.000+00:00,en,,publisher,"[{'title': 'Popular services', 'content_id': '...",Tax your vehicle,tax your car motorcycle or other vehicle using...
1,/student-finance-register-login,e57daef4-5eb5-431c-b0ad-14119ab0355f,Your student finance online account - check pa...,{'transaction_start_link': 'https://www.studen...,transaction,2016-02-29T09:24:10.000+00:00,en,,publisher,"[{'title': 'Student grants, bursaries and scho...",Student finance login,the student finance england website you’ll be ...
2,/government/organisations/companies-house,c36bd301-d0c5-4492-86ad-ee7843b8383b,,{'brand': 'department-for-business-innovation-...,organisation,2016-02-29T09:24:10.000+00:00,en,,whitehall,,Companies House,


In [18]:
content.shape

(196084, 12)

### 4. Add `length`, `title`, `description` and `combined_text` columns.
Column contents:
* Length: referring to number of words in page.
* Title: referring to title of page.
* Description: referring to the description of the page (field may be null, take care).
* Combined_text: Concatenate `title`, `description` and `body`.

#### 4.1 Length.

In [20]:
# Number of whitespace-separated words in each page body.
content['length'] = [len(body_text.split()) for body_text in content['body']]

#### 4.2 Title.

In [21]:
# Replace each title with its cleaned, lower-cased form.
content = content.assign(title=content['title'].map(extract_text))

#### 4.3 Description.

In [22]:
# Replace each description with its cleaned, lower-cased form.
content = content.assign(description=content['description'].map(extract_text))

#### 4.4 Combined text.

In [23]:
# Title, description and body of each row joined by single spaces.
content['combined_text'] = content['title'].str.cat(
    [content['description'], content['body']],
    sep=' ',
)

In [31]:
# NOTE(review): find_basic_info is defined further down (section 5.1), so on a
# fresh top-to-bottom run this call fails unless that cell is executed first.
find_basic_info(0)

BASE PATH:
 https://www.gov.uk/api/content/vehicle-tax 
=====
TITLE:
 tax your vehicle 
=====
DESCRIPTION:
 renew or tax your vehicle for the first time using a reminder letter your log book the 'new keeper's details' section of a log book - and how to tax if you don't have any documents 
=====
DETAILS:
=====
BODY:
=====


### 5. Output to csv.

In [None]:
# Keep non-null taxons entries as they are and replace missing ones with None.
content['taxons'] = content['taxons'].where(content['taxons'].notnull(), None)

In [None]:
# Write the cleaned frame, without its index, to a file in the current working directory.
# NOTE(review): CONTENT_OUTPUT is defined near the top of the notebook but is not
# used here -- confirm the intended destination.
content.to_csv("clean_content_enrich_test.csv",index=False)

In [None]:
# NOTE(review): this reads "clean_content_test.csv", not the
# "clean_content_enrich_test.csv" written by the previous cell -- confirm which
# file is intended.
# Presumably list/dict columns such as `taxons` come back from CSV as plain
# strings; verify before running the taxon cells that index into them.
content = pd.read_csv("clean_content_test.csv")

In [None]:
content.head()

### 4. Taxon work.

In [None]:
content.columns

In [None]:
# Names of every column except `taxons`; used later as the id columns of the melt.
content_columns = content.columns.drop('taxons').values

In [None]:
# Spread each row's list of taxons over numbered columns, next to the other columns.
content_wide = pd.concat(
    [
        content.drop(columns='taxons'),
        content['taxons'].apply(pd.Series),
    ],
    axis=1,
)

In [None]:
# Wide -> long: one row per (content item, taxon slot); the slot value goes in `taxon`.
content_long = content_wide.melt(id_vars=content_columns, value_name='taxon')

In [None]:
content_long.columns

In [None]:
content_long['taxon'][4]

In [None]:
content_wide[0:10]

In [None]:
# Keep only the rows that actually carry a taxon.
mask = content_long['taxon'].isna()
content_long = content_long.loc[~mask]

In [None]:
# Pull the taxon's content_id out of each taxon record into its own column.
content_long = content_long.assign(
    taxon_id=content_long['taxon'].map(lambda taxon: taxon['content_id'])
)

In [None]:
# The taxon record itself is no longer needed once taxon_id has been extracted.
content_long = content_long.drop(columns='taxon')

In [None]:
content_long.shape[0]

In [None]:
content_long.head()

In [None]:
# Taxon lookup table, read from a path relative to the notebook's working directory.
# NOTE(review): the content input is located via DATADIR -- confirm whether this
# file should be as well.
taxons = pd.read_csv('../../data/clean_taxons.csv')

In [None]:
# Keep only the identifying columns and the taxon-level columns.
taxons = taxons.loc[
    :,
    [
        'base_path',
        'content_id',
        'taxon_name',
        'level1taxon',
        'level2taxon',
        'level3taxon',
        'level4taxon',
    ],
]

In [None]:
# Outer-join content rows to taxons on the taxon's content_id; the `_merge`
# indicator column records which side(s) each row came from.
content_taxons = content_long.merge(
    taxons,
    how='outer',
    left_on='taxon_id',
    right_on='content_id',
    indicator=True,
)

In [None]:
content_taxons.head()

In [None]:
content_taxons.shape

In [None]:
content_long.shape

In [None]:
content_taxons['_merge'].value_counts()

In [None]:
list(content_taxons.columns.values)

### 5. Testing.

In [None]:
# Spot-check one page (row 5): print its gov.uk URL and extracted body, then the
# body's character count and its space-separated tokens.
body_sample = content['body'][5]
print("https://www.gov.uk"+content['base_path'][5],"\n",body_sample)
print(len(body_sample),body_sample.split(" "))

In [None]:
# Bodies that are between one and six words long (inclusive).
content.loc[content['length'].between(1, 6), 'body']

In [None]:
content.shape

#### 5.1 Check out main page features.

In [30]:
### REF 2023, 8975
def find_basic_info(i):
    """Print the base path, title, description, details and body of row ``i``.

    Values are looked up in the notebook-level ``content`` frame with
    ``content[column][i]``; every field is followed by a ``=====`` separator.
    """
    print("BASE PATH:\n https://www.gov.uk/api/content"+content['base_path'][i],"\n=====")
    labelled_columns = (
        ("TITLE:\n", 'title'),
        ("DESCRIPTION:\n", 'description'),
        ("DETAILS:\n", 'details'),
        ("BODY:\n", 'body'),
    )
    for label, column in labelled_columns:
        print(label, content[column][i], "\n=====")

In [None]:
find_basic_info(8975)

#### 5.2 Empty text.

In [None]:
# Tally of empty-text pages per document type, and the URLs of those pages.
empty_text, base_paths = Counter(), []

In [None]:
# For the first ten rows, re-extract the text and record the pages that yield
# nothing: count them per document type and remember their URLs.
# Fields are read by column name rather than by tuple position, so the loop
# keeps reading the right columns if the column order of `content` changes.
for tup in content[0:10].itertuples():
    text = get_text(tup.details)
    if len(text)==0:
        empty_text[tup.document_type]+=1
        base_paths.append("https://www.gov.uk"+tup.base_path)

In [None]:
print(len(base_paths))
empty_text

In [None]:
# Print the document types ranked by how many empty-text pages they have, largest first.
for i, (key, value) in enumerate(empty_text.most_common(), start=1):
    print(str(i)+".",key+":",value)

In [None]:
# NOTE(review): `potential_sub_types` is not assigned in any cell visible in this
# notebook; on a fresh kernel this line would presumably raise NameError --
# confirm where it is meant to be built.
potential_sub_types

Counter({'abbreviation': 57,
         'allowed_values': 224,
         'base_path': 22710,
         'body': 16021,
         'change_note': 22710,
         'child_sections': 529,
         'content_id': 1646,
         'content_type': 1646,
         'created_at': 1646,
         'display_as_result_metadata': 1568,
         'documents': 10550,
         'filterable': 1568,
         'headers': 932,
         'id': 11890,
         'key': 1568,
         'kind': 1427,
         'level': 11890,
         'name': 896,
         'note': 283201,
         'options': 1427,
         'preposition': 672,
         'public_timestamp': 283201,
         'published_at': 22710,
         'section_id': 20483,
         'short_name': 448,
         'slug': 5873,
         'text': 11890,
         'title': 41303,
         'type': 1120,
         'updated_at': 1646,
         'url': 1794,
         'web_url': 57})

### Out of 1000 rows

Counter({'abbreviation': 3,
         'base_path': 220,
         'body': 2258,
         'change_note': 220,
         'content_id': 1,
         'content_type': 1,
         'created_at': 1,
         'id': 1,
         'key': 21,
         'kind': 636,
         'name': 12,
         'note': 3873,
         'preposition': 9,
         'public_timestamp': 3873,
         'published_at': 220,
         'short_name': 6,
         'slug': 2260,
         'text': 1,
         'title': 2766,
         'type': 15,
         'updated_at': 1,
         'url': 78,
         'web_url': 3})

Counter({NoneType: 52752,
         list: 979755,
         numpy.bool_: 432006,
         numpy.int64: 2704,
         str: 1061581})

In [None]:
type(content['details'][0])

#### 5.3 Metrics for text content.

In [25]:
# Word counts of the cleaned title and description of every page.
content['title_len'] = [len(title.split()) for title in content['title']]
content['description_len'] = [
    len(description.split()) for description in content['description']
]
content.head()

Unnamed: 0,base_path,content_id,description,details,document_type,first_published_at,locale,primary_publishing_organisation,publishing_app,taxons,title,body,title_len,length,combined_text,description_len
0,/vehicle-tax,fa748fae-3de4-4266-ae85-0797ada3f40c,renew or tax your vehicle for the first time u...,{'transaction_start_link': 'https://www.vehicl...,transaction,2016-02-29T09:24:10.000+00:00,en,,publisher,"[{'title': 'Popular services', 'content_id': '...",tax your vehicle,tax your car motorcycle or other vehicle using...,3,301,tax your vehicle renew or tax your vehicle for...,36
1,/student-finance-register-login,e57daef4-5eb5-431c-b0ad-14119ab0355f,your student finance online account - check pa...,{'transaction_start_link': 'https://www.studen...,transaction,2016-02-29T09:24:10.000+00:00,en,,publisher,"[{'title': 'Student grants, bursaries and scho...",student finance login,the student finance england website you’ll be ...,3,162,student finance login your student finance onl...,20
2,/government/organisations/companies-house,c36bd301-d0c5-4492-86ad-ee7843b8383b,,{'brand': 'department-for-business-innovation-...,organisation,2016-02-29T09:24:10.000+00:00,en,,whitehall,,companies house,,2,0,companies house,0
3,/get-information-about-a-company,9ca1a27b-af7b-44d2-b10d-0a6d0e3ff53d,get company information including registered a...,{'transaction_start_link': 'https://beta.compa...,transaction,2016-02-29T09:24:10.000+00:00,en,,publisher,"[{'title': 'Business debt and bankruptcy', 'co...",get information about a company,you can get some details about a company for f...,5,123,get information about a company get company in...,21
4,/check-vehicle-tax,0889f128-e479-465f-b3e1-a3db6a3879cf,check and report if a vehicle has up-to-date v...,{'transaction_start_link': 'https://www.vehicl...,transaction,2016-02-29T09:24:10.000+00:00,en,,publisher,"[{'title': 'Popular services', 'content_id': '...",check if a vehicle is taxed,you’ll need the number plate (registration num...,6,63,check if a vehicle is taxed check and report i...,15


In [None]:
print(content['title_len'].describe(),content['description_len'].describe())
print(content['title_len'].sum(),content['description_len'].sum())

In [None]:
# Base path of the first page whose title has more than 40 words.
print(content.loc[content['title_len'] > 40, 'base_path'].iloc[0])