In [119]:
import pandas as pd
import numpy as np
import os
import operator
import time

## 1. Read data.

In [120]:
DATADIR=os.getenv('DATADIR')
print(DATADIR)
DATAPATH = os.path.join(DATADIR, 'labelled_level2.csv.gz')

/Users/felisialoukou/Documents/govuk-taxonomy-supervised-learning/data


In [121]:
data = pd.read_csv(DATAPATH,compression='gzip')
data.shape

  interactivity=interactivity, compiler=compiler, result=result)


(173560, 21)

In [122]:
data.head()

Unnamed: 0,base_path,content_id,description,details,document_type,first_published_at,locale,primary_publishing_organisation,publishing_app,title,...,body,combined_text,taxon_id,taxon_base_path,taxon_name,level1taxon,level2taxon,level3taxon,level4taxon,level5taxon
0,/student-finance-register-login,e57daef4-5eb5-431c-b0ad-14119ab0355f,your student finance online account - check pa...,{'will_continue_on': 'the Student Finance Engl...,transaction,2016-02-29T09:24:10.000+00:00,en,,publisher,student finance login,...,,student finance login your student finance onl...,64aa6eec-48b5-481d-9131-9c8b6326eea1,/education/student-grants-bursaries-scholarships,"Student grants, bursaries and scholarships","Education, training and skills",Funding and finance for students,"Student grants, bursaries and scholarships",,
1,/student-finance-calculator,434b6eb5-33c8-4300-aba3-f5ead58600b8,student finance calculator - get a quick estim...,{'introductory_paragraph': '<p>This calculator...,transaction,2016-02-29T09:24:10.000+00:00,en,,smartanswers,student finance calculator,...,,student finance calculator student finance cal...,64aa6eec-48b5-481d-9131-9c8b6326eea1,/education/student-grants-bursaries-scholarships,"Student grants, bursaries and scholarships","Education, training and skills",Funding and finance for students,"Student grants, bursaries and scholarships",,
2,/apply-online-for-student-finance,83155b50-418e-437c-9389-cf0e1302900f,apply online for student finance - online appl...,{'will_continue_on': 'the Student Finance Engl...,transaction,2016-02-29T09:24:10.000+00:00,en,,publisher,apply online for student finance,...,,apply online for student finance apply online ...,64aa6eec-48b5-481d-9131-9c8b6326eea1,/education/student-grants-bursaries-scholarships,"Student grants, bursaries and scholarships","Education, training and skills",Funding and finance for students,"Student grants, bursaries and scholarships",,
3,/student-finance,d38bafd3-2c46-4be2-b50b-50c2ba7d30ed,student finance - student loans or student gra...,"{'parts': [{'title': 'Overview', 'slug': 'over...",guide,2016-02-29T09:24:10.000+00:00,en,,publisher,student finance,...,,student finance student finance - student loan...,64aa6eec-48b5-481d-9131-9c8b6326eea1,/education/student-grants-bursaries-scholarships,"Student grants, bursaries and scholarships","Education, training and skills",Funding and finance for students,"Student grants, bursaries and scholarships",,
4,/apply-for-student-finance,06017464-d3e2-4a4c-8bef-250eff0de7e4,how to apply for student finance - when to app...,"{'parts': [{'title': 'How to apply', 'slug': '...",guide,2016-02-29T09:24:10.000+00:00,en,,publisher,student finance: how to apply,...,,student finance: how to apply how to apply for...,64aa6eec-48b5-481d-9131-9c8b6326eea1,/education/student-grants-bursaries-scholarships,"Student grants, bursaries and scholarships","Education, training and skills",Funding and finance for students,"Student grants, bursaries and scholarships",,


### 1.2 UTILS.

In [123]:
data.columns

Index(['base_path', 'content_id', 'description', 'details', 'document_type',
       'first_published_at', 'locale', 'primary_publishing_organisation',
       'publishing_app', 'title', 'document_type_gp', 'body', 'combined_text',
       'taxon_id', 'taxon_base_path', 'taxon_name', 'level1taxon',
       'level2taxon', 'level3taxon', 'level4taxon', 'level5taxon'],
      dtype='object')

In [124]:
def build_index(x):
    index_dict = {}
    index_dict['index'] = 0
    for i,elem in enumerate(x):
        index_dict[elem] = i+1
    return index_dict

In [125]:
ind = build_index(data.columns)
print(ind['level2taxon'])

18


### 2. Taxons

In [126]:
data['level2taxon'].nunique()

440

In [127]:
#COLLAPSE World level2taxons
data.loc[data['level1taxon'] == 'World', 'level2taxon'] = 'world_level1'

#creating categorical variable for level2taxons from values
data['level2taxon'] = data['level2taxon'].astype('category')

#Get the category numeric values (codes) and avoid zero-indexing
labels = data['level2taxon'].cat.codes + 1

#create dictionary of taxon category code to string label for use in model evaluation
labels_index = dict(zip((labels), data['level2taxon']))
labels_index

{1: 'Administrative justice reform',
 2: 'Adoption, fostering and surrogacy',
 3: 'Afghanistan',
 4: 'Armed Forces Covenant',
 5: 'Armed forces',
 6: 'Armed forces and Ministry of Defence reform',
 7: 'Armed forces support for activities in the UK',
 8: 'Arts and culture',
 9: 'Assessing environmental impact',
 10: 'Asylum',
 11: 'Attorney General guidance to the legal profession',
 12: 'Aviation',
 13: 'Benefits entitlement',
 14: 'Benefits for families',
 15: 'Biodiversity and ecosystems',
 16: 'Boating and inland waterways',
 17: 'Brexit',
 18: 'Brexit and the EU',
 19: 'British citizenship ',
 20: 'British nationals overseas',
 21: 'Business and enterprise',
 22: 'Business and the environment',
 23: 'Business tax',
 24: 'Byelaws',
 25: 'Carers and disability benefits',
 26: "Carers' health",
 27: 'Certificates, register offices, changes of name or gender',
 28: 'Charities, volunteering and honours',
 29: 'Child Benefit',
 30: 'Child maintenance reform',
 31: 'Childcare and early ye

In [128]:
data['level1taxon'] = data['level1taxon'].astype('category')

In [129]:
taxon_dict = data['level2taxon'].value_counts()
taxon_dict

Business and enterprise                                         11717
Government efficiency, transparency and accountability          11049
UK economy                                                      10527
Trade and investment                                             7229
Public health                                                    6156
National security                                                5765
Climate change and energy                                        5675
Foreign affairs                                                  5010
National Health Service                                          4970
Community and society                                            4334
Housing                                                          3996
Wildlife and animal welfare                                      3944
Housing planning and building                                    3826
International aid and development                                3780
Science and innovati

In [130]:
len(taxon_dict)

210

In [131]:
data['level1taxon'].nunique()

20

### Multiple parents.

In [62]:
from collections import Counter
children = {}
parent_count = Counter()
for tup in data.itertuples():
    if not tup[18] in children.keys():
        children[tup[18]] = tup[17]
        parent_count[tup[18]]+=1
    else:
        if parent_count[tup[18]]>1: 
            print("parent woes")
            children[tup[18]+str(parent_count[tup[18]])] = tup[17]

In [61]:
len(parent_count)

210

### Count support per taxon.
Identify taxons with problematic support:
1. Less than 10
2. Less than 50 
3. Less than 100
4. Less than 500 (threshold)

In [151]:
dict_of_dicts = {}
support_thresh = [0,10,50,100,500]
ttotal = 0
tcounter = 0
for i,s in enumerate(support_thresh):
    sub_dict = {}
    counter = 0
    total = 0
    if i < len(support_thresh)-1:
        s1 = s
        s2 = support_thresh[i+1]
        for key,value in taxon_dict.items():
            if value > s1 and value <= s2 :
                sub_dict[key] = value
                total+=value
                counter+=1
        dict_of_dicts[s2] = sub_dict
        print("Support threshold > "+str(s1)+" and <= " +str(s2)+": "+str(counter)+" taxons")
        print("Total pages tagged to taxon:",total)
        ttotal+=total
        tcounter+=counter
        
print(ttotal,tcounter)

Support threshold > 0 and <= 10: 25 taxons
Total pages tagged to taxon: 129
Support threshold > 10 and <= 50: 52 taxons
Total pages tagged to taxon: 1382
Support threshold > 50 and <= 100: 28 taxons
Total pages tagged to taxon: 2027
Support threshold > 100 and <= 500: 45 taxons
Total pages tagged to taxon: 11211
14749 150


In [150]:
len(dict_of_dicts)

4

In [155]:
dict_of_dicts[10].keys()

dict_keys(['Expenses and employee benefits', 'HS2 and the environment', 'What you can bring to the UK', 'Assessing environmental impact', 'Death and benefits', 'Voting', 'Nuclear disarmament', 'Secondments with government', 'Legislative process', 'Medical certification and advice', 'Forced marriage', 'Devolution', 'Public sector land use', 'Universal Credit', 'Boating and inland waterways', 'Civil justice reform', 'Work and disabled people', 'Domestic violence', 'UK border control', 'Content and publishing', 'Statutory rights', 'Criminal record disclosure', 'Byelaws', 'Diplomats', 'Government graduate schemes'])

In [160]:
for i in [10,50,100,500]:
    print(i)
    for key,value in sorted(dict_of_dicts[i].items(), key=operator.itemgetter(1),reverse=True):
        print(key,"\t",value)
    print("======")

10
Death and benefits 	 10
Voting 	 9
UK border control 	 9
Expenses and employee benefits 	 8
HS2 and the environment 	 8
Work and disabled people 	 8
What you can bring to the UK 	 7
Nuclear disarmament 	 7
Medical certification and advice 	 7
Assessing environmental impact 	 6
Devolution 	 5
Universal Credit 	 5
Boating and inland waterways 	 5
Content and publishing 	 5
Legislative process 	 4
Criminal record disclosure 	 4
Diplomats 	 4
Civil justice reform 	 3
Statutory rights 	 3
Government graduate schemes 	 3
Secondments with government 	 2
Forced marriage 	 2
Public sector land use 	 2
Byelaws 	 2
Domestic violence 	 1
50
Youth employment and social issues 	 48
Having a child, parenting and adoption 	 47
Postal service reform 	 47
Courts, sentencing and tribunals 	 47
Transport planning 	 47
Disabled people 	 47
Starting and attending school 	 42
Payroll 	 40
Money laundering regulations 	 39
Reporting crimes and getting compensation 	 38
Passports and travel documents for fo

### Recent Taxon stuff.

In [112]:
taxon_hierarchy = pd.read_csv('/Users/felisialoukou/Downloads/Copy of Sitewide taxonomy (31.01.2018) \
- Sitewide taxonomy 31.01.2018.tsv', sep='\t')

In [113]:
taxon_hierarchy.head()

Unnamed: 0,Level 1,Level 2,Level 3,Level 4,Level 5
0,World,,,,
1,,Barbados and the UK,,,
2,,,British embassy or high commission,,
3,,,Trade and invest,,
4,,,News and events,,


In [114]:
taxon_hierarchy['Level 2'].nunique()

448

In [115]:
counter = 1
for key in sub_dict.keys():
#     print(key)
    data = taxon_hierarchy.loc[taxon_hierarchy['Level 2'] == key]
    if data.empty:
        print(counter,key)
        counter+=1

1 UK border control
2 Land Registration Data 
3 British citizenship 
4 Conflict in fragile states
5 Asylum
6 Transport research and evaluation
7 Visas
8 Brexit and the EU
9 What you can bring to the UK
10 Passports and travel documents for foreign nationals
11 Child Benefit
12 Diplomats
13 Immigration rules and enforcement


In [82]:
taxon_hierarchy.loc[taxon_hierarchy['Level 2'] == "Barbados and the UK"]

Unnamed: 0,Level 1,Level 2,Level 3,Level 4,Level 5
1,,Barbados and the UK,,,


## 2. Counts

In [5]:
data['title_len'] = data['title'].map(lambda x: len(x.split()))
data['description_len'] = data['description'].map(lambda x: len(x.split()))
data['body_len'] = data['body'].map(lambda x: len(x.split()) if not isinstance(x,float) else 0)
data['combined_text_len'] = data['combined_text'].map(lambda x: len(x.split()))

In [6]:
len_cols = ['title_len','description_len','body_len','combined_text_len']

In [7]:
for col in len_cols:
    print(data[col].describe(),"\n=====")

count    173560.000000
mean          7.988482
std           3.214716
min           1.000000
25%           6.000000
50%           8.000000
75%          10.000000
max          38.000000
Name: title_len, dtype: float64 
=====
count    173560.000000
mean         17.732179
std           6.971012
min           0.000000
25%          13.000000
50%          18.000000
75%          21.000000
max         149.000000
Name: description_len, dtype: float64 
=====
count    173560.000000
mean        331.267758
std         612.127736
min           0.000000
25%          38.000000
50%         118.000000
75%         396.000000
max       31129.000000
Name: body_len, dtype: float64 
=====
count    173560.000000
mean        356.988419
std         612.851116
min           2.000000
25%          63.000000
50%         144.000000
75%         423.000000
max       31145.000000
Name: combined_text_len, dtype: float64 
=====
