In [1]:
import pandas as pd
import numpy as np
import os
import operator

from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score 
from sklearn.metrics import precision_recall_fscore_support, classification_report

import matplotlib.pyplot as plt
%matplotlib inline

import functools

import h5py

  return f(*args, **kwds)


## Environment variables

In [2]:
# Probability cut-off. Presumably applied to predicted probabilities further on;
# it is not referenced by the cells shown here — TODO confirm.
P_THRESHOLD = 0.5

In [3]:
# Root data directory, supplied through the DATADIR environment variable.
DATADIR = os.getenv('DATADIR')
if DATADIR is None:
    # Without this guard os.path.join(None, ...) fails with an opaque TypeError.
    raise RuntimeError(
        "The DATADIR environment variable is not set; "
        "point it at the project's data directory before running this notebook."
    )

# Dated sub-directory holding the files read by this notebook.
RESULTS_DIR = os.path.join(DATADIR, "2018-03-12")
RESULTS_DIR

'/Users/felisialoukou/Documents/govuk-taxonomy-supervised-learning/data/2018-03-12'

## Get some data about taxons/content

In [4]:
# Labelled level-2 content. dtype=object switches off pandas' type inference,
# so every column is kept as raw objects.
labelled_level2_path = os.path.join(RESULTS_DIR, 'labelled_level2.csv.gz')
labelled_level2 = pd.read_csv(labelled_level2_path, dtype=object, compression='gzip')

In [5]:
# Rows whose level-1 taxon is 'World' receive a synthetic level-2 label, in case
# any such items were not already filtered out by document type in clean_content.
is_world = labelled_level2['level1taxon'] == 'World'
labelled_level2.loc[is_world, 'level2taxon'] = 'world_level1'

In [6]:
# Treat the level-2 taxon names as a categorical variable.
labelled_level2['level2taxon'] = labelled_level2['level2taxon'].astype('category')

# Category codes are zero-based, so shift them by one to number the taxons from 1
# (pandas codes a missing value as -1, which therefore becomes 0).
level2_codes = labelled_level2['level2taxon'].cat.codes + 1
labelled_level2['level2taxon_code'] = level2_codes

# Lookup from numeric taxon code to its string label, for use in model evaluation.
labels_index = {
    code: label
    for code, label in zip(labelled_level2['level2taxon_code'],
                           labelled_level2['level2taxon'])
}

In [55]:
# (rows, columns) of the labelled data at this point in the notebook.
labelled_level2.shape

(123900, 32)

In [56]:
# Keep only the first row encountered for each content_id.
labelled_level2 = labelled_level2.drop_duplicates(subset=['content_id'], keep='first')

In [57]:
# Row counts per primary publishing organisation (a pandas Series, despite the name).
dict_pub = labelled_level2.loc[:, 'primary_publishing_organisation'].value_counts()

In [58]:
# Print every organisation name that contains "Adju"; non-string keys are skipped.
for org_name in dict_pub.index:
    if isinstance(org_name, str) and "Adju" in org_name:
        print(org_name)

Office of the Schools Adjudicator
Groceries Code Adjudicator
Pubs Code Adjudicator


In [102]:
# Organisations examined by the per-organisation counts in this notebook.
list_org = [
    'Valuation Office Agency',
    'HM Revenue & Customs',
    'District Valuer Services (DVS)',
]

In [60]:
# Column names available in the labelled level-2 data.
labelled_level2.columns

Index(['base_path', 'content_id', 'content_purpose_document_supertype',
       'content_purpose_subgroup', 'content_purpose_supergroup', 'description',
       'details', 'document_type', 'email_document_supertype',
       'first_published_at', 'government_document_supertype', 'locale',
       'navigation_document_supertype', 'public_updated_at', 'publishing_app',
       'search_user_need_document_supertype', 'title', 'updated_at',
       'user_journey_document_supertype', 'document_type_gp',
       'primary_publishing_organisation', 'body', 'combined_text', 'taxon_id',
       'taxon_base_path', 'taxon_name', 'level1taxon', 'level2taxon',
       'level3taxon', 'level4taxon', 'level5taxon', 'level2taxon_code'],
      dtype='object')

In [61]:
# For each organisation of interest, count its labelled rows that have no level-3 taxon.
lacks_level3 = labelled_level2['level3taxon'].isnull()
for org in list_org:
    from_org = labelled_level2['primary_publishing_organisation'] == org
    print(org, ":", len(labelled_level2.loc[from_org & lacks_level3]))

Valuation Office Agency : 44
HM Revenue & Customs : 663
District Valuer Services (DVS) : 4


In [None]:
# Boolean Series: True where a row has no level-3 taxon.
# NOTE(review): this displays the whole Series and the cell has no execution count —
# confirm it is still needed.
labelled_level2['level3taxon'].isnull()

In [8]:
# Text file parsed below for level-2 counts, expected in the current user's Downloads
# folder. The location is derived from the home directory instead of a hard-coded
# /Users/<name> prefix so the notebook is not tied to a single machine.
path = os.path.join(
    os.path.expanduser("~"),
    "Downloads",
    "hm-revenue-customs_and_descendants.txt",
)

In [None]:
# Empty list; no cell visible in this notebook reads from or appends to it.
lines = list()

In [64]:
# Sum the leading counts of the lines reported as "lvl2" in the text file at `path`.
#
# Each line is split on single spaces, so a run of spaces shows up as empty tokens.
# A line qualifies when some all-digit token is immediately followed by exactly four
# empty tokens; the number added to the total is the line's first whitespace-separated field.
total = 0
with open(path, 'r') as f:
    for raw_line in f:
        tokens = raw_line.strip("\n").split(" ")
        for pos, tok in enumerate(tokens):
            if not tok.isdigit():
                continue
            # Length of the run of empty tokens directly after the number.
            gap = 0
            for follower in tokens[pos + 1:]:
                if follower != "":
                    break
                gap += 1
            if gap == 4:
                first_field = raw_line.strip("\n").split()[0]
                print("found lvl2", tokens, first_field)
                total += int(first_field)
                # At most one match is counted per line.
                break
total

found lvl2 ['', '', '', '1', '', '', '', '', 'Welfare', 'reform'] 1
found lvl2 ['', '', '38', '', '', '', '', 'Tax', 'credits'] 38
found lvl2 ['', '', '26', '', '', '', '', 'Child', 'Benefit', '(welfare', 'theme)'] 26
found lvl2 ['', '', '', '1', '', '', '', '', 'Arts', 'and', 'culture'] 1
found lvl2 ['', '', '', '2', '', '', '', '', 'Community', 'and', 'society'] 2
found lvl2 ['', '', '', '1', '', '', '', '', 'Sports', 'and', 'leisure'] 1
found lvl2 ['', '', '', '2', '', '', '', '', 'Rural', 'and', 'countryside'] 2
found lvl2 ['', '', '', '0', '', '', '', '', 'Oil', 'and', 'gas'] 0
found lvl2 ['', '', '40', '', '', '', '', 'Climate', 'change', 'and', 'energy'] 40
found lvl2 ['', '', '', '1', '', '', '', '', 'Food', 'and', 'farming'] 1
found lvl2 ['', '', '', '1', '', '', '', '', 'Freight', 'and', 'cargo'] 1
found lvl2 ['', '', '', '1', '', '', '', '', 'Transport', 'security'] 1
found lvl2 ['', '', '', '0', '', '', '', '', 'Driving', 'and', 'road', 'transport'] 0
found lvl2 ['', '', ''

653

In [65]:
# Cleaned content extract, read with dtype=object so pandas does no type inference.
clean_content_path = os.path.join(RESULTS_DIR, 'clean_content.csv.gz')
clean_content = pd.read_csv(clean_content_path, dtype=object, compression='gzip')

In [66]:
# Preview the first five rows of clean_content.
clean_content.head()

Unnamed: 0,base_path,content_id,content_purpose_document_supertype,content_purpose_subgroup,content_purpose_supergroup,description,details,document_type,email_document_supertype,first_published_at,...,search_user_need_document_supertype,title,updated_at,user_journey_document_supertype,document_type_gp,primary_publishing_organisation,body,combined_text,variable,taxon_id
0,/government/news/uk-reaffirms-commitment-to-wo...,fa30b2ae-44b7-4f71-88ca-7e4f432d9c17,news,news,news_and_communications,nearly a month after the uk voted to leave the...,"{'body': '<div class=""govspeak""><p>The United ...",world_news_story,announcements,2016-08-16 19:48:45.000,...,government,uk reaffirms commitment to work with honduras ...,2018-03-09 05:35:25.799,thing,news_and_announcements,,the united kingdom will remain a committed par...,uk reaffirms commitment to work with honduras ...,0,21eee04d-e702-4e7b-9fde-2f6777f1be2c
1,/government/news/thirty-greek-students-receive...,e6789cf4-a6d8-4a4e-ab10-cc4c2ea8c08b,news,news,news_and_communications,the students were presented the award at a cer...,"{'body': '<div class=""govspeak""><p>Thirty stud...",world_news_story,announcements,2016-09-01 11:24:00.000,...,government,thirty greek students receive the diana award ...,2018-03-08 18:04:30.518,thing,news_and_announcements,,thirty students from rethymnon crete members o...,thirty greek students receive the diana award ...,0,668cd623-c7a8-4159-9575-90caac36d4b4
2,/government/news/uk-and-bulgaria-discuss-the-b...,ea8bf01a-b528-4ced-9b7d-fda5f2b1f116,news,news,news_and_communications,british ambassador jonathan allen the head of ...,"{'body': '<div class=""govspeak""><p>On 27 Novem...",world_news_story,announcements,2014-11-27 13:09:02.000,...,government,uk and bulgaria discuss the benefits of a refo...,2018-03-08 17:27:38.345,thing,news_and_announcements,,on 27 november british embassy sofia organised...,uk and bulgaria discuss the benefits of a refo...,0,3afd1d79-597d-4f97-bc7a-83766dcab2f4
3,/government/news/eid-message-video-by-mr-thoma...,db4aaa38-ba7d-4008-a88a-861a5579bc35,news,news,news_and_communications,british ambassador to morocco mr thomas reilly...,"{'body': '<div class=""govspeak""><p><a rel=""ext...",world_news_story,announcements,2017-07-03 13:08:00.000,...,government,eid message by hma thomas reilly,2018-03-09 05:11:34.932,thing,news_and_announcements,,ambassador eid on the occasion of eid al fitr ...,eid message by hma thomas reilly british ambas...,0,668cd623-c7a8-4159-9575-90caac36d4b4
4,/government/news/uk-food-drink-companies-visit...,52956505-d855-4dca-8d71-c2347f189100,news,news,news_and_communications,uk trade and investment (ukti) india in partne...,"{'body': '<div class=""govspeak""><p>The delegat...",world_news_story,announcements,2016-01-19 10:32:00.000,...,government,uk food & drink companies visit india looking ...,2018-03-08 20:19:55.729,thing,news_and_announcements,,the delegation of 15 companies brings speciali...,uk food & drink companies visit india looking ...,0,52ff5c99-a17b-42c4-a9d7-2cc92cccca39


In [67]:
# Number of clean_content rows whose primary publishing organisation is each organisation of interest.
for org in list_org:
    published_by_org = clean_content['primary_publishing_organisation'] == org
    print(org, ":", len(clean_content.loc[published_by_org]))

Valuation Office Agency : 96
HM Revenue & Customs : 11621
District Valuer Services (DVS) : 4


In [68]:
# Row counts per primary publishing organisation, computed from clean_content.
dict_pub_all = clean_content['primary_publishing_organisation'].value_counts()

# Search the counts just computed. The loop previously iterated dict_pub (built from
# labelled_level2), so dict_pub_all was never actually inspected.
for key, value in dict_pub_all.items():
    if isinstance(key, str) and "Adju" in key:
        print(key)

Office of the Schools Adjudicator
Groceries Code Adjudicator
Pubs Code Adjudicator


In [69]:
# Column names available in clean_content.
clean_content.columns

Index(['base_path', 'content_id', 'content_purpose_document_supertype',
       'content_purpose_subgroup', 'content_purpose_supergroup', 'description',
       'details', 'document_type', 'email_document_supertype',
       'first_published_at', 'government_document_supertype', 'locale',
       'navigation_document_supertype', 'public_updated_at', 'publishing_app',
       'search_user_need_document_supertype', 'title', 'updated_at',
       'user_journey_document_supertype', 'document_type_gp',
       'primary_publishing_organisation', 'body', 'combined_text', 'variable',
       'taxon_id'],
      dtype='object')

In [70]:
# Load the raw content items from the gzipped Table Schema JSON export.
#
# Only the non-default arguments are passed:
#   * `numpy=` was removed in pandas 2.0, so passing it raises a TypeError there;
#   * recent pandas rejects dtype=True and convert_axes=True when orient='table',
#     because the embedded schema already defines the column types;
#   * typ, convert_dates, keep_default_dates, precise_float and date_unit were all
#     being set to their pandas defaults.
content = pd.read_json(
    os.path.join(RESULTS_DIR, 'content.json.gz'),
    compression='gzip',
    orient='table',
)

In [71]:
# Inspect the 'links' value of the first content item to see its structure.
content['links'].iloc[0]

{'available_translations': [{'api_path': '/api/content/government/news/uk-reaffirms-commitment-to-work-with-honduras-after-referendum-result',
   'api_url': 'https://www-origin.integration.publishing.service.gov.uk/api/content/government/news/uk-reaffirms-commitment-to-work-with-honduras-after-referendum-result',
   'base_path': '/government/news/uk-reaffirms-commitment-to-work-with-honduras-after-referendum-result',
   'content_id': 'fa30b2ae-44b7-4f71-88ca-7e4f432d9c17',
   'description': 'Nearly a month after the UK voted to leave the European Union (EU), the British Ambassador to Honduras, Carolyn Davidson, reiterated the UK’s commitment to continue working with Honduras on a series of shared goals.',
   'document_type': 'world_news_story',
   'links': {},
   'locale': 'en',
   'public_updated_at': '2016-08-16T19:48:45Z',
   'schema_name': 'news_article',
   'title': 'UK reaffirms commitment to work with Honduras after referendum result',
   'web_url': 'https://www-origin.integrati

In [104]:
def org_mapper(x_links):
    """Return the title of the first organisation listed in a content item's links.

    Parameters
    ----------
    x_links : dict
        The item's ``links`` value. An ``'organisations'`` entry, when present,
        is a list of dicts that each carry a ``'title'``.

    Returns
    -------
    str or None
        Title of the first organisation, or ``None`` when ``x_links`` is not a
        dict (e.g. a missing value) or lists no organisations.
    """
    # Missing values arrive as NaN floats, which do not support `in` / `.get`.
    if not isinstance(x_links, dict):
        return None
    organisations = x_links.get('organisations')
    # Covers an absent key as well as an empty list, which used to raise IndexError.
    if not organisations:
        return None
    return organisations[0]['title']
    
def pub_mapper(x_links):
    """Return the title of a content item's primary publishing organisation.

    Parameters
    ----------
    x_links : dict
        The item's ``links`` value. A ``'primary_publishing_organisation'``
        entry, when present, is a list of dicts that each carry a ``'title'``.

    Returns
    -------
    str or float
        Title of the first primary publishing organisation, or ``np.nan`` when
        ``x_links`` is not a dict or names no primary publisher.
    """
    # Missing values arrive as NaN floats, which do not support `in` / `.get`.
    if not isinstance(x_links, dict):
        return np.nan
    publishers = x_links.get('primary_publishing_organisation')
    # Covers an absent key as well as an empty list, which used to raise IndexError.
    if not publishers:
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
        return np.nan
    return publishers[0]['title']

In [105]:
# Derive both organisation columns from each item's 'links' value.
links = content['links']
content['organisations'] = links.map(org_mapper)
content['primary_publishing_organisation'] = links.map(pub_mapper)

In [106]:
# Item counts per organisation title for each derived column (both are pandas Series).
dict_organisations, prim_organisations = (
    content[column].value_counts()
    for column in ('organisations', 'primary_publishing_organisation')
)

In [108]:
# Number of distinct primary publishing organisations found in content.
len(prim_organisations)

426

In [101]:
# Report whether each organisation of interest occurs in the 'organisations' counts.
for item in list_org:
    is_known = item in dict_organisations.index
    print(item, is_known)

Valuation Office Agency True
HM Revenue & Customs True
District Valuer Services (DVS) True
The Adjudicator’s Office False


In [116]:
# Per organisation: rows where it is the mapped 'organisations' value plus rows where it
# is the primary publishing organisation. A row matching both is counted twice.
for org in list_org:
    n_as_organisation = len(content.loc[content['organisations'] == org])
    n_as_primary = len(content.loc[content['primary_publishing_organisation'] == org])
    sum_org = n_as_organisation + n_as_primary
    print(org, ":", sum_org)

Valuation Office Agency : 785
HM Revenue & Customs : 14061
District Valuer Services (DVS) : 14


In [126]:
# Document types of the items that have neither a primary publishing organisation
# nor an 'organisations' value.
no_primary = content['primary_publishing_organisation'].isnull()
no_organisation = content['organisations'].isnull()
content.loc[no_primary & no_organisation, 'document_type'].value_counts()

world_news_story                        8716
placeholder_person                      1368
person                                  1352
redirect                                1223
organisation                             998
working_group                            603
worldwide_organisation                   307
esi_fund                                 304
taxon                                    223
placeholder_world_location_news_page     222
placeholder_ministerial_role              99
placeholder_policy_area                   42
placeholder_topical_event                 31
topical_event                             23
ministerial_role                          20
take_part                                 17
transaction                               17
licence                                   12
guide                                     11
special_route                             10
news_story                                 9
answer                                     7
finder    

In [None]:
Primary Publishing Organisation

Valuation Office Agency : 374
HM Revenue & Customs : 6645
District Valuer Services (DVS) : 8
    
Organisations

Valuation Office Agency : 411
HM Revenue & Customs : 7416
District Valuer Services (DVS) : 6
    
Total (Primary Publishing Organisation + Organisations)

Valuation Office Agency : 785
HM Revenue & Customs : 14061
District Valuer Services (DVS) : 14