In [2]:
import pandas as pd
import numpy as np
import os

## Environment variables

In [3]:
# Root of the data tree comes from the DATADIR environment variable so that no
# machine-specific path is hard-coded in the notebook.
DATADIR = os.getenv('DATADIR')
if DATADIR is None:
    # Without this guard os.path.join(None, ...) fails with an opaque TypeError.
    raise RuntimeError(
        "The DATADIR environment variable is not set; "
        "export it before starting the notebook kernel."
    )

# All inputs read below live in the dated snapshot directory under DATADIR.
RESULTS_DIR = os.path.join(DATADIR, "2018-03-12")
RESULTS_DIR

'/Users/felisialoukou/Documents/govuk-taxonomy-supervised-learning/data/2018-03-12'

## Get some data about taxons/content

In [4]:
# Load the labelled level-2 data set; dtype=object keeps every column as read,
# with no numeric/date inference.
labelled_level2_path = os.path.join(RESULTS_DIR, 'labelled_level2.csv.gz')
labelled_level2 = pd.read_csv(labelled_level2_path, compression='gzip', dtype=object)

In [5]:
# Rows whose level-1 taxon is 'World' get the synthetic level-2 label
# 'world_level1', in case such items were not already filtered out by
# document type when clean_content was built.
is_world = labelled_level2['level1taxon'].eq('World')
labelled_level2.loc[is_world, 'level2taxon'] = 'world_level1'

In [6]:
# Turn the level-2 taxon labels into a pandas categorical column.
level2_as_category = labelled_level2['level2taxon'].astype('category')
labelled_level2['level2taxon'] = level2_as_category

# Category codes are zero-based; shift by one so the numeric targets start at 1.
# NOTE(review): rows with a missing level2taxon would presumably end up with
# code 0 (cat.codes uses -1 for missing values) - confirm none are expected.
labelled_level2['level2taxon_code'] = labelled_level2['level2taxon'].cat.codes + 1

# Lookup from numeric taxon code back to its string label, for model evaluation.
labels_index = {
    code: label
    for code, label in zip(labelled_level2['level2taxon_code'],
                           labelled_level2['level2taxon'])
}

In [7]:
# (rows, columns) of the labelled frame before duplicates on content_id are dropped.
labelled_level2.shape

(172345, 32)

In [8]:
# Keep only the first row seen for each content_id.
labelled_level2 = labelled_level2.drop_duplicates(subset='content_id', keep='first')

In [9]:
# Number of labelled rows per primary publishing organisation.
# Despite the name, value_counts() returns a pandas Series, not a dict.
dict_pub = labelled_level2['primary_publishing_organisation'].value_counts()

In [10]:
# Print every publisher name that contains the fragment "Adju".
# Non-string index entries are skipped.
for org_name in dict_pub.index:
    if not isinstance(org_name, str):
        continue
    if "Adju" in org_name:
        print(org_name)

Office of the Schools Adjudicator
Groceries Code Adjudicator
Pubs Code Adjudicator


In [11]:
# Organisations whose content is counted in the cells below.
list_org = ['Valuation Office Agency','HM Revenue & Customs','District Valuer Services (DVS)']

In [12]:
# List the available columns of the labelled frame.
labelled_level2.columns

Index(['base_path', 'content_id', 'content_purpose_document_supertype',
       'content_purpose_subgroup', 'content_purpose_supergroup', 'description',
       'details', 'document_type', 'email_document_supertype',
       'first_published_at', 'government_document_supertype', 'locale',
       'navigation_document_supertype', 'public_updated_at', 'publishing_app',
       'search_user_need_document_supertype', 'title', 'updated_at',
       'user_journey_document_supertype', 'document_type_gp',
       'primary_publishing_organisation', 'body', 'combined_text', 'taxon_id',
       'taxon_base_path', 'taxon_name', 'level1taxon', 'level2taxon',
       'level3taxon', 'level4taxon', 'level5taxon', 'level2taxon_code'],
      dtype='object')

In [13]:
# For each organisation of interest, count its labelled rows that have no
# level-3 taxon.
lacks_level3 = labelled_level2['level3taxon'].isnull()
for org in list_org:
    published_by_org = labelled_level2['primary_publishing_organisation'] == org
    n_rows = labelled_level2.loc[published_by_org & lacks_level3].shape[0]
    print(org, ":", n_rows)

Valuation Office Agency : 44
HM Revenue & Customs : 663
District Valuer Services (DVS) : 4


In [14]:
# Boolean mask: True where a row has no level-3 taxon.
labelled_level2['level3taxon'].isnull()

0          True
1          True
2          True
3          True
4          True
5          True
6          True
7          True
8          True
9          True
10         True
11         True
12         True
13         True
14         True
15         True
16         True
17         True
18         True
19         True
20         True
21         True
22         True
23         True
24         True
25         True
26         True
27         True
28         True
29         True
          ...  
172215    False
172216    False
172217    False
172218    False
172219    False
172220    False
172221    False
172222    False
172224    False
172225    False
172226    False
172228    False
172229    False
172230    False
172231    False
172232    False
172233    False
172234    False
172235    False
172236    False
172237    False
172238    False
172282    False
172283    False
172284    False
172285    False
172286    False
172299    False
172306    False
172307    False
Name: level3taxon, Lengt

In [15]:
# Text file parsed by the loop in the cell below; it sits directly under DATADIR.
path = os.path.join(DATADIR, "hm-revenue-customs_and_descendants.txt")

In [16]:
# NOTE(review): this list is not referenced by any later cell visible here.
lines = []

In [17]:
# Sum the leading number of every line reported as "lvl2" in the text file.
# A line qualifies when one of its all-digit tokens is followed by exactly four
# empty tokens (i.e. five consecutive spaces) before the next word or the end
# of the line. NOTE(review): presumably this spacing encodes the tree depth of
# the entry - confirm against the tool that produced the file.
total = 0
with open(path, 'r') as tree_file:
    for raw_line in tree_file:
        stripped = raw_line.strip("\n")
        # Splitting on a single space keeps runs of spaces as empty tokens.
        token_line = stripped.split(" ")
        for pos, tok in enumerate(token_line):
            if not tok.isdigit():
                continue
            # Length of the run of empty tokens directly after the number.
            gap = 0
            for following in token_line[pos + 1:]:
                if following != "":
                    break
                gap += 1
            if gap == 4:
                # The value added is the first whitespace-separated field of
                # the line, not necessarily the token that matched.
                something = stripped.split()
                print("found lvl2", token_line, something[0])
                total += int(something[0])
                break
total

found lvl2 ['', '', '', '1', '', '', '', '', 'Welfare', 'reform'] 1
found lvl2 ['', '', '38', '', '', '', '', 'Tax', 'credits'] 38
found lvl2 ['', '', '26', '', '', '', '', 'Child', 'Benefit', '(welfare', 'theme)'] 26
found lvl2 ['', '', '', '1', '', '', '', '', 'Arts', 'and', 'culture'] 1
found lvl2 ['', '', '', '2', '', '', '', '', 'Community', 'and', 'society'] 2
found lvl2 ['', '', '', '1', '', '', '', '', 'Sports', 'and', 'leisure'] 1
found lvl2 ['', '', '', '2', '', '', '', '', 'Rural', 'and', 'countryside'] 2
found lvl2 ['', '', '', '0', '', '', '', '', 'Oil', 'and', 'gas'] 0
found lvl2 ['', '', '40', '', '', '', '', 'Climate', 'change', 'and', 'energy'] 40
found lvl2 ['', '', '', '1', '', '', '', '', 'Food', 'and', 'farming'] 1
found lvl2 ['', '', '', '1', '', '', '', '', 'Freight', 'and', 'cargo'] 1
found lvl2 ['', '', '', '1', '', '', '', '', 'Transport', 'security'] 1
found lvl2 ['', '', '', '0', '', '', '', '', 'Driving', 'and', 'road', 'transport'] 0
found lvl2 ['', '', ''

653

In [None]:
# Load the cleaned content data set; dtype=object keeps every column as read,
# with no numeric/date inference.
clean_content_path = os.path.join(RESULTS_DIR, 'clean_content.csv.gz')
clean_content = pd.read_csv(clean_content_path, compression='gzip', dtype=object)

In [None]:
# Preview the first rows of the cleaned content frame.
clean_content.head()

In [None]:
# Number of clean_content rows published by each organisation of interest.
publisher = clean_content['primary_publishing_organisation']
for org in list_org:
    n_rows = clean_content.loc[publisher == org].shape[0]
    print(org, ":", n_rows)

In [None]:
# Number of clean_content rows per primary publishing organisation.
dict_pub_all = clean_content['primary_publishing_organisation'].value_counts()

# Print every publisher name in clean_content that contains "Adju".
# The loop previously iterated dict_pub (built from labelled_level2), which
# left dict_pub_all unused and merely repeated the earlier labelled-data check.
for key, value in dict_pub_all.items():
    if isinstance(key, str) and "Adju" in key:
        print(key)

In [None]:
# List the available columns of the cleaned content frame.
clean_content.columns

In [None]:
# Load the gzip-compressed content export, stored in pandas' JSON "table" orient.
# Only non-default arguments are passed. The previous call also spelled out
# dtype=True and convert_axes=True, which current pandas rejects together with
# orient='table', and numpy=False, an argument that no longer exists.
# All the removed arguments were at their default values.
content = pd.read_json(
    os.path.join(RESULTS_DIR, 'content.json.gz'),
    compression='gzip',
    orient='table',
)

In [None]:
# Inspect the links entry of the first row to see its structure.
content['links'].iloc[0]

In [None]:
def org_mapper(x_links):
    """Return the title of the first organisation listed in ``x_links``.

    Parameters
    ----------
    x_links : mapping
        Links entry of one content item; looked up under the
        ``'organisations'`` key.

    Returns
    -------
    str or None
        The ``'title'`` of the first listed organisation, or ``None`` when the
        key is absent or its list is empty.
    """
    # The truthiness check guards against an empty list, which would otherwise
    # raise IndexError on the [0] lookup.
    if 'organisations' in x_links and x_links['organisations']:
        return x_links['organisations'][0]['title']
    return None
    
def pub_mapper(x_links):
    """Return the title of the first primary publishing organisation in ``x_links``.

    Parameters
    ----------
    x_links : mapping
        Links entry of one content item; looked up under the
        ``'primary_publishing_organisation'`` key.

    Returns
    -------
    str or float
        The ``'title'`` of the first listed entry, or ``np.nan`` when the key
        is absent or its list is empty.
    """
    # The truthiness check guards against an empty list, which would otherwise
    # raise IndexError on the [0] lookup.
    if 'primary_publishing_organisation' in x_links and x_links['primary_publishing_organisation']:
        return x_links['primary_publishing_organisation'][0]['title']
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    return np.nan

In [None]:
def pub_org_mapper(x_links, option):
    """Return the title of the first entry listed under ``option`` in ``x_links``.

    Parameters
    ----------
    x_links : mapping
        Links entry of one content item.
    option : str
        Key to look up, e.g. ``"organisations"``.

    Returns
    -------
    str or float
        The ``'title'`` of the first entry under ``option``, or ``np.nan`` when
        the key is absent or its list is empty.
    """
    # The truthiness check guards against an empty list, which would otherwise
    # raise IndexError on the [0] lookup.
    if option in x_links and x_links[option]:
        return x_links[option][0]['title']
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    return np.nan
    
# Title of the first listed organisation for each content item, taken from its
# links entry via pub_org_mapper.
content['organisations2'] = content['links'].map(
    lambda links: pub_org_mapper(links, option="organisations")
)

In [None]:
    
# Quick look at a handful of the derived organisation titles.
content['organisations2'][0:10]

In [None]:
# Derive both organisation columns from each item's links entry.
# 'organisations' must come from org_mapper (the 'organisations' key). It was
# previously filled with pub_mapper, which made it an exact copy of
# 'primary_publishing_organisation'.
content['organisations'] = content['links'].map(org_mapper)
content['primary_publishing_organisation'] = content['links'].map(pub_mapper)

In [None]:
# How many content items carry each organisation title / primary publisher title.
# Both results are pandas Series (title -> count), despite the dict_ prefix.
dict_organisations = content['organisations'].value_counts()
prim_organisations = content['primary_publishing_organisation'].value_counts()

In [None]:
# Number of distinct primary publishing organisation titles.
len(prim_organisations)

In [None]:
# Report, for each organisation of interest, whether it occurs at all in the
# 'organisations' column.
for org_title in list_org:
    is_present = org_title in dict_organisations.index
    print(org_title, is_present)

In [None]:
# For each organisation of interest, add the number of rows naming it under
# 'organisations' to the number naming it as primary publisher.
# A row that matches in both columns is therefore counted twice.
for org in list_org:
    n_in_organisations = content.loc[content['organisations'] == org].shape[0]
    n_as_primary = content.loc[content['primary_publishing_organisation'] == org].shape[0]
    sum_org = n_in_organisations + n_as_primary
    print(org, ":", sum_org)

In [None]:
# Document types of the items that have neither a primary publishing
# organisation nor an organisation, with how often each type occurs.
no_primary_publisher = content['primary_publishing_organisation'].isnull()
no_organisation = content['organisations'].isnull()
content.loc[no_primary_publisher & no_organisation, 'document_type'].value_counts()

In [None]:
Primary Publishing Organisation

Valuation Office Agency : 374
HM Revenue & Customs : 6645
District Valuer Services (DVS) : 8
    
Organisations

Valuation Office Agency : 411
HM Revenue & Customs : 7416
District Valuer Services (DVS) : 6
    
Total (Primary Publishing Organisation + Organisations)

Valuation Office Agency : 785
HM Revenue & Customs : 14061
District Valuer Services (DVS) : 14