In [2]:
import numpy as np
import pandas as pd
import os
import re
import yaml
# fuzz is used to compare TWO strings
from fuzzywuzzy import fuzz

In [3]:
# Folder holding this run's input and output files, read from the DATADIR
# environment variable (None when the variable is not set).
DATADIR = os.environ.get('DATADIR')
DATADIR

'/data/2018-03-27'

In [4]:
# Record which data folder this model version was run against.
run_banner = "model v2.0.0 was run on data dated {}".format(DATADIR)
print(run_banner)

model v2.0.0 was run on data dated /data/2018-03-27


## 1. Read in data

In [6]:
# Load the gzipped unlabelled-predictions file; every column is kept as a raw
# object so that no dtype inference happens on read.
unlabelled = pd.read_csv(
    os.path.join(DATADIR, 'unlabelled_predictions_meta.csv.gz'),
    compression='gzip',
    dtype=object,
)

In [7]:
# One row per content_id (the first occurrence is kept), as an independent copy.
unlabelled_unique_content = unlabelled.drop_duplicates(
    subset='content_id',
    keep='first',
).copy()

In [31]:
# Load the publishing-organisation -> taxon bulk mapping.
# The last line of the file is skipped; skipfooter needs the python parser.
prim_pub_map = pd.read_csv(
    os.path.join(DATADIR, 'bulkmap_pub_org_to_taxon.csv'),
    engine='python',
    skipfooter=1,
    dtype=object,
)

In [9]:
# Load the gzipped taxon table, keeping every column as a raw object.
taxons = pd.read_csv(
    os.path.join(DATADIR, 'clean_taxons.csv.gz'),
    compression='gzip',
    dtype=object,
)

In [10]:
# Preview the first rows of the taxon table.
taxons.head()

Unnamed: 0,base_path,content_id,taxon_name,level1,level2tax_id,level3tax_id,level4tax_id,level1taxon,level2taxon,level3taxon,level4taxon,level5taxon
0,/transport/all,a4038b29-b332-4f13-98b1-1c9709e216bc,Transport,,,,,Transport,,,,
1,/defence,e491505c-77ae-45b2-84be-8c94b94f6a2b,Defence,,,,,Defence,,,,
2,/life-circumstances,20086ead-41fc-49cf-8a62-d4e1126f41fc,Life circumstances,,,,,Life circumstances,,,,
3,/money,6acc9db4-780e-4a46-92b4-1812e3c2c48a,Money,,,,,Money,,,,
4,/government/all,e48ab80a-de80-4e83-bf59-26316856a5f9,Government,,,,,Government,,,,


In [11]:
# Distinct taxon names, in order of first appearance in the taxon table.
taxon_names = pd.unique(taxons['taxon_name'])

In [12]:
# Number of distinct taxon names.
taxon_names.shape

(2471,)

In [32]:
# sort out the fuzzy matching for taxon name
prim_pub_map['taxon_name'] = ''

for taxon_name in taxon_names:
    for index, row in prim_pub_map.iterrows():
        if row['Bulk Tag to']:
            fuzzy = row['Bulk Tag to']
        
            score = fuzz.token_sort_ratio(fuzzy, taxon_name)
        
            if score > 90:
                
                row['taxon_name'] = taxon_name
            

In [14]:
# Lookup from taxon name to taxon content_id; when a name occurs more than
# once, the content_id from its last row is the one kept.
taxon_name_id_dict = {
    name: content_id
    for name, content_id in zip(taxons['taxon_name'], taxons['content_id'])
}

In [33]:
# Translate each matched taxon name into its content_id; names that are not
# keys of the lookup (including '') become NaN.
matched_taxon_ids = prim_pub_map['taxon_name'].map(taxon_name_id_dict)
prim_pub_map['taxon_id'] = matched_taxon_ids.copy()

In [17]:
def strip_prim_org(x):
    """Extract the organisation name from a serialised key=>value string.

    The text after the first "=>" (up to any second "=>") is returned with
    every double quote and closing brace removed. Presumably the raw values
    look like '{"title"=>"Some Organisation"}' -- confirm against the data.

    Parameters
    ----------
    x : str
        Raw primary_publishing_organisation value.

    Returns
    -------
    str
        The cleaned organisation name. '' is returned for an empty string or
        a non-string value (None/NaN). A value with no "=>" separator is
        returned whole, with quotes and braces removed, instead of raising
        IndexError.
    """
    # Missing values are treated the same as an empty string.
    if not isinstance(x, str) or x == '':
        return ''

    parts = x.split("=>")
    # Without a separator there is no second piece, so clean the whole value.
    value = parts[1] if len(parts) > 1 else parts[0]
    return re.sub('\"|}', "", value)

In [34]:
# Replace missing organisation values with '' so the string clean-up can run
# on every row.
prim_pub_map['primary_publishing_organisation'] = (
    prim_pub_map['primary_publishing_organisation']
    .fillna(value='')
    .copy()
)

In [35]:
# Reduce each raw organisation value to the bare organisation name.
prim_pub_map['primary_publishing_organisation'] = (
    prim_pub_map['primary_publishing_organisation'].map(strip_prim_org)
)

In [36]:
# Preview the mapping after name clean-up and taxon matching.
prim_pub_map.head()

Unnamed: 0,primary_publishing_organisation,COUNTA of primary_publishing_organisation,Bulk Tag to,Level,taxon_name,taxon_id
0,,0,,,,
1,British Cattle Movement Service,1,,,,
2,Building Regulations Advisory Committee,1,Business regulation,Level 3 (Business),Business regulation,33bc0eed-62c7-4b0b-9a93-626c9e10c025
3,Commonwealth Scholarship Commission in the UK,1,,,,
4,Copyright Tribunal,1,Copyright,Level 3 (Crime Justice and Law),Copyright,7b2a45e2-2f20-4331-a2b6-de512b007a52


In [37]:
# Lookup from cleaned organisation name to its matched taxon name; when an
# organisation occurs more than once, the value from its last row is kept.
pub_taxon_dict = {
    organisation: taxon_name
    for organisation, taxon_name in zip(
        prim_pub_map['primary_publishing_organisation'],
        prim_pub_map['taxon_name'],
    )
}

In [38]:
# Look up the bulk-tag taxon for each content item through its publishing
# organisation; organisations that are not in the lookup give NaN.
unlabelled_unique_content['taxon2label_bulk'] = (
    unlabelled_unique_content['primary_publishing_organisation']
    .map(pub_taxon_dict)
)

In [39]:
# Size of the subset whose organisation was not found in the bulk lookup.
unlabelled_unique_content.loc[
    unlabelled_unique_content['taxon2label_bulk'].isnull()
].shape

(15161, 18)

In [40]:
# Size of the subset whose organisation was found in the bulk lookup.
unlabelled_unique_content.loc[
    unlabelled_unique_content['taxon2label_bulk'].notnull()
].shape

(70273, 18)

In [41]:
# Content items that received a bulk-tag lookup value, as an independent copy.
bulk_tags = unlabelled_unique_content.loc[
    unlabelled_unique_content['taxon2label_bulk'].notnull()
].copy()

In [42]:
# Use the bulk-mapped taxon as the label for these rows, replacing whatever
# 'taxon2label' held before.
bulk_tags['taxon2label'] = bulk_tags['taxon2label_bulk']

In [43]:
# Blank out both probability columns on the bulk-tagged rows.
bulk_tags['prob'] = bulk_tags['prob_cat'] = np.nan

In [44]:
# Mark the provenance of these tags so they can be told apart after the concat.
bulk_tags['how_tagged'] = 'bulk_tag'

In [54]:
# Rows and columns of the bulk-tag frame.
bulk_tags.shape

(70273, 19)

### Add to algorithm tags

In [47]:
# 'prob' was read as text; convert it to numbers, then keep only the
# predictions with probability strictly above 0.5.
unlabelled['prob'] = pd.to_numeric(unlabelled['prob'])
predictions = unlabelled.loc[unlabelled['prob'].gt(0.5)]

In [51]:
# Drop every prediction labelled 'UK economy'; work on an independent copy.
no_uk_economy = predictions.loc[
    predictions['taxon2label'].ne('UK economy')
].copy()

In [52]:
# Mark the provenance of these tags so they can be told apart after the concat.
no_uk_economy['how_tagged'] = 'algorithm_v2.0.0'

In [55]:
# Rows and columns of the algorithm-tag frame.
no_uk_economy.shape

(88725, 18)

### concatenate bulk tags to algorithm tags
These will go through a rake task to remove superfluous ancestors

In [65]:
# Stack the two tag sources row-wise, bulk tags first.
bulk_and_algorithm_tags = pd.concat(
    objs=[bulk_tags, no_uk_economy],
    axis=0,
)

In [66]:
# Rows and columns straight after concatenation.
bulk_and_algorithm_tags.shape

(158998, 19)

In [67]:
# Keep a single row per (content_id, taxon2label) pair; the first occurrence
# is the one retained.
bulk_and_algorithm_tags = bulk_and_algorithm_tags.drop_duplicates(
    subset=['content_id', 'taxon2label'],
    keep='first',
).copy()

In [68]:
# Rename the label column to 'taxon_tag'; index labels are converted to
# strings at the same time.
bulk_and_algorithm_tags = bulk_and_algorithm_tags.rename(
    columns={"taxon2label": "taxon_tag"},
    index=str,
)

In [69]:
# Rows and columns after de-duplication and renaming.
bulk_and_algorithm_tags.shape

(150404, 19)

### get taxon id

In [74]:
# Translate each tag name into its taxon content_id; names that are not keys
# of the lookup become NaN.
bulk_and_algorithm_tags['taxon_id'] = bulk_and_algorithm_tags['taxon_tag'].map(taxon_name_id_dict)

### remove World tags

In [70]:
# Drop rows tagged with the literal label 'world_level1'.
bulk_and_algorithm_tags = bulk_and_algorithm_tags.loc[
    bulk_and_algorithm_tags['taxon_tag'].ne('world_level1')
].copy()

In [71]:
# Rows and columns after removing 'world_level1' tags.
bulk_and_algorithm_tags.shape

(150246, 19)

In [77]:
# Discard rows whose tag name has no taxon content_id.
bulk_and_algorithm_tags = bulk_and_algorithm_tags.dropna(
    subset=['taxon_id'],
).copy()

In [79]:
# Size of the identifier columns once rows without a taxon_id are gone.
bulk_and_algorithm_tags[['content_id', 'taxon_tag', 'taxon_id']].shape

(119890, 3)

In [89]:
# Preview the four columns that are written to the output file.
bulk_and_algorithm_tags[['content_id', 'taxon_tag', 'taxon_id', 'how_tagged']].head()

Unnamed: 0,content_id,taxon_tag,taxon_id,how_tagged
5972,a113dedd-4320-4186-b1af-888437c6aedb,Environment,71d37f3a-7c8c-4128-8763-2fd5b831b9b9,bulk_tag
5973,5d8d7f5f-7631-11e4-a3cb-005056011aef,International aid and development,9fb30a53-70fb-4f1c-878b-0064b202d1ba,bulk_tag
5974,5e2e074d-7631-11e4-a3cb-005056011aef,Health and social care,8124ead8-8ebc-4faf-88ad-dd5cbcc92ba8,bulk_tag
5976,5f50cd4e-7631-11e4-a3cb-005056011aef,Maritime,4a9ab4d7-0d03-4c61-9e16-47787cbf53cd,bulk_tag
5977,b60d9e65-dc51-4a5b-8fbc-b6b15652b6ba,World,91b8ef20-74e7-4552-880c-50e6d73c2ff9,bulk_tag


In [92]:
# Preview the rows that were tagged by the algorithm.
bulk_and_algorithm_tags.loc[
    bulk_and_algorithm_tags['how_tagged'].eq('algorithm_v2.0.0')
].head()

Unnamed: 0,base_path,content_id,description,document_type,document_type_gp,first_published_at,how_tagged,level1taxon,locale,primary_publishing_organisation,prob,prob_cat,publishing_app,taxon_tag,taxon2label_bulk,taxon_id,title,untagged_type,updated_at
13121,/government/news/transformation-of-birmingham-...,51f13e1a-ca7c-4b5d-9b0c-123a6e0d9480,after an investment of £8.12m by hmcts birming...,press_release,news_and_announcements,2018-03-01 11:00:18.000,algorithm_v2.0.0,,en,HM Courts & Tribunals Service,0.961648,>=0.8,whitehall,Administrative justice reform,,66f7bd95-7e8b-4f33-ad7f-77a13ae92d49,transformation of birmingham civil justice cen...,untagged,2018-03-12 16:04:23.928
16867,/government/news/hm-courts-tribunals-service-l...,ad7af755-4ae4-4644-a957-a949bf1cccec,hmcts will today (18 january 2018) launch a te...,press_release,news_and_announcements,2018-01-19 10:47:26.000,algorithm_v2.0.0,,en,HM Courts & Tribunals Service,0.552992,0.5-0.59,whitehall,Administrative justice reform,,66f7bd95-7e8b-4f33-ad7f-77a13ae92d49,hm courts & tribunals service launches project...,untagged,2018-03-12 16:09:18.338
27701,/government/news/civilcrime-news-more-dates-fo...,2d6cf199-8bac-4dca-9446-b97f7b5d05e9,new dates announced in hm courts and tribunals...,news_story,news_and_announcements,2018-01-10 15:43:00.000,algorithm_v2.0.0,,en,Legal Aid Agency,0.743616,0.7-0.79,whitehall,Administrative justice reform,,66f7bd95-7e8b-4f33-ad7f-77a13ae92d49,civil/crime news: more dates for hmcts roadshows,untagged,2018-01-16 13:33:44.968
47881,/government/publications/appeal-a-judicial-rev...,04ff3b09-ce00-48a0-aa05-7c99e03ddfdc,use this index when submitting a 'core bundle'...,form,service,2016-08-31 23:00:00.000,algorithm_v2.0.0,,en,HM Courts & Tribunals Service,0.831607,>=0.8,whitehall,Administrative justice reform,,66f7bd95-7e8b-4f33-ad7f-77a13ae92d49,appeal a judicial review from the administrati...,untagged,2018-03-21 15:11:25.946
48071,/government/publications/appeal-an-administrat...,cd6e19e9-f652-49be-b665-43aaeab48cb7,use this index when submitting a 'core bundle'...,form,service,2016-08-31 23:00:00.000,algorithm_v2.0.0,,en,HM Courts & Tribunals Service,0.604943,0.6-0.69,whitehall,Administrative justice reform,,66f7bd95-7e8b-4f33-ad7f-77a13ae92d49,appeal an administrative judicial review or fr...,untagged,2018-03-21 15:11:46.329


In [90]:
# Row count for each tagging source.
bulk_and_algorithm_tags.groupby('how_tagged').size()

how_tagged
algorithm_v2.0.0    79973
bulk_tag            39917
dtype: int64

In [93]:
# Write the final tag list (identifier columns plus provenance) as a gzipped
# CSV into the data folder, without the index.
bulk_and_algorithm_tags[['content_id', 'taxon_tag', 'taxon_id', 'how_tagged']].to_csv(
    os.path.join(DATADIR, 'bulk_and_algorithm_tags.csv.gz'),
    index=False,
    compression='gzip',
)