In [None]:
import numpy as np
import pandas as pd
import os
import re
import yaml
# fuzz is used to compare TWO strings
from fuzzywuzzy import fuzz

In [None]:
DATADIR = os.getenv('DATADIR')
DATADIR

In [3]:
print("model v2.0.0 was run on data dated {}".format(DATADIR))

model v2.0.0 was run on data dated /data/2018-03-27


## 1. Read in data

In [4]:
# BASE_DIR = '../../../govuk-taxonomy-supervised-learning/'
# DATADIR = '../../../govuk-taxonomy-supervised-learning/data'
# DATADIR_NEW = os.path.join(DATADIR, '2018-03-12')
# DATADIR_NEW

In [5]:
unlabelled = pd.read_csv(
    os.path.join(DATADIR, 'unlabelled_predictions_meta.csv.gz'),
    dtype=object,
    compression='gzip'
)

In [6]:
unlabelled_unique_content = unlabelled.drop_duplicates('content_id').copy()

In [7]:
prim_pub_map = pd.read_csv(
    os.path.join(DATADIR, 'bulkmap_pub_org_to_taxon.csv'),
    dtype=object,
    skipfooter=1
)

In [9]:
taxons = pd.read_csv(
    os.path.join(DATADIR, 'clean_taxons.csv.gz'),
    dtype=object,
    compression='gzip'
)

In [10]:
taxons.head()

Unnamed: 0,base_path,content_id,taxon_name,level1,level2tax_id,level3tax_id,level4tax_id,level1taxon,level2taxon,level3taxon,level4taxon,level5taxon
0,/transport/all,a4038b29-b332-4f13-98b1-1c9709e216bc,Transport,,,,,Transport,,,,
1,/defence,e491505c-77ae-45b2-84be-8c94b94f6a2b,Defence,,,,,Defence,,,,
2,/life-circumstances,20086ead-41fc-49cf-8a62-d4e1126f41fc,Life circumstances,,,,,Life circumstances,,,,
3,/money,6acc9db4-780e-4a46-92b4-1812e3c2c48a,Money,,,,,Money,,,,
4,/government/all,e48ab80a-de80-4e83-bf59-26316856a5f9,Government,,,,,Government,,,,


In [11]:
taxon_names = taxons['taxon_name'].unique()

In [12]:
taxon_names.shape

(2471,)

In [14]:
# sort out the fuzzy matching for taxon name
prim_pub_map['taxon_name'] = ''

for taxon_name in taxon_names:
    for index, row in prim_pub_map.iterrows():
        if row['Bulk Tag to']:
            fuzzy = row['Bulk Tag to']
        
            score = fuzz.token_sort_ratio(fuzzy, taxon_name)
        
            if score > 90:
                
                row['taxon_name'] = taxon_name
            

In [15]:
taxon_name_id_dict = dict(zip((taxons['taxon_name']),
                        taxons['content_id']))

In [16]:
prim_pub_map['taxon_id'] = prim_pub_map['taxon_name'].map(taxon_name_id_dict).copy()

In [17]:
# get rid of grand total row
prim_pub_map = prim_pub_map[:-1].copy()

In [18]:
def strip_prim_org(x):
    if x=='':
        return ''
    else:
        return re.sub('\"|}',"",x.split("=>")[1])

In [19]:
prim_pub_map['primary_publishing_organisation'] = prim_pub_map['primary_publishing_organisation'].fillna('').copy()

In [20]:
prim_pub_map['primary_publishing_organisation'] = prim_pub_map['primary_publishing_organisation'].\
                map(strip_prim_org)

In [21]:
prim_pub_map.head()

Unnamed: 0,primary_publishing_organisation,COUNTA of primary_publishing_organisation,Bulk Tag to,Level,taxon_name,taxon_id
0,,0,,,,
1,British Cattle Movement Service,1,,,,
2,Building Regulations Advisory Committee,1,Business regulation,Level 3 (Business),Business regulation,33bc0eed-62c7-4b0b-9a93-626c9e10c025
3,Commonwealth Scholarship Commission in the UK,1,,,,
4,Copyright Tribunal,1,Copyright,Level 3 (Crime Justice and Law),Copyright,7b2a45e2-2f20-4331-a2b6-de512b007a52


In [22]:
pub_taxon_dict = dict(zip((prim_pub_map['primary_publishing_organisation']),
                        prim_pub_map['taxon_name']))

In [23]:
unlabelled_unique_content['taxon2label_bulk'] = unlabelled_unique_content['primary_publishing_organisation'].map(pub_taxon_dict)

In [29]:
unlabelled_unique_content[unlabelled_unique_content['taxon2label_bulk'].isnull()].shape

(15161, 18)

In [30]:
unlabelled_unique_content[unlabelled_unique_content['taxon2label_bulk'].notnull()].shape

(70273, 18)

In [34]:
bulk_tags = unlabelled_unique_content[unlabelled_unique_content['taxon2label_bulk'].notnull()].copy()

In [35]:
bulk_tags['taxon2label'] = bulk_tags['taxon2label_bulk']

In [39]:
bulk_tags['prob'] = np.nan
bulk_tags['prob_cat'] = np.nan

In [40]:
bulk_tags['how_tagged'] = 'bulk_tag'

In [None]:
mapping_path = 'untagged_content - 2. Count of content for each publishing org.csv'
prim_pub_org_mapping = pd.read_csv(os.path.join(DATADIR_NEW,mapping_path),dtype=object)

In [None]:
oos_doc_types_path = "python/config/document_types_excluded_from_the_topic_taxonomy.yml"
with open(os.path.join(BASE_DIR,oos_doc_types_path), 'r') as stream:
    oos_doc_types = yaml.load(stream)['document_types']

In [None]:
oos_doc_types

## Data clean up

In [None]:
filters.head()

In [None]:
bulktag_doc_type = filters['Bulk Taggable Document types to filter out from unlabelled data'].unique()

In [None]:
bulktag_doc_type

In [None]:
prim_pub_org_mapping.drop(prim_pub_org_mapping.index[0],inplace=True)
prim_pub_org_mapping.drop("COUNTA of primary_publishing_organisation",axis=1,inplace=True)

In [None]:
prim_pub_org_mapping.dropna(subset=['Bulk Tag to'],axis=0,inplace=True)

In [None]:
prim_pub_org_mapping['Level'] = prim_pub_org_mapping['Level'].\
                map(lambda x: x if not isinstance(x,float) else "Level 1")

In [None]:
prim_pub_org_mapping['primary_publishing_organisation'] = prim_pub_org_mapping['primary_publishing_organisation'].\
                map(strip_prim_org)

In [None]:
prim_pub_org_mapping.reset_index(drop=True,inplace=True)

In [None]:
prim_pub_org_mapping

In [None]:
filters['Bulktaggable Primary publishing organisations to filter from unlabelled data'] = \
                filters['Bulktaggable Primary publishing organisations to filter from unlabelled data'].map(strip_prim_org)

In [None]:
list_prim_org = prim_pub_org_mapping['primary_publishing_organisation'].values

In [None]:
len(list_prim_org)

## Load in predictions

In [None]:
labelled_level2 = pd.read_csv(
    os.path.join(DATADIR_NEW, 'labelled_level2.csv.gz'),
    dtype=object,
    compression='gzip'
)# Create World taxon in case any items not identified 
# through doc type in clean_content are still present
labelled_level2.loc[labelled_level2['level1taxon'] == 'World', 'level2taxon'] = 'world_level1'

# creating categorical variable for level2taxons from values
labelled_level2['level2taxon'] = labelled_level2['level2taxon'].astype('category')

# Add 1 because of zero-indexing to get 1-number of level2taxons as numerical targets
labelled_level2['level2taxon_code'] = labelled_level2.level2taxon.astype('category').cat.codes + 1

# create dictionary of taxon category code to string label for use in model evaluation
labels_index = dict(zip((labelled_level2['level2taxon_code']),
                        labelled_level2['level2taxon']))

In [None]:
del labelled_level2

In [None]:
prediction_path = "new_predictions_1726_1203_.csv.gz"
new_pred = pd.read_csv(os.path.join(DATADIR_NEW, prediction_path), dtype=object, compression='gzip')

In [None]:
new_arrays = np.load(os.path.join(DATADIR_NEW, 'new_arrays.npz'))

In [None]:
print("PREDICTION DATAFRAME",new_pred.shape[0],"NEW ARRAY",len(new_arrays['content_id']))

In [None]:
if new_arrays['content_id'].shape[0] == new_pred.shape[0]:
    new_pred['content_id'] = new_arrays['content_id']
else:
    print("warning: true_train and content_id may not originate from same data")

In [None]:
newprob_by_id = pd.melt(new_pred, 
                        id_vars=['content_id'], var_name='taxon2', value_name='prob')

In [None]:
newprob_by_id.shape

In [None]:
newprob_by_id['prob'] = pd.to_numeric(newprob_by_id['prob'])
newprob_by_id['taxon2'] = pd.to_numeric(newprob_by_id['taxon2'])

In [None]:
newprob_by_id['taxon2label'] = newprob_by_id['taxon2'].map(labels_index)

In [None]:
predictions = newprob_by_id[newprob_by_id['prob'] > 0.5]

In [None]:
predictions_meta = pd.merge(
    left=predictions,
    right=new_content,
    on='content_id',
    how='left',
    indicator=True, 
    validate='m:m'
)

In [None]:
predictions_meta.loc[predictions_meta['taxon2label']=="Administrative justice reform"]['primary_publishing_organisation'].shape

In [None]:
predictions_meta.to_csv(os.path.join(DATADIR_NEW, 'predictions_meta.csv.gz'),compression='gzip',index=False)

## Compare results.

In [None]:
prim_pub_filter_pred = predictions_meta.loc[predictions_meta['primary_publishing_organisation'].isin(list_prim_org)].copy(deep=True)

In [None]:
prim_pub_filter_pred.shape

In [None]:
prim_pub_org_mapping['Level'].value_counts()

In [None]:
lvl1_org_df = prim_pub_org_mapping.loc[prim_pub_org_mapping['Level']=='Level 1']
lvl2_org_df = prim_pub_org_mapping.loc[prim_pub_org_mapping['Level']=='Level 2']
lvl3_org_df = prim_pub_org_mapping.loc[prim_pub_org_mapping['Level']=='Level 3']

In [None]:
prim_pub_org_mapping.loc[prim_pub_org_mapping['primary_publishing_organisation'] == "HM Courts & Tribunals Service"]

In [None]:
org_taxon_2 = {}
for item in lvl2_org_df.itertuples():
    org_taxon_2 [item[1]] = item[2]

In [None]:
def lvl2mapping(x):
    if x in org_taxon_2:
        return org_taxon_2[x]
    else:
        return np.NaN

In [None]:
prim_pub_filter_pred['org_mapped_taxon2label'] = prim_pub_filter_pred['primary_publishing_organisation'].\
                                map(lvl2mapping)

In [None]:
prim_pub_filter_pred.shape

In [None]:
not_same_taxon_org = prim_pub_filter_pred.loc[prim_pub_filter_pred['org_mapped_taxon2label']!=\
                                prim_pub_filter_pred['taxon2label']].copy(deep=True)

In [None]:
not_same_taxon_org.dropna(subset=['org_mapped_taxon2label'],axis=0, inplace = True)

In [None]:
not_same_taxon_org[['base_path','title','combined_text','primary_publishing_organisation','taxon2label','org_mapped_taxon2label']].\
                        to_csv("taxon_disagreement_level2.csv",index=False)

In [None]:
not_same_taxon_org.loc[not_same_taxon_org['document_type']=="news_story"]['base_path'].values

## 2. Filter by out of scope `document_type`.

In [None]:
oos_doc_type_filtered = new_content.loc[~new_content['document_type'].isin(oos_doc_types)]

## 3. Filter by bulk-taggable `document_type`.

In [None]:
bulk_doc_type = new_content.loc[~new_content['document_type'].isin(bulktag_doc_type)]

## 4. Filter by `primary_publishing_organisation`.

In [None]:
prim_pub_filtered = new_content.loc[~new_content['primary_publishing_organisation'].isin(list_prim_org)]