In [None]:
import numpy as np
import pandas as pd
import os
import re
import yaml

In [None]:
# Read the data directory from the DATADIR environment variable (None when it is
# unset) and display the value so the reader can see what the kernel picked up.
DATADIR = os.getenv('DATADIR')
DATADIR

## 1. Read in data

In [None]:
# Path to the govuk-taxonomy-supervised-learning directory, relative to this notebook.
BASE_DIR = '../../../govuk-taxonomy-supervised-learning/'
# Honour the DATADIR environment variable when it is set; otherwise fall back to the
# repository's data directory (the value that used to be hard-coded here, which
# silently discarded the environment setting).
DATADIR = os.getenv('DATADIR') or '../../../govuk-taxonomy-supervised-learning/data'
# Dated snapshot directory that the data files below are read from.
DATADIR_NEW = os.path.join(DATADIR, '2018-03-12')
DATADIR_NEW

In [None]:
# Load new_content.csv.gz from the dated data directory. dtype=object stops pandas
# from inferring column types, so values arrive as raw text (or NaN when empty).
new_content = pd.read_csv(
    os.path.join(DATADIR_NEW, 'new_content.csv.gz'),
    compression='gzip',
    dtype=object,
)

In [None]:
# Load the "3. Filters" sheet export; every column is kept as an untyped object column.
filters = pd.read_csv(
    os.path.join(DATADIR_NEW, 'untagged_content - 3. Filters.csv'), dtype=object
)

In [None]:
# File name of the "count of content for each publishing org" sheet export.
mapping_path = 'untagged_content - 2. Count of content for each publishing org.csv'
# Read it without type inference so every cell stays raw text (or NaN when empty).
prim_pub_org_mapping = pd.read_csv(
    os.path.join(DATADIR_NEW, mapping_path),
    dtype=object,
)

In [None]:
# Config file (relative to BASE_DIR) listing the document types that are excluded
# from the topic taxonomy.
oos_doc_types_path = "python/config/document_types_excluded_from_the_topic_taxonomy.yml"
with open(os.path.join(BASE_DIR, oos_doc_types_path), 'r') as stream:
    # safe_load only builds plain Python objects. A bare yaml.load(stream) with no
    # Loader argument is deprecated since PyYAML 5.1 and raises TypeError in PyYAML 6.
    oos_doc_types = yaml.safe_load(stream)['document_types']

In [None]:
# Show the document types read from the exclusion config.
oos_doc_types

## Data clean up

In [None]:
def strip_prim_org(x):
    """Extract the organisation name from a ``key=>"value"}``-style cell.

    The text after the first ``=>`` (up to any second ``=>``) is returned with
    every double quote and closing brace removed.

    Parameters
    ----------
    x : str or missing value
        Raw cell content, e.g. ``'{"title"=>"Home Office"}'``.

    Returns
    -------
    str, or the input unchanged
        The cleaned name. Non-string inputs (such as the float NaN pandas uses
        for empty cells) are returned untouched instead of raising, and a
        string with no ``=>`` is only stripped of quotes and braces, so mapping
        the function over an already-cleaned column is harmless.
    """
    if not isinstance(x, str):
        # Empty cells arrive as NaN (a float) and have nothing to split.
        return x
    parts = x.split("=>")
    # With no separator present, treat the whole string as the value.
    value = parts[1] if len(parts) > 1 else parts[0]
    return re.sub('\"|}', "", value)

In [None]:
# Preview the first rows of the filters sheet.
filters.head()

In [None]:
# Distinct values of the bulk-taggable document-type column, as a NumPy array.
# unique() keeps NaN as one of the values if the column contains empty cells.
bulktag_doc_type = filters['Bulk Taggable Document types to filter out from unlabelled data'].unique()

In [None]:
# Show the distinct bulk-taggable document types.
bulktag_doc_type

In [None]:
# Remove the first row and the COUNTA column from the mapping, in place.
# NOTE: this cell is not idempotent. Re-running it drops a further row, and the
# second statement then raises KeyError because the column is already gone.
# Reload prim_pub_org_mapping before running it again.
prim_pub_org_mapping.drop(prim_pub_org_mapping.index[0],inplace=True)
prim_pub_org_mapping.drop("COUNTA of primary_publishing_organisation",axis=1,inplace=True)

In [None]:
# Keep only the rows that have a value in 'Bulk Tag to' (modifies the frame in place).
prim_pub_org_mapping.dropna(subset=['Bulk Tag to'],axis=0,inplace=True)

In [None]:
# Replace every float entry in 'Level' with "Level 1" and leave other values as they
# are. Presumably the only floats in this object-dtype column are the NaNs pandas
# uses for empty cells; verify against the sheet.
prim_pub_org_mapping['Level'] = prim_pub_org_mapping['Level'].map(
    lambda level: "Level 1" if isinstance(level, float) else level
)

In [None]:
# Reduce each raw organisation cell to the bare organisation name.
prim_pub_org_mapping['primary_publishing_organisation'] = (
    prim_pub_org_mapping['primary_publishing_organisation'].map(strip_prim_org)
)

In [None]:
# Renumber the rows 0..n-1 after the drops above; drop=True discards the old index
# instead of keeping it as a column.
prim_pub_org_mapping.reset_index(drop=True,inplace=True)

In [None]:
# Show the organisation mapping table in its current state.
prim_pub_org_mapping

In [None]:
# Reduce each raw organisation cell in the filters sheet to the bare organisation name.
filters['Bulktaggable Primary publishing organisations to filter from unlabelled data'] = (
    filters['Bulktaggable Primary publishing organisations to filter from unlabelled data']
    .map(strip_prim_org)
)

In [None]:
# NumPy array of the organisation names in the mapping; used further down for
# isin() membership tests.
list_prim_org = prim_pub_org_mapping['primary_publishing_organisation'].values

In [None]:
# Number of organisation entries in list_prim_org.
len(list_prim_org)

## Load in predictions

In [None]:
# Load the labelled level-2 data without type inference.
labelled_level2 = pd.read_csv(
    os.path.join(DATADIR_NEW, 'labelled_level2.csv.gz'),
    compression='gzip',
    dtype=object,
)

# Give every row whose level-1 taxon is 'World' a dedicated level-2 label, in case
# items not identified through doc type in clean_content are still present.
labelled_level2.loc[labelled_level2['level1taxon'].eq('World'), 'level2taxon'] = 'world_level1'

# Turn the level-2 labels into a categorical so each distinct label has an integer code.
labelled_level2['level2taxon'] = labelled_level2['level2taxon'].astype('category')

# Category codes start at 0 (and are -1 for missing labels); shift by one so real
# labels are numbered 1..N and missing ones map to 0.
labelled_level2['level2taxon_code'] = labelled_level2['level2taxon'].cat.codes + 1

# Lookup from numeric taxon code to its string label, for use in model evaluation.
labels_index = dict(
    zip(labelled_level2['level2taxon_code'], labelled_level2['level2taxon'])
)

In [None]:
# Only labels_index is used below; drop the frame's name so its memory can be reclaimed.
del labelled_level2

In [None]:
# File holding the model's predicted probabilities for the new content.
prediction_path = "new_predictions_1726_1203_.csv.gz"
# Read without type inference; the values are converted to numbers after reshaping.
new_pred = pd.read_csv(
    os.path.join(DATADIR_NEW, prediction_path),
    compression='gzip',
    dtype=object,
)

In [None]:
# np.load on an .npz archive returns a lazy, dict-like NpzFile: each array is read
# from disk when its key is accessed.
# NOTE(review): if 'content_id' was saved as an object array, recent NumPy versions
# need allow_pickle=True to read it — confirm against how the file was written.
new_arrays = np.load(os.path.join(DATADIR_NEW, 'new_arrays.npz'))

In [None]:
# Sanity check: the prediction table and the content_id array should be the same length.
print(
    "PREDICTION DATAFRAME", len(new_pred),
    "NEW ARRAY", len(new_arrays['content_id']),
)

In [None]:
# Attach the content ids to the prediction rows by position. Only the lengths are
# compared, so this relies on both having been produced from the same data in the
# same order.
if new_arrays['content_id'].shape[0] == new_pred.shape[0]:
    new_pred['content_id'] = new_arrays['content_id']
else:
    # The previous message named a 'true_train' variable that does not exist in this
    # notebook; report the objects actually compared, their sizes, and the consequence.
    print(
        "warning: new_pred has {} rows but new_arrays['content_id'] has {} entries; "
        "they may not originate from same data, so content_id was not attached".format(
            new_pred.shape[0], new_arrays['content_id'].shape[0]
        )
    )

In [None]:
# Reshape from one column per taxon to long format: one row per
# (content_id, taxon2) pair, with that pair's value in 'prob'.
newprob_by_id = new_pred.melt(
    id_vars=['content_id'],
    var_name='taxon2',
    value_name='prob',
)

In [None]:
# (rows, columns) of the long-format probability table.
newprob_by_id.shape

In [None]:
# The predictions were read with dtype=object, so both the probabilities and the
# melted column names arrive as strings; convert them to numbers. pd.to_numeric
# raises on any value it cannot parse.
newprob_by_id['prob'] = pd.to_numeric(newprob_by_id['prob'])
newprob_by_id['taxon2'] = pd.to_numeric(newprob_by_id['taxon2'])

In [None]:
# Translate numeric taxon codes into string labels via labels_index; a code with no
# entry in the dict becomes NaN.
newprob_by_id['taxon2label'] = newprob_by_id['taxon2'].map(labels_index)

In [None]:
# Keep only the (content_id, taxon) pairs whose probability is above 0.5.
predictions = newprob_by_id.loc[newprob_by_id['prob'] > 0.5]

In [None]:
# Attach the content metadata to every retained prediction.
predictions_meta = pd.merge(
    left=predictions,
    right=new_content,
    on='content_id',
    # Left join: every prediction row is kept even if its content_id has no metadata.
    how='left',
    # Adds a '_merge' column recording whether each row matched ('both') or not ('left_only').
    indicator=True, 
    # NOTE(review): pandas accepts 'm:m' but performs no key-uniqueness check for it.
    # If content_id is expected to be unique in new_content, 'm:1' would enforce that.
    validate='m:m'
)

In [None]:
# Spot check: how many predictions carry the "Administrative justice reform" label.
predictions_meta.loc[
    predictions_meta['taxon2label'] == "Administrative justice reform",
    'primary_publishing_organisation',
].shape

In [None]:
# Save the predictions with metadata as a gzipped CSV in the dated data directory;
# index=False leaves the row index out of the file.
predictions_meta.to_csv(os.path.join(DATADIR_NEW, 'predictions_meta.csv.gz'),compression='gzip',index=False)

## Compare predicted taxons with the publishing-organisation mapping

In [None]:
# Predictions whose primary publishing organisation appears in the mapping. An
# independent (deep) copy is taken so new columns can be added safely.
prim_pub_filter_pred = predictions_meta[
    predictions_meta['primary_publishing_organisation'].isin(list_prim_org)
].copy()

In [None]:
# (rows, columns) of the predictions restricted to mapped organisations.
prim_pub_filter_pred.shape

In [None]:
# Number of mapping rows for each 'Level' value.
prim_pub_org_mapping['Level'].value_counts()

In [None]:
# Split the organisation mapping into one frame per 'Level' value.
lvl1_org_df = prim_pub_org_mapping[prim_pub_org_mapping['Level'].eq('Level 1')]
lvl2_org_df = prim_pub_org_mapping[prim_pub_org_mapping['Level'].eq('Level 2')]
lvl3_org_df = prim_pub_org_mapping[prim_pub_org_mapping['Level'].eq('Level 3')]

In [None]:
# Inspect the mapping row(s) for a single organisation.
prim_pub_org_mapping.loc[prim_pub_org_mapping['primary_publishing_organisation'] == "HM Courts & Tribunals Service"]

In [None]:
# Build a lookup from the level-2 mapping rows. itertuples() yields
# (index, first column, second column, ...), so the key is the first column and the
# value is the second. The access is positional and depends on column order —
# presumably organisation name -> 'Bulk Tag to' taxon; verify against the frame.
org_taxon_2 = {row[1]: row[2] for row in lvl2_org_df.itertuples()}

In [None]:
def lvl2mapping(x):
    """Look up the value mapped to ``x`` in the ``org_taxon_2`` dictionary.

    Parameters
    ----------
    x : hashable
        Key to look up; the notebook passes primary publishing organisation names.

    Returns
    -------
    object
        ``org_taxon_2[x]`` when ``x`` is a key of the dictionary, otherwise ``np.nan``.
    """
    # np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0
    # and raises AttributeError there.
    return org_taxon_2.get(x, np.nan)

In [None]:
# Add the taxon that the organisation mapping assigns to each row's publisher
# (NaN when the organisation has no entry in the lookup).
prim_pub_filter_pred['org_mapped_taxon2label'] = (
    prim_pub_filter_pred['primary_publishing_organisation'].map(lvl2mapping)
)

In [None]:
# (rows, columns) of prim_pub_filter_pred at this point.
prim_pub_filter_pred.shape

In [None]:
# Rows where the organisation-mapped taxon differs from the predicted one. Rows with
# a missing mapped taxon also pass this test, because NaN never compares equal.
not_same_taxon_org = prim_pub_filter_pred[
    prim_pub_filter_pred['org_mapped_taxon2label'].ne(prim_pub_filter_pred['taxon2label'])
].copy()

In [None]:
# Discard rows that have no organisation-mapped taxon, leaving only rows where both
# labels exist and disagree (modifies the frame in place).
not_same_taxon_org.dropna(subset=['org_mapped_taxon2label'],axis=0, inplace = True)

In [None]:
# Export the disagreeing rows, restricted to the columns needed for review, to a CSV
# in the current working directory.
not_same_taxon_org[
    [
        'base_path',
        'title',
        'combined_text',
        'primary_publishing_organisation',
        'taxon2label',
        'org_mapped_taxon2label',
    ]
].to_csv("taxon_disagreement_level2.csv", index=False)

In [None]:
# Base paths of the disagreeing items whose document type is "news_story".
not_same_taxon_org.loc[
    not_same_taxon_org['document_type'] == "news_story", 'base_path'
].values

## 2. Filter by out of scope `document_type`.

In [None]:
# Keep the content whose document_type is NOT one of the out-of-scope types.
# The filter is applied to the full new_content frame.
oos_doc_type_filtered = new_content.loc[~new_content['document_type'].isin(oos_doc_types)]

## 3. Filter by bulk-taggable `document_type`.

In [None]:
# NOTE: despite its name, bulk_doc_type holds the content whose document_type is NOT
# in bulktag_doc_type. The filter is applied to the full new_content frame, not to
# the result of the out-of-scope filter.
bulk_doc_type = new_content.loc[~new_content['document_type'].isin(bulktag_doc_type)]

## 4. Filter by `primary_publishing_organisation`.

In [None]:
# Keep the content whose primary publishing organisation is NOT in the mapping list.
# The filter is applied to the full new_content frame.
prim_pub_filtered = new_content.loc[~new_content['primary_publishing_organisation'].isin(list_prim_org)]