In [1]:
import pandas as pd
import numpy as np
import os

from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score 
from sklearn.metrics import precision_recall_fscore_support, classification_report

import matplotlib.pyplot as plt
%matplotlib inline

import functools

import h5py

In [2]:
P_THRESHOLD=0.5

### Which data

In [3]:
DATADIR = os.getenv('DATADIR')

In [4]:
print("model v2.0.0 was run on data dated {}".format(DATADIR))

model v2.0.0 was run on data dated /data/2018-03-27


### Get full content to enable total counts

In [5]:
full_content = pd.read_csv(
    os.path.join(DATADIR, 'full_content.csv.gz'),
    dtype=object,
    compression='gzip'
)

In [6]:
full_content.shape

(210154, 26)

In [7]:
full_content.columns

Index(['base_path', 'content_id', 'content_purpose_document_supertype',
       'content_purpose_subgroup', 'content_purpose_supergroup', 'description',
       'details', 'document_type', 'email_document_supertype',
       'first_published_at', 'government_document_supertype', 'locale',
       'navigation_document_supertype', 'public_updated_at', 'publishing_app',
       'publishing_scheduled_at', 'scheduled_publishing_delay_seconds',
       'search_user_need_document_supertype', 'title', 'updated_at',
       'user_journey_document_supertype', 'document_type_gp', 'taxons',
       'primary_publishing_organisation', 'body', 'combined_text'],
      dtype='object')

In [8]:
full_content.content_id.nunique()

210154

### Get mappings of taxon2 code to taxon2 string

In [9]:
labelled_level2 = pd.read_csv(
    os.path.join(DATADIR, 'labelled_level2.csv.gz'),
    dtype=object,
    compression='gzip'
)# Create World taxon in case any items not identified 
# through doc type in clean_content are still present
labelled_level2.loc[labelled_level2['level1taxon'] == 'World', 'level2taxon'] = 'world_level1'

# creating categorical variable for level2taxons from values
labelled_level2['level2taxon'] = labelled_level2['level2taxon'].astype('category')

# Add 1 because of zero-indexing to get 1-number of level2taxons as numerical targets
labelled_level2['level2taxon_code'] = labelled_level2.level2taxon.astype('category').cat.codes + 1

# create dictionary of taxon category code to string label for use in model evaluation
labels_index = dict(zip((labelled_level2['level2taxon_code']),
                        labelled_level2['level2taxon']))

In [10]:
labelled_level2.shape

(181209, 34)

In [11]:
labelled_level2.taxon_id.nunique()

2690

In [12]:
labelled_level2.content_id.nunique()

130208

### read in predictions & content_id array

#### new

In [13]:
new_pred = pd.read_csv(
    os.path.join(DATADIR, 'new_predictions.csv.gz'),
    dtype=object,
    compression='gzip'
)

In [14]:
new_arrays = np.load(os.path.join(DATADIR, 'new_arrays.npz'))

In [15]:
if new_arrays['content_id'].shape[0] == new_pred.shape[0]:
    new_pred['content_id'] = new_arrays['content_id']
else:
    print("warning: true_train and content_id may not originate from same data")
    
    

In [16]:
new_pred.shape

(68049, 217)

In [17]:
new_pred.columns

Index(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
       ...
       '208', '209', '210', '211', '212', '213', '214', '215', '216',
       'content_id'],
      dtype='object', length=217)

#### level1

In [18]:
level1_pred = pd.read_csv(
    os.path.join(DATADIR, 'level1_predictions.csv.gz'),
    dtype=object,
    compression='gzip'
)

In [19]:
level1_arrays = np.load(os.path.join(DATADIR, 'level1_arrays.npz'))

In [20]:
if level1_arrays['content_id'].shape[0] == level1_pred.shape[0]:
    level1_pred['content_id'] = level1_arrays['content_id']
else:
    print("warning: true_train and content_id may not originate from same data")
    
    

In [21]:
level1_pred.shape

(18317, 217)

#### Predictions on all unlabelled

In [22]:
unlabelled_pred = pd.concat([new_pred, level1_pred])

In [23]:
unlabelled_pred.shape

(86366, 217)

### Read in and prepare metadata

#### new

In [24]:
new_content = pd.read_csv(
    os.path.join(DATADIR, 'new_content.csv.gz'),
    dtype=object,
    compression='gzip'
)

In [25]:
new_content.shape

(68049, 28)

In [26]:
new_content.content_id.nunique()

68049

In [27]:
new_content.columns

Index(['base_path', 'body', 'combined_text', 'content_id',
       'content_purpose_document_supertype', 'content_purpose_subgroup',
       'content_purpose_supergroup', 'description', 'details', 'document_type',
       'document_type_gp', 'email_document_supertype', 'first_published_at',
       'government_document_supertype', 'locale',
       'navigation_document_supertype', 'primary_publishing_organisation',
       'public_updated_at', 'publishing_app', 'publishing_scheduled_at',
       'scheduled_publishing_delay_seconds',
       'search_user_need_document_supertype', 'taxon_id', 'taxons', 'title',
       'untagged_type', 'updated_at', 'user_journey_document_supertype'],
      dtype='object')

In [28]:
new_content.drop('taxons', axis=1, inplace=True)

In [29]:
new_content.columns

Index(['base_path', 'body', 'combined_text', 'content_id',
       'content_purpose_document_supertype', 'content_purpose_subgroup',
       'content_purpose_supergroup', 'description', 'details', 'document_type',
       'document_type_gp', 'email_document_supertype', 'first_published_at',
       'government_document_supertype', 'locale',
       'navigation_document_supertype', 'primary_publishing_organisation',
       'public_updated_at', 'publishing_app', 'publishing_scheduled_at',
       'scheduled_publishing_delay_seconds',
       'search_user_need_document_supertype', 'taxon_id', 'title',
       'untagged_type', 'updated_at', 'user_journey_document_supertype'],
      dtype='object')

In [30]:
new_content['level1taxon'] = ''

#### level1 

In [31]:
level1_content = pd.read_csv(
    os.path.join(DATADIR, 'labelled_level1.csv.gz'),
    dtype=object,
    compression='gzip'
)

In [32]:
level1_content.shape

(18317, 33)

In [33]:
level1_content.content_id.nunique()

17388

In [34]:
level1_content['untagged_type'] = 'level1only'

In [35]:
level1_content.columns

Index(['base_path', 'content_id', 'content_purpose_document_supertype',
       'content_purpose_subgroup', 'content_purpose_supergroup', 'description',
       'details', 'document_type', 'email_document_supertype',
       'first_published_at', 'government_document_supertype', 'locale',
       'navigation_document_supertype', 'public_updated_at', 'publishing_app',
       'publishing_scheduled_at', 'scheduled_publishing_delay_seconds',
       'search_user_need_document_supertype', 'title', 'updated_at',
       'user_journey_document_supertype', 'document_type_gp',
       'primary_publishing_organisation', 'body', 'combined_text', 'taxon_id',
       'taxon_base_path', 'taxon_name', 'level1taxon', 'level2taxon',
       'level3taxon', 'level4taxon', 'level5taxon', 'untagged_type'],
      dtype='object')

In [36]:
level1_content = level1_content[['base_path', 'body', 'combined_text', 'content_id',
       'content_purpose_document_supertype', 'content_purpose_subgroup',
       'content_purpose_supergroup', 'description', 'details', 'document_type',
       'document_type_gp', 'email_document_supertype', 'first_published_at',
       'government_document_supertype', 'locale',
       'navigation_document_supertype', 'primary_publishing_organisation',
       'public_updated_at', 'publishing_app', 'publishing_scheduled_at',
       'search_user_need_document_supertype', 'taxon_id',  'title',
       'untagged_type', 'updated_at', 'user_journey_document_supertype', 'level1taxon']]

In [37]:
content = pd.concat([new_content, level1_content])

In [38]:
content.shape

(86366, 28)

In [39]:
content.content_id.nunique()

85434

In [40]:
content.groupby('untagged_type').size().sort_values()

untagged_type
old_taxons       41
level1only    18317
untagged      68008
dtype: int64

In [41]:
content.groupby('publishing_app').size().sort_values()

publishing_app
frontend                    1
smartanswers                6
manuals-publisher          37
policy-publisher          204
hmrc-manuals-api          220
publisher                 744
specialist-publisher     3144
whitehall               82010
dtype: int64

### Combine predictions with metadata

In [42]:
unlabelled_prob_by_id = pd.melt(unlabelled_pred, 
                        id_vars=['content_id'], var_name='taxon2', value_name='prob')

In [43]:
unlabelled_prob_by_id .shape

(18655056, 3)

In [44]:
#convert from string to numeric
unlabelled_prob_by_id ['prob'] = pd.to_numeric(unlabelled_prob_by_id ['prob'])
unlabelled_prob_by_id ['taxon2'] = pd.to_numeric(unlabelled_prob_by_id ['taxon2'])

In [45]:
#get the label for the level2 taxon code
unlabelled_prob_by_id ['taxon2label'] = unlabelled_prob_by_id ['taxon2'].map(labels_index)

In [46]:
unlabelled_meta = pd.merge(
    left=unlabelled_prob_by_id ,
    right=content,
    on='content_id',
    how='left',
    indicator=True, 
    validate='m:m'
)

In [47]:
unlabelled_meta.columns

Index(['content_id', 'taxon2', 'prob', 'taxon2label', 'base_path', 'body',
       'combined_text', 'content_purpose_document_supertype',
       'content_purpose_subgroup', 'content_purpose_supergroup', 'description',
       'details', 'document_type', 'document_type_gp',
       'email_document_supertype', 'first_published_at',
       'government_document_supertype', 'level1taxon', 'locale',
       'navigation_document_supertype', 'primary_publishing_organisation',
       'public_updated_at', 'publishing_app', 'publishing_scheduled_at',
       'scheduled_publishing_delay_seconds',
       'search_user_need_document_supertype', 'taxon_id', 'title',
       'untagged_type', 'updated_at', 'user_journey_document_supertype',
       '_merge'],
      dtype='object')

In [48]:
unlabelled_meta.shape

(19069776, 32)

In [49]:
unlabelled_meta.content_id.nunique()

85434

##### categorical var for probaiblity

In [50]:
unlabelled_meta['prob_cat'] = '>=0.8'
unlabelled_meta.loc[unlabelled_meta['prob']<0.01, 'prob_cat'] = '<0.01'
unlabelled_meta.loc[(unlabelled_meta['prob']>=0.01) & (unlabelled_meta['prob']<0.1), 'prob_cat'] = '0.01-0.09'
unlabelled_meta.loc[(unlabelled_meta['prob']>=0.1) & (unlabelled_meta['prob']<0.2), 'prob_cat'] = '0.1-0.19'
unlabelled_meta.loc[(unlabelled_meta['prob']>=0.2) & (unlabelled_meta['prob']<0.3), 'prob_cat'] = '0.2-0.29'
unlabelled_meta.loc[(unlabelled_meta['prob']>=0.3) & (unlabelled_meta['prob']<0.4), 'prob_cat'] = '0.3-0.39'
unlabelled_meta.loc[(unlabelled_meta['prob']>=0.4) & (unlabelled_meta['prob']<0.5), 'prob_cat'] = '0.4-0.49'
unlabelled_meta.loc[(unlabelled_meta['prob']>=0.5) & (unlabelled_meta['prob']<0.6), 'prob_cat'] = '0.5-0.59'
unlabelled_meta.loc[(unlabelled_meta['prob']>=0.6) & (unlabelled_meta['prob']<0.7), 'prob_cat'] = '0.6-0.69'
unlabelled_meta.loc[(unlabelled_meta['prob']>=0.7) & (unlabelled_meta['prob']<0.8), 'prob_cat'] = '0.7-0.79'

In [51]:
unlabelled_meta.memory_usage()

Index                                  152558208
content_id                             152558208
taxon2                                 152558208
prob                                   152558208
taxon2label                            152558208
base_path                              152558208
body                                   152558208
combined_text                          152558208
content_purpose_document_supertype     152558208
content_purpose_subgroup               152558208
content_purpose_supergroup             152558208
description                            152558208
details                                152558208
document_type                          152558208
document_type_gp                       152558208
email_document_supertype               152558208
first_published_at                     152558208
government_document_supertype          152558208
level1taxon                            152558208
locale                                 152558208
navigation_document_

In [52]:
memorybytes = unlabelled_meta[['content_id', 'prob', 'taxon2label', 'base_path', 'description', 'document_type', 'document_type_gp',
       'first_published_at',
       'level1taxon', 'locale',
       'primary_publishing_organisation',
       'publishing_app',
       'taxon_id', 'title',
       'untagged_type', 'updated_at', 'prob_cat']].memory_usage().sum()

In [53]:
#1 byte is equal to 9.3132257461548E-10G
print("{}G".format(memorybytes*9.3132257461548E-10))

2.5574562549591104G


In [54]:
unlabelled_meta[['content_id', 'prob', 'taxon2label', 'base_path', 'description', 'document_type', 'document_type_gp',
       'first_published_at',
       'level1taxon', 'locale',
       'primary_publishing_organisation',
       'publishing_app',
       'taxon_id', 'title',
       'untagged_type', 'updated_at', 'prob_cat']].to_csv(os.path.join(DATADIR, 'unlabelled_predictions_meta.csv.gz'),compression='gzip',index=False)