In [1]:
import pandas as pd
import numpy as np
import os

from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score 
from sklearn.metrics import precision_recall_fscore_support, classification_report

import matplotlib.pyplot as plt
%matplotlib inline

import functools

import h5py

  return f(*args, **kwds)


### Environment variables

In [2]:
# Probability cut-off: scores at or above this value are turned into a positive (1) prediction.
P_THRESHOLD = 0.8

In [3]:
# Root data directory comes from the environment so no machine-specific path is hard-coded.
DATADIR = os.getenv('DATADIR')
if not DATADIR:
    # Without this check os.path.join(None, ...) fails with an opaque TypeError.
    raise RuntimeError("DATADIR environment variable is not set; point it at the project data directory")

# Results of the model run being evaluated live in a dated sub-directory.
RESULTS_DIR = os.path.join(DATADIR, "2018-03-05")
RESULTS_DIR

'/Users/felisialoukou/Documents/govuk-taxonomy-supervised-learning/data/2018-03-05'

### get some data about taxons/content

In [4]:
# Labelled content/taxon table; every column is read as object (string) dtype.
labelled_level2_path = os.path.join(DATADIR, 'labelled_level2.csv.gz')
labelled_level2 = pd.read_csv(labelled_level2_path, compression='gzip', dtype=object)

In [5]:
# Rows whose level-1 taxon is 'World' get a dedicated level-2 label, in case any
# such items were not already filtered out by doc type in clean_content.
is_world = labelled_level2['level1taxon'] == 'World'
labelled_level2.loc[is_world, 'level2taxon'] = 'world_level1'

In [6]:
# Turn the level-2 taxon names into a categorical so each name has an integer code.
level2_as_category = labelled_level2['level2taxon'].astype('category')
labelled_level2['level2taxon'] = level2_as_category

# Category codes are zero-based; shift by one so the numerical targets start at 1.
labelled_level2['level2taxon_code'] = labelled_level2['level2taxon'].cat.codes + 1

# Lookup from numeric taxon code to its string label, used when evaluating the model.
labels_index = dict(
    zip(labelled_level2['level2taxon_code'], labelled_level2['level2taxon'])
)

### Load in data

In [7]:
# True labels and predicted probabilities for the training set.
# Both files are gzip-compressed CSVs read as floats, mirroring the dev-set load.
true_train = pd.read_csv(os.path.join(RESULTS_DIR, 'true_train_1010_0803_.csv.gz'), dtype=float, compression='gzip')
prob_train = pd.read_csv(os.path.join(RESULTS_DIR, 'train_results_1010_0803_.csv.gz'), dtype=float, compression='gzip')

In [8]:
# True labels and predicted probabilities for the development set (gzip CSVs read as floats).
true_dev_path = os.path.join(RESULTS_DIR, 'true_dev_1010_0803_.csv.gz')
prob_dev_path = os.path.join(RESULTS_DIR, 'dev_results_1010_0803_.csv.gz')
true_dev = pd.read_csv(true_dev_path, compression='gzip', dtype=float)
prob_dev = pd.read_csv(prob_dev_path, compression='gzip', dtype=float)

In [9]:
# The archive's content_id array has dtype=object, which NumPy only loads when
# allow_pickle=True (the default became False in NumPy 1.16.3).
# NOTE(review): unpickling can execute arbitrary code - only load archives produced by this project.
train = np.load(os.path.join(RESULTS_DIR, 'train_arrays.npz'), allow_pickle=True)

In [10]:
# Presumably laid out like the training archive, whose content_id array is dtype=object;
# NumPy only loads object arrays when allow_pickle=True (default is False since NumPy 1.16.3).
# NOTE(review): unpickling can execute arbitrary code - only load archives produced by this project.
dev = np.load(os.path.join(RESULTS_DIR, 'dev_arrays.npz'), allow_pickle=True)

In [11]:
# List the names of the arrays stored in the training archive.
train.files

['x', 'meta', 'title', 'desc', 'y', 'content_id']

In [12]:
# Peek at the content ids stored in the training archive.
train['content_id']

array(['a79ca5d5-38d3-434e-827b-ec1237906a2f',
       '4a72fcdf-e0b7-42f0-a606-0636f512453a',
       '5c81df9b-7631-11e4-a3cb-005056011aef', ...,
       '5f4f5f13-7631-11e4-a3cb-005056011aef',
       'b3d858bf-9813-4ede-8f09-875d6b6783a8',
       '2bd60f7f-6395-4eb4-aa7d-3c4ef0dd4ef0'], dtype=object)

In [13]:
# Number of content ids in the training archive.
train['content_id'].shape

(175767,)

In [14]:
# Number of rows in the true training labels.
true_train.shape[0]

175767

### Merge results array with labelled_level2 values
content_id, metadata etc

In [15]:
# Start a frame of content ids, but only when the id array has exactly as many
# entries as the true-label frame has rows (they are matched purely by position).
df = pd.DataFrame()
content_ids = train['content_id']
if content_ids.shape[0] != true_train.shape[0]:
    print("warning: true_train and content_id may not originate from same data")
else:
    df['content_id'] = content_ids

In [16]:
# Preview the content-id frame.
df.head()

Unnamed: 0,content_id
0,a79ca5d5-38d3-434e-827b-ec1237906a2f
1,4a72fcdf-e0b7-42f0-a606-0636f512453a
2,5c81df9b-7631-11e4-a3cb-005056011aef
3,735d153c-daf9-49e8-bb52-516d8d5c95f2
4,98ca0ce7-4660-4418-ac37-d1e2d293fd7a


In [17]:
# Put the per-taxon training probabilities next to the content ids, aligned on the row index.
frames_side_by_side = [df, prob_train]
df = pd.concat(frames_side_by_side, join='inner', axis=1)

In [18]:
# Preview content ids together with their per-taxon probabilities.
df.head()

Unnamed: 0,content_id,1,2,3,4,5,6,7,8,9,...,209,210,211,212,213,214,215,216,217,218
0,a79ca5d5-38d3-434e-827b-ec1237906a2f,9.419133e-14,4.267573e-24,3.404629e-11,9.072372e-09,3.692781e-14,3.947936e-09,5.439439e-15,2.125058e-16,9.309207999999999e-20,...,4.375333e-15,4.126515e-18,7.843087999999999e-20,1.373088e-23,4.3995970000000005e-17,2.8172190000000004e-17,8.471909e-17,2.667753e-13,1.486372e-14,1.0774110000000002e-17
1,4a72fcdf-e0b7-42f0-a606-0636f512453a,5.565166e-21,0.9999936,2.7047380000000002e-33,1.38907e-26,5.1683540000000006e-17,4.638111e-34,3.9995440000000005e-31,1.613292e-19,1.622356e-15,...,9.968751000000001e-27,8.048254e-26,6.07629e-29,6.105007e-14,1.6367940000000003e-17,4.908256e-09,6.357211e-16,1.231617e-15,1.669628e-13,1.076923e-16
2,5c81df9b-7631-11e4-a3cb-005056011aef,1.491981e-15,1.014842e-14,6.670261e-23,9.305821e-12,2.732317e-15,1.720691e-27,3.32643e-24,3.868233e-24,2.157019e-21,...,1.360703e-25,3.046593e-25,1.110729e-12,9.733824e-25,3.677405e-16,6.485454999999999e-19,1.956178e-14,4.184081e-20,3.323064e-12,5.1558409999999997e-20
3,735d153c-daf9-49e8-bb52-516d8d5c95f2,1.798856e-10,2.068046e-15,1.026087e-12,1.400855e-14,8.218687e-16,1.669214e-13,6.421606e-16,1.572653e-09,1.730174e-10,...,0.9706699,3.292226e-09,4.545314e-12,4.475947e-07,1.163922e-13,3.900537e-12,2.358326e-15,3.789344e-12,2.341784e-17,9.243922e-08
4,98ca0ce7-4660-4418-ac37-d1e2d293fd7a,1.2880259999999998e-19,8.072054e-18,3.1774730000000003e-22,3.195644e-12,3.944814e-16,3.2223719999999996e-20,9.837716e-24,1.5219029999999998e-19,3.37429e-18,...,3.0160720000000003e-28,2.1371950000000003e-31,1.39873e-20,8.935608e-23,1.068866e-20,5.27386e-21,8.963249e-18,1.35855e-26,1.100744e-10,2.532474e-12


In [19]:
# Left-join the taxon metadata onto the probabilities by content id.
# indicator=True adds a `_merge` column recording where each row came from.
positive_perfect_meta = df.merge(
    labelled_level2,
    how='left',
    on='content_id',
    validate='m:m',
    indicator=True,
)

In [20]:
# Preview the merged probabilities and taxon metadata.
positive_perfect_meta.head()

Unnamed: 0,content_id,1,2,3,4,5,6,7,8,9,...,taxon_id,taxon_base_path,taxon_name,level1taxon,level2taxon,level3taxon,level4taxon,level5taxon,level2taxon_code,_merge
0,a79ca5d5-38d3-434e-827b-ec1237906a2f,9.419133e-14,4.267573e-24,3.404629e-11,9.072372e-09,3.692781e-14,3.947936e-09,5.439439e-15,2.125058e-16,9.309207999999999e-20,...,67f50352-bc30-482f-a2d0-a05714e3cea8,/government/cyber-security,Cyber security,Government,Cyber security,,,,47.0,both
1,a79ca5d5-38d3-434e-827b-ec1237906a2f,9.419133e-14,4.267573e-24,3.404629e-11,9.072372e-09,3.692781e-14,3.947936e-09,5.439439e-15,2.125058e-16,9.309207999999999e-20,...,8a98b827-82ad-49b4-819e-82c208c551c4,/government/national-security,National security,Government,National security,,,,130.0,both
2,4a72fcdf-e0b7-42f0-a606-0636f512453a,5.565166e-21,0.9999936,2.7047380000000002e-33,1.38907e-26,5.1683540000000006e-17,4.638111e-34,3.9995440000000005e-31,1.613292e-19,1.622356e-15,...,0a018d28-e3a8-4db7-a593-c8a0bfb57319,/life-circumstances/child-adoption,"Having a child, parenting and adoption",Life circumstances,"Having a child, parenting and adoption",,,,85.0,both
3,4a72fcdf-e0b7-42f0-a606-0636f512453a,5.565166e-21,0.9999936,2.7047380000000002e-33,1.38907e-26,5.1683540000000006e-17,4.638111e-34,3.9995440000000005e-31,1.613292e-19,1.622356e-15,...,f40a63ce-ac0c-4102-84d1-f1835cb7daac,/childcare-parenting/fostering,Fostering,"Parenting, childcare and children's services","Adoption, fostering and surrogacy",Fostering,,,2.0,both
4,5c81df9b-7631-11e4-a3cb-005056011aef,1.491981e-15,1.014842e-14,6.670261e-23,9.305821e-12,2.732317e-15,1.720691e-27,3.32643e-24,3.868233e-24,2.157019e-21,...,668cd623-c7a8-4159-9575-90caac36d4b4,/society-and-culture/community-and-society,Community and society,Society and culture,Community and society,,,,37.0,both


### Evaluate model

#### Training metrics

In [None]:
y_pred = prob_train.copy(deep=True)

In [None]:
y_pred[y_pred>=P_THRESHOLD] = 1
y_pred[y_pred<P_THRESHOLD] = 0

In [None]:
# Precision / recall / F1 / support on the training set under each averaging scheme.
for _heading, _average in (('micro', 'micro'), ('macro', 'macro'), ('weightedmacro', 'weighted')):
    _scores = precision_recall_fscore_support(true_train, y_pred, average=_average, sample_weight=None)
    print('{}: {}'.format(_heading, _scores))

In [None]:
eval_metrics = precision_recall_fscore_support(true_train, y_pred, average=None, sample_weight=None)
eval_metrics_df = pd.DataFrame(list(eval_metrics))

In [None]:
train_metrics = eval_metrics_df.transpose()
train_metrics.columns = ['precision', 'recall', 'f1', 'support']
train_metrics['level2code'] = train_metrics.index +1
train_metrics['level2label'] = train_metrics['level2code'].map(labels_index)

#### Development set metrics

In [None]:
pred_dev = prob_dev.copy()

In [None]:
pred_dev[pred_dev>=P_THRESHOLD] = 1
pred_dev[pred_dev<P_THRESHOLD] = 0

In [None]:
# Precision / recall / F1 / support on the dev set under each averaging scheme.
for _heading, _average in (('micro', 'micro'), ('macro', 'macro'), ('weightedmacro', 'weighted')):
    _scores = precision_recall_fscore_support(true_dev, pred_dev, average=_average, sample_weight=None)
    print('{}: {}'.format(_heading, _scores))

In [None]:
eval_metrics = precision_recall_fscore_support(true_dev, pred_dev, average=None, sample_weight=None)
eval_metrics_df = pd.DataFrame(list(eval_metrics))

In [None]:
dev_metrics = eval_metrics_df.transpose()
dev_metrics.columns = ['precision', 'recall', 'f1', 'support']
dev_metrics['taxon2code'] = dev_metrics.index +1
dev_metrics['taxon2label'] = dev_metrics['taxon2code'].map(labels_index)

In [None]:
dev_metrics[dev_metrics['f1']==0].shape[0]

In [None]:
dev_metrics[dev_metrics['f1']==1].shape[0]

In [None]:
dev_metrics[dev_metrics['f1']==1]

In [None]:
high = dev_metrics[dev_metrics['f1']>0.9]

In [None]:
morethan80 = dev_metrics[dev_metrics['f1']>0.8]

In [None]:
dev_metrics[dev_metrics['f1']<0.6].shape[0]

In [None]:
# Intentionally empty: this cell used `keep_perfect_performing_taxons` before the
# notebook creates it, so it raised NameError on Restart & Run All.
# The content ids are attached in the "look at content predicted in taxons with f1=1"
# section, directly after that frame is built.

#### Predictions

In [23]:
# Predictions joined with content metadata; every column is read as object (string) dtype.
predictions_path = os.path.join(RESULTS_DIR, 'predictions_meta.csv.gz')
predictions = pd.read_csv(predictions_path, compression='gzip', dtype=object)

In [24]:
# Preview the loaded predictions.
predictions.head()

Unnamed: 0,content_id,taxon2,prob,taxon2label,base_path,body,combined_text,content_purpose_document_supertype,content_purpose_subgroup,content_purpose_supergroup,...,public_updated_at,publishing_app,search_user_need_document_supertype,taxon_id,taxons,title,untagged_type,updated_at,user_journey_document_supertype,_merge
0,12bc5028-593d-447f-96af-816ea2553002,1,0.5665937066078186,Administrative justice reform,/government/news/christmas-and-new-year-closur...,crown courts county courts and the royal court...,christmas and new year closures for courts and...,,,,...,2017-11-16T08:50:54.000+00:00,whitehall,government,,,christmas and new year closures for courts and...,untagged,2017-11-16 08:55:37.786,thing,both
1,31abb9f6-2946-4ed9-9b3b-314fdc4a2ae7,1,0.994709849357605,Administrative justice reform,/government/news/first-courts-tribunals-servic...,part of the £1 billion modernisation reforms t...,first courts & tribunals service centres launc...,news,,,...,2017-11-02T12:31:00.000+00:00,whitehall,government,,,first courts & tribunals service centres launched,untagged,2018-01-12 16:14:53.833,thing,both
2,c3631bb5-3cb2-4be4-8d55-1b5560933c0a,1,0.5405107736587524,Administrative justice reform,/government/news/new-chief-executive-for-her-m...,susan will take up the position on 21 november...,new chief executive for her majesty’s courts a...,news,,,...,2016-10-03T15:37:00.000+00:00,whitehall,government,,,new chief executive for her majesty’s courts a...,untagged,2018-01-12 16:28:51.475,thing,both
3,2d6cf199-8bac-4dca-9446-b97f7b5d05e9,1,0.984718143939972,Administrative justice reform,/government/news/civilcrime-news-more-dates-fo...,providers can attend more hmcts roadshow refor...,civil/crime news: more dates for hmcts roadsho...,news,,,...,2018-01-10T15:43:00.000+00:00,whitehall,government,,,civil/crime news: more dates for hmcts roadshows,untagged,2018-01-16 13:33:44.902,thing,both
4,041ed0a2-c9db-4b92-8eb1-b323f95f7e82,1,0.9984933137893676,Administrative justice reform,/government/news/civilcrime-news-digital-updat...,you can now keep up to date with the digital t...,civil/crime news: digital updates available on...,news,,,...,2016-07-18T13:23:31.000+00:00,whitehall,government,,,civil/crime news: digital updates available on...,untagged,2018-01-03 15:52:05.522,thing,both


In [25]:
# Show which columns the predictions file provides.
predictions.columns

Index(['content_id', 'taxon2', 'prob', 'taxon2label', 'base_path', 'body',
       'combined_text', 'content_purpose_document_supertype',
       'content_purpose_subgroup', 'content_purpose_supergroup', 'description',
       'details', 'document_type', 'document_type_gp',
       'email_document_supertype', 'first_published_at',
       'government_document_supertype', 'locale',
       'navigation_document_supertype', 'primary_publishing_organisation',
       'public_updated_at', 'publishing_app',
       'search_user_need_document_supertype', 'taxon_id', 'taxons', 'title',
       'untagged_type', 'updated_at', 'user_journey_document_supertype',
       '_merge'],
      dtype='object')

In [26]:
# Narrow the predictions down to the columns listed here, in this order.
columns_to_keep = [
    'content_id', 'taxon2', 'prob', 'taxon2label',
    'base_path', 'body', 'combined_text', 'description',
    'document_type', 'first_published_at',
    'primary_publishing_organisation', 'publishing_app',
    'title', 'untagged_type',
]
predictions = predictions[columns_to_keep]

In [33]:
# First five rows of the positional slice 200-249.
predictions.iloc[200:250].head()

Unnamed: 0,content_id,taxon2,prob,taxon2label,base_path,body,combined_text,content_purpose_document_supertype,content_purpose_subgroup,content_purpose_supergroup,...,public_updated_at,publishing_app,search_user_need_document_supertype,taxon_id,taxons,title,untagged_type,updated_at,user_journey_document_supertype,_merge
200,5c8a731b-7631-11e4-a3cb-005056011aef,3,0.5635973811149597,Afghanistan,/government/news/foreign-secretary-condemns-at...,speaking today the foreign secretary said: “th...,foreign secretary condemns attack on hotel in ...,news,,,...,2011-06-29T00:00:00.000+00:00,whitehall,government,,,foreign secretary condemns attack on hotel in ...,untagged,2018-02-27 09:29:37.488,thing,both
201,5c925da6-7631-11e4-a3cb-005056011aef,3,0.9985857009887696,Afghanistan,/government/news/mercian-soldiers-receive-gall...,2 mercian served in helmand afghanistan for si...,mercian soldiers receive gallantry awards sold...,news,,,...,2010-07-12T00:00:00.000+00:00,whitehall,government,,,mercian soldiers receive gallantry awards,untagged,2018-01-10 12:09:00.768,thing,both
202,5e16f9d2-7631-11e4-a3cb-005056011aef,3,0.98793762922287,Afghanistan,/government/news/pm-delivers-statement-on-afgh...,prime minister david cameron has spoken to the...,pm delivers statement on afghanistan the prime...,,,,...,2010-06-14T00:00:00.000+00:00,whitehall,,,,pm delivers statement on afghanistan,untagged,2017-10-10 11:59:32.223,thing,both
203,5e0c8fd3-7631-11e4-a3cb-005056011aef,3,0.998868465423584,Afghanistan,/government/news/afghanistan-security-handover...,speaking in the house of commons mr cameron sa...,afghanistan security handover entering new pha...,,,,...,2011-07-06T00:00:00.000+00:00,whitehall,,,,afghanistan security handover entering new phase,untagged,2017-10-10 11:58:56.047,thing,both
204,5c90984d-7631-11e4-a3cb-005056011aef,3,0.99999737739563,Afghanistan,/government/news/gibraltar-regiment-soldier-pr...,corporal chris milliken from belfast is one of...,gibraltar regiment soldier provides medical tr...,news,,,...,2012-06-11T00:00:00.000+00:00,whitehall,government,,,gibraltar regiment soldier provides medical tr...,untagged,2018-01-10 12:07:05.791,thing,both


#### F1 scores

In [None]:
#Calculate globally by counting the total true positives, false negatives and false positives.
precision_recall_fscore_support(y_train, y_pred, average='micro', sample_weight=None) 

In [None]:
#Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account
precision_recall_fscore_support(y_dev, y_pred_dev, average='macro', sample_weight=None)

In [None]:
#Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account
precision_recall_fscore_support(y_dev, y_pred_dev, average='weighted', sample_weight=None)

## Load more data

In [None]:
if train['content_id'].shape[0] == true_train.shape[0]:
    true_train['content_id'] = train['content_id']
else:
    print("warning: true_train and content_id may not originate from same data")

In [None]:
if dev['content_id'].shape[0] == true_dev.shape[0]:
    true_dev['content_id'] = dev['content_id']
else:
    print("warning: true_dev and content_id may not originate from same data")

## Taxons not represented in training/dev

In [None]:
train_by_id = pd.melt(true_train, id_vars=['content_id'], var_name='taxon2', value_name='tagged')
positive_taxons_true_train = train_by_id[train_by_id['tagged']==1].copy()

In [None]:
dev_by_id = pd.melt(true_dev, id_vars=['content_id'], var_name='taxon2', value_name='tagged')
positive_taxons_true_dev = dev_by_id[dev_by_id['tagged']==1].copy()

In [None]:
positive_taxons_true_train['taxon2label'] = pd.to_numeric(positive_taxons_true_train['taxon2']).map(labels_index)
positive_taxons_true_dev['taxon2label'] = pd.to_numeric(positive_taxons_true_dev['taxon2']).map(labels_index)

In [None]:
positive_taxons_true_train.taxon2.nunique()

In [None]:
positive_taxons_true_dev.taxon2.nunique()

In [None]:
positive_taxons_true_train.taxon2.nunique() - positive_taxons_true_dev.taxon2.nunique()

In [None]:
not_in_dev = np.setdiff1d(positive_taxons_true_train.taxon2label, positive_taxons_true_dev.taxon2label)

np.setdiff1d(positive_taxons_true_train.taxon2label, positive_taxons_true_dev.taxon2label)

In [None]:
positive_taxons_true_train[positive_taxons_true_train['taxon2label'].isin(not_in_dev)].groupby('taxon2label').size().sort_values(ascending=False)

In [None]:
true_train = true_train.drop_duplicates(subset='content_id')

In [None]:
train_true = pd.merge(
    left=true_train,
    right=labelled_level2,
    on='content_id',
    how='left',
    indicator=True, 
    validate='1:m'
)

In [None]:
# Shape of the de-duplicated true training labels
true_train.shape

In [None]:
# Row counts per merge outcome recorded in the `_merge` indicator column, smallest first
train_true.groupby('_merge').size().sort_values(ascending=True)

In [None]:
# Shape after joining the taxon metadata
train_true.shape

In [None]:
# Distinct level-2 taxons present after the merge
train_true.level2taxon.nunique()

In [None]:
# Distinct level-2 taxons in the full labelled data, for comparison
labelled_level2.level2taxon.nunique()

In [None]:
# Shape of the thresholded dev predictions
pred_dev.shape

In [None]:
print(actual_train.shape)
actual_train.head()

In [None]:
print(pred_train.shape)
pred_train.head()

In [None]:
true_train.shape

In [None]:
pred_train.shape

## support and performance

The support is the number of occurrences of each class in y_true

In [None]:
# F1 against support for every taxon in the dev set.
dev_metrics.plot(kind='scatter', x='support', y='f1', marker='o', alpha=.5, figsize=(20, 20))


In [None]:
# F1 against support for the taxons with F1 above 0.8, each point annotated with its label.
morethan80.plot.scatter(x='support', y='f1', figsize=(20, 10))

annotation_rows = morethan80[['taxon2label', 'support', 'f1']].itertuples(index=False, name=None)
for taxon_label, support_value, f1_value in annotation_rows:
    plt.annotate(
        taxon_label,
        xy=(support_value, f1_value),
        xytext=(-1, 1),
        textcoords='offset points',
        ha='right',
        va='bottom',
    )
plt.show()

### look at content predicted in taxons with f1=1

In [None]:
prob_dev.head()

In [None]:
dev_metrics[dev_metrics['f1']==1].head()

In [None]:
perfect_performance = dev_metrics[dev_metrics['f1']==1].taxon2code.unique()
perfect_performance = perfect_performance.astype(str)

In [None]:
keep_perfect_performing_taxons = prob_dev[perfect_performance].copy()

In [None]:
keep_perfect_performing_taxons.shape

In [None]:
if dev['content_id'].shape[0] == keep_perfect_performing_taxons.shape[0]:
    keep_perfect_performing_taxons['content_id'] = dev['content_id']
else:
    print("warning: true_train and content_id may not originate from same data")

In [None]:
keep_perfect_performing_taxons.head()

In [None]:
# armed forces

keep_perfect_performing_taxons[keep_perfect_performing_taxons['5'] > 0.5].shape

In [None]:
perfect_by_id = pd.melt(keep_perfect_performing_taxons, 
                        id_vars=['content_id'], var_name='taxon2', value_name='prob')

In [None]:
perfect_by_id.shape

In [None]:
positive_perfect = perfect_by_id[perfect_by_id['prob']>0.5].copy()

In [None]:
positive_perfect.shape

In [None]:
positive_perfect_meta = pd.merge(
    left=positive_perfect,
    right=labelled_level2,
    on='content_id',
    how='left',
    indicator=True, 
    validate='m:m'
)

In [None]:
positive_perfect_meta['taxon2label'] = pd.to_numeric(positive_perfect_meta['taxon2']).map(labels_index)

In [None]:
positive_perfect_meta

In [None]:
print("{}% of content has been tagged to taxons which have perfect performance".format(
    positive_perfect.content_id.nunique()/keep_perfect_performing_taxons.shape[0]*100)
)

quick browse but this looks good

### 0.8 < taxon F1 < 1 = good performance!

In [None]:
good_performance = dev_metrics[(dev_metrics['f1']<1) & (dev_metrics['f1']>0.8)].taxon2code.unique().astype(str)

In [None]:
keep_good_performing_taxons = prob_dev[good_performance].copy()

In [None]:
keep_good_performing_taxons.shape

In [None]:
# Attach the dev content ids by position; only safe when both have the same number of rows.
if dev['content_id'].shape[0] == keep_good_performing_taxons.shape[0]:
    keep_good_performing_taxons['content_id'] = dev['content_id']
else:
    # The check compares the dev ids with the good-taxon frame, so the message names those.
    print("warning: keep_good_performing_taxons and dev content_id may not originate from same data")

In [None]:
good_by_id = pd.melt(keep_good_performing_taxons, 
                        id_vars=['content_id'], var_name='taxon2', value_name='prob')

In [None]:
good_by_id.shape

In [None]:
positive_good = good_by_id[good_by_id['prob']>0.5].copy()

In [None]:
positive_good.shape

In [None]:
positive_good.content_id.nunique()

In [None]:
print("{}% of content has been tagged to taxons which have good performance".format(
    positive_good.content_id.nunique()/keep_good_performing_taxons.shape[0]*100)
)

In [None]:
positive_good_meta = pd.merge(
    left=positive_good,
    right=labelled_level2,
    on='content_id',
    how='left',
    indicator=True, 
    validate='m:m'
)

In [None]:
positive_good_meta['taxon2label'] = pd.to_numeric(positive_good_meta['taxon2']).map(labels_index)

In [None]:
positive_good_meta