In [None]:
import pandas as pd
import numpy as np
import os

from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score 
from sklearn.metrics import precision_recall_fscore_support, classification_report

import matplotlib.pyplot as plt
%matplotlib inline

import functools

import h5py

In [None]:
# Notebook-wide cut-off constant; presumably the probability threshold for
# accepting a predicted taxon (the 50% analyses below use the same value).
P_THRESHOLD = 0.5

### Which data

In [None]:
# Directory holding the input/output files, taken from the DATADIR
# environment variable (None when the variable is not set).
DATADIR = os.environ.get('DATADIR')

In [None]:
# Echo the data directory the rest of the notebook reads from.
print(str.format("model v2.0.0 was run on data dated {}", DATADIR))

### Get full content to enable total counts

In [None]:
full_content = pd.read_csv(
    os.path.join(DATADIR, 'full_content.csv.gz'),
    dtype=object,
    compression='gzip'
)

In [None]:
full_content.shape

In [None]:
full_content.columns

In [None]:
full_content.content_id.nunique()

In [None]:
# Model predictions (with metadata) for the unlabelled content.
# The file is read with dtype=object, so every column arrives as text.
unlabelled_meta = pd.read_csv(
    os.path.join(DATADIR, 'unlabelled_predictions_meta.csv.gz'),
    dtype=object,
    compression='gzip'
)
# `prob` is compared against numeric thresholds, sorted and histogrammed
# further down; convert it once here so those cells operate on floats rather
# than strings (comparing str with float raises TypeError).
unlabelled_meta['prob'] = pd.to_numeric(unlabelled_meta['prob'])

In [None]:
labelled_level2 = pd.read_csv(
    os.path.join(DATADIR, 'labelled_level2.csv.gz'),
    dtype=object,
    compression='gzip'
)

labelled_level2.shape

In [None]:
labelled_level2.taxon_id.nunique()

In [None]:
labelled_level2.content_id.nunique()

## Analyses
### count content items

In [None]:
unlabelled_meta.groupby(['prob_cat']).content_id.nunique()

In [None]:
unlabelled_meta.groupby('untagged_type')['content_id'].nunique()

### predict taxon at 50%

In [None]:
# Keep the predicted (content item, taxon) rows whose probability clears the
# notebook-wide P_THRESHOLD instead of repeating the literal 0.5 here.
# The source file is loaded with dtype=object, so `prob` may still be text;
# pd.to_numeric makes the comparison numeric (and is a no-op on floats).
predictions = unlabelled_meta[pd.to_numeric(unlabelled_meta['prob']) > P_THRESHOLD]

In [None]:
print("{} out of {} ({}%) unlabelled content items were tagged at 50%".format(
    predictions.content_id.nunique(),
    unlabelled_meta.content_id.nunique(),
    predictions.content_id.nunique()/unlabelled_meta.content_id.nunique()*100))

In [None]:
print("There would be a residual {} items that are hard-to-tag".format(
    unlabelled_meta.content_id.nunique() - predictions.content_id.nunique()))

In [None]:
predictions.groupby('untagged_type')['content_id'].nunique()

In [None]:
print("the total number of items tagged to level 2 or lower would be {}".format(
labelled_level2.content_id.nunique() + predictions.content_id.nunique() ))

In [None]:
print("{} items would be tagged to level 2 or lower out of {} ({}%) in-scope items".format(
labelled_level2.content_id.nunique() + predictions.content_id.nunique(), 
full_content.content_id.nunique(),
(labelled_level2.content_id.nunique() + predictions.content_id.nunique())/full_content.content_id.nunique() * 100
))

In [None]:
print("the ML would have tagged {}  out of {} ({}%) in-scope items".format(
predictions.content_id.nunique(), 
full_content.content_id.nunique(),
predictions.content_id.nunique()/full_content.content_id.nunique() * 100
))

In [None]:
full_content.content_id.nunique()

### constrain predictions to well-performing taxons

those with an F1 score of at least 0.8

In [None]:
dev_metrics = pd.read_csv(
    os.path.join(DATADIR, "dev_metrics.csv.gz"),
    dtype=object,
    compression='gzip'
)

In [None]:
dev_metrics['f1'] = pd.to_numeric(dev_metrics['f1'])

In [None]:
good_taxons = dev_metrics[dev_metrics['f1']>=0.8]['taxon2label']

In [None]:
lowrisk_predictions = predictions.loc[predictions['taxon2label'].isin(good_taxons)]

In [None]:
lowrisk_predictions.shape

In [None]:
print("{} out of {} ({}%) items tagged at 50% to low risk taxons".format(
      lowrisk_predictions.content_id.nunique(),
      unlabelled_meta.content_id.nunique(),
      lowrisk_predictions.content_id.nunique()/unlabelled_meta.content_id.nunique()*100))

In [None]:
print("{} items would be tagged to level 2 or lower out of {} ({}%) in-scope items".format(
labelled_level2.content_id.nunique() + lowrisk_predictions.content_id.nunique(), 
full_content.content_id.nunique(),
(labelled_level2.content_id.nunique() + lowrisk_predictions.content_id.nunique())/full_content.content_id.nunique() * 100
))

In [None]:
print("the ML would have tagged {}  out of {} ({}%) in-scope items".format(
lowrisk_predictions.content_id.nunique(), 
full_content.content_id.nunique(),
lowrisk_predictions.content_id.nunique()/full_content.content_id.nunique() * 100
))

In [None]:
print("There would be a residual {} items that are hard-to-tag".format(
    unlabelled_meta.content_id.nunique() - lowrisk_predictions.content_id.nunique()))

### Exclude UK economy only

exclude UK economy but keep others, based on content strategists' appraisal of predictions

In [None]:
no_uk_economy = predictions.loc[predictions['taxon2label'] != 'UK economy']

In [None]:
print("{} out of {} ({}%) items tagged at 50% excluding UK economy".format(
      no_uk_economy.content_id.nunique(),
      unlabelled_meta.content_id.nunique(),
      no_uk_economy.content_id.nunique()/unlabelled_meta.content_id.nunique()*100))

In [None]:
print("{} out of {} ({}%) items tagged at 50% excluding UK economy".format(
      no_uk_economy.content_id.nunique(),
      unlabelled_meta.content_id.nunique(),
      no_uk_economy.content_id.nunique()/unlabelled_meta.content_id.nunique()*100))

In [None]:
print("{} items would be tagged to level 2 or lower out of {} ({}%) in-scope items".format(
labelled_level2.content_id.nunique() + no_uk_economy.content_id.nunique(), 
full_content.content_id.nunique(),
(labelled_level2.content_id.nunique() + no_uk_economy.content_id.nunique())/full_content.content_id.nunique() * 100
))

In [None]:
print("There would be a residual {} items that are hard-to-tag".format(
    unlabelled_meta.content_id.nunique() - no_uk_economy.content_id.nunique()))

In [None]:
print("the ML would have tagged {}  out of {} ({}%) in-scope items".format(
no_uk_economy.content_id.nunique(), 
full_content.content_id.nunique(),
no_uk_economy.content_id.nunique()/full_content.content_id.nunique() * 100
))

### hard to tag (UK economy)

In [None]:
no_uk_economy = no_uk_economy.drop(labels='_merge', axis=1).copy()

In [None]:
no_uk_economy.columns

In [None]:
# Lookup of unlabelled items used as the right-hand side of the merge on
# `content_id` below. The same content_id can occur on several rows of
# `unlabelled_meta` (the notebook counts items with nunique() throughout), so
# keep one row per item; otherwise the merge is many-to-many and the exported
# hard-to-tag list contains repeated rows.
unlabelled_id = (
    unlabelled_meta[['content_id', 'base_path']]
    .drop_duplicates(subset='content_id')
    .copy()
)

In [None]:
hard_to_tag = pd.merge(
    left=no_uk_economy,
    right=unlabelled_id,
    on='content_id',
    how='outer',
    indicator=True
)

In [None]:
hard_to_tag.groupby('_merge').size()

In [None]:
hard_to_tag[hard_to_tag['document_type']=='guidance'].content_id.nunique()

In [None]:
no_uk_economy[no_uk_economy['document_type']=='guidance'].content_id.nunique()

In [None]:
hard_to_tag = hard_to_tag[hard_to_tag['_merge']=='right_only']

In [None]:
hard_to_tag.to_csv(os.path.join(DATADIR, 'hard_to_tag_ukeconomy.csv.gz'),compression='gzip',index=False)

In [None]:
# Distinct content items with at least one prediction above the threshold.
predictions.content_id.nunique()

In [None]:
# NOTE(review): `no_predictions` is not assigned anywhere in the visible
# notebook, so this cell raises NameError on a fresh kernel. It presumably
# held the items without a prediction above the threshold - confirm and
# restore the cell that builds it.
no_predictions.content_id.nunique() + predictions.content_id.nunique()

In [None]:
# NOTE(review): depends on the undefined `no_predictions` (see note above).
no_predictions.content_id.nunique()

In [None]:
# NOTE(review): `newprob_meta` is not assigned anywhere in the visible
# notebook either; presumably an earlier name for `unlabelled_meta` - confirm.
newprob_meta.content_id.nunique()

In [None]:
# NOTE(review): `cols` is only assigned further down (cols = np.arange(1, 29)),
# and this exact statement is repeated after that assignment, so this copy
# fails when the notebook is run top to bottom.
hardtotag_guidance = no_predictions[no_predictions['document_type_y']=='guidance'].drop(no_predictions.columns[cols], axis=1 ).sort_values('prob_y')

### taxons with no content in dev set so F1=0

In [None]:
unknown_performance = dev_metrics[dev_metrics['f1']==0]['taxon2label']

In [None]:
highrisk_predictions = predictions.loc[predictions['taxon2label'].isin(unknown_performance)]

In [None]:
highrisk_predictions.shape

In [None]:
highrisk_predictions.content_id.nunique()

In [None]:
highrisk_predictions[['taxon2label', 'title', 'description', 'body']]

In [None]:
highrisk_predictions[['taxon2label', 'title', 'description', 'body']].to_csv(os.path.join(DATADIR, 'zerof1predictions.csv.gz'),compression='gzip',index=False)

### taxons with 0.8>F1>0

In [None]:
dev_metrics[(dev_metrics['f1']<0.8) & (dev_metrics['f1']>0)]['taxon2label'].shape

In [None]:
list(dev_metrics[(dev_metrics['f1']<0.8) & (dev_metrics['f1']>0)]['taxon2label'])

In [None]:
list(dev_metrics[dev_metrics['f1']==0]['taxon2label'])

In [None]:
atleast1pct = unlabelled_meta[unlabelled_meta['prob']>=0.01]
atleast1pct.shape

In [None]:
atleast1pct['prob'].hist(range=(0, 1), figsize=(30, 10), bins=500)

In [None]:
atleast1pct['prob'].hist(by=atleast1pct['untagged_type'], range=(0, 1), figsize=(20, 10), bins=500)

In [None]:
#TODO: work out how to standardise ylim to 0, 3000 
atleast1pct['prob'].hist(by=atleast1pct['taxon2label'], range=(0, 1), figsize=(50, 50), bins=50)

In [None]:
untagged_predictions = predictions[predictions['untagged_type']=='untagged']
oldtaxons_predictions = predictions[predictions['untagged_type']!='untagged']

In [None]:
untagged_predictions.groupby(['content_id']).size().sort_values(ascending=True).describe()

In [None]:
unlabelled_meta['prob'].hist(by=unlabelled_meta['taxon2label'], figsize=(50, 50), bins=50)

In [None]:
labelled_level2.groupby(['content_id']).size().sort_values(ascending=True).describe()

In [None]:
oldtaxons_predictions.groupby(['content_id']).size().sort_values(ascending=True).describe()

In [None]:
predictions.groupby(['content_id']).size().sort_values(ascending=True).describe()

In [None]:
predictions[['base_path', 'taxon2label', 'title', 'description']][100:150]


In [None]:
predictions[predictions['content_id']=='5c91ea82-7631-11e4-a3cb-005056011aef']


In [None]:
predictions[predictions['content_id']=='5d87c00c-7631-11e4-a3cb-005056011aef']


In [None]:
predictions[['base_path', 'taxon2label', 'title', 'description']].to_csv(os.path.join(DATADIR, 'new_predictions50.csv.gz'),compression='gzip',index=False)

In [None]:
taxons = predictions['taxon2label'].unique()
subsample = pd.DataFrame(columns = predictions.columns)

for taxon in taxons:
    taxon_spec = predictions.loc[predictions['taxon2label']==taxon]
    sample_size = int(round(taxon_spec.shape[0]*0.1))
    if not sample_size == 0 and taxon_spec.shape[0] > sample_size:
        print(taxon,": SAMPLING AT:",sample_size)
        subsample = subsample.append(taxon_spec.sample(n=sample_size), ignore_index=True)

In [None]:
subsample[['base_path', 'taxon2label', 'title', 'description']].to_csv(os.path.join(DATADIR, 'new_predictions_subsample.csv.gz'),compression='gzip',index=False)

### 40% threshold

In [None]:
predictions40 = newprob_meta[newprob_meta['prob'] > 0.4]

predictions40.content_id.nunique()

In [None]:
predictions40.content_id.nunique()/newprob_meta.content_id.nunique()

In [None]:
predictions40.groupby('untagged_type')['content_id'].nunique()

In [None]:
who_ls

In [None]:
reset_selective -f labelled_level2

In [None]:
reset_selective -f highrisk_predictions

In [None]:
reset_selective -f highrisk_predictions

### recency

In [None]:
predictions['first_published_at'] = pd.to_datetime(predictions['first_published_at'])
predictions.index = predictions['first_published_at']

In [None]:
predictions.sort_values(['prob', 'content_id'], ascending=False)

In [None]:
predictions_dedup = predictions.drop_duplicates('content_id')
predicted = predictions_dedup['first_published_at'].resample('Y').count().plot()
predicted.set_xlim(pd.Timestamp('2000-12-31'), pd.Timestamp('2017-12-31'))
predicted.set_ylim([0, 30000])

In [None]:
# NOTE(review): `no_predictions` is not assigned anywhere in the visible
# notebook; every cell in this group fails on a fresh kernel until the cell
# that builds it is restored. The `_y` column suffixes suggest it came out of
# a merge - confirm.
# Parse the publication timestamps and use them as the index.
no_predictions['first_published_at_y'] = pd.to_datetime(no_predictions['first_published_at_y'])
no_predictions.index = no_predictions['first_published_at_y']

In [None]:
no_predictions['content_id'].nunique()

In [None]:
# Display only: the sorted frame is not stored.
no_predictions.sort_values(['prob_y', 'content_id'], ascending=False)

In [None]:
# One row per content item (first occurrence in the current row order).
no_predictions_dedup = no_predictions.drop_duplicates('content_id')

In [None]:
no_predictions_dedup.columns

In [None]:
# Positions 1..28 of the column index; used below to drop those columns.
# NOTE(review): hard-coded positions - presumably the left-hand (`_x`) columns
# of the merge; confirm against the column listing above.
cols =np.arange(1, 29)
cols

In [None]:
# Guidance rows only, without the columns at positions `cols`, ordered by
# ascending `prob_y`.
hardtotag_guidance = no_predictions[no_predictions['document_type_y']=='guidance'].drop(no_predictions.columns[cols], axis=1 ).sort_values('prob_y')

In [None]:
# Keeps the first row per content_id, which after the ascending sort above is
# the row with the smallest `prob_y`.
# NOTE(review): confirm that the lowest-probability row is the one wanted.
hardtotag_guidance.drop_duplicates('content_id', inplace=True)

In [None]:
tagged_guidance = predictions[predictions['document_type']=='guidance']

In [None]:
fig = plt.figure(figsize=(15, 7))
ax = hardtotag_guidance['first_published_at_y'].resample('Y').count().plot()
ax.set_xlim(pd.Timestamp('2000-12-31'), pd.Timestamp('2018-03-12'))


In [None]:
fig = plt.figure(figsize=(15, 7))
ay = tagged_guidance['first_published_at'].resample('Y').count().plot()
ay.set_xlim(pd.Timestamp('2000-12-31'), pd.Timestamp('2018-03-12'))

In [None]:
hardtotag_guidance[['content_id', 'title_y', 'description_y', 'body_y', 'prob_y', 'taxon2label_y' ]].to_csv(os.path.join(DATADIR, 'hardtotag_guidance.csv.gz'), compression='gzip')

In [None]:
not_predicted = no_predictions_dedup['first_published_at_y'].resample('Y').count().plot()
not_predicted.set_xlim(pd.Timestamp('2000-12-31'), pd.Timestamp('2017-12-31'))

In [None]:

# Two stacked panels on a common 0-10000 y-range: items with a prediction
# (top) and items without one (bottom), counted per publication year.
fig = plt.figure(figsize=(15, 7))

# --- top panel ---
plt.subplot(2, 1, 1)
plt.title('predictions at 50% threshold')
predicted = (
    predictions_dedup['first_published_at']
    .resample('Y')
    .count()
    .plot()
)
predicted.set_xlim(pd.Timestamp('2000-12-31'), pd.Timestamp('2018-03-12'))
predicted.set_ylim([0, 10000])
# The x axis is labelled on the bottom panel only.
predicted.axes.get_xaxis().set_ticklabels([])
predicted.set_xlabel('')

# --- bottom panel ---
plt.subplot(2, 1, 2)
not_predicted = (
    no_predictions_dedup['first_published_at_y']
    .resample('Y')
    .count()
    .plot()
)
not_predicted.set_xlim(pd.Timestamp('2000-12-31'), pd.Timestamp('2018-03-12'))
plt.title('No predictions above 50%')
plt.ylabel('Count')
not_predicted.set_ylim([0, 10000])

fig.tight_layout()

In [None]:

doctype_grouped = predictions_dedup.groupby(['document_type', pd.Grouper(freq='Y')])['first_published_at'].count()
count_by_year = doctype_grouped.unstack('document_type', fill_value=0)

In [None]:
top_count = count_by_year.loc[:,count_by_year.max() > 500]
ax = top_count.plot()
ax.set_xlim(pd.Timestamp('2009-12-31'), pd.Timestamp('2018-03-12'))
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

In [None]:
lab = count_by_year.stack().reset_index()
lab.columns = ['date', 'document_type', 'percent']
bydoctype_year = lab.groupby(['date', 'document_type']).agg({'percent': 'sum'})
# Change: groupby state_office and divide by sum
bydoctype_pcts = bydoctype_year.groupby(level=0).apply(lambda x:
                                                 100 * x / float(x.sum())).unstack('document_type', fill_value=0)

bydoctype_pcts.columns = bydoctype_pcts.columns.droplevel(0)

top_pct = bydoctype_pcts[['organisation', 
                          'person', 
                          'placeholder_person', 
                          'regulation', 
                          'world_news_story', 
                          'news_story', 
                          'foi_release',
                          'guidance',
                          'national_statistics',
                          'official_statistics', 
                          'press_release',
                          'transparency'
 ]]

In [None]:
ut = top_pct.plot(kind='area', stacked=True)
ut.set_xlim(pd.Timestamp('2009-12-31'), pd.Timestamp('2018-03-12'))
ut.legend(loc='center left', bbox_to_anchor=(1, 0.5))

In [None]:
doctype_grouped = no_predictions_dedup.groupby(['document_type_y', pd.Grouper(freq='Y')])['first_published_at_y'].count()
count_by_year = doctype_grouped.unstack('document_type_y', fill_value=0)

top_count = count_by_year.loc[:,count_by_year.max() > 300]
ax = top_count.plot()
ax.set_xlim(pd.Timestamp('2009-12-31'), pd.Timestamp('2018-03-12'))
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

In [None]:
ax = predictions_dedup['first_published_at'].resample('Y').count().plot()
ax.set_xlim(pd.Timestamp('2000-12-31'), pd.Timestamp('2018-03-12'))

In [None]:
lab = count_by_year.stack().reset_index()
lab.columns = ['date', 'document_type', 'percent']
bydoctype_year = lab.groupby(['date', 'document_type']).agg({'percent': 'sum'})
# Change: groupby state_office and divide by sum
bydoctype_pcts = bydoctype_year.groupby(level=0).apply(lambda x:
                                                 100 * x / float(x.sum())).unstack('document_type', fill_value=0)

bydoctype_pcts.columns = bydoctype_pcts.columns.droplevel(0)

top_pct = bydoctype_pcts[['organisation', 
                          'person', 
                          'placeholder_person', 
                          'regulation', 
                          'world_news_story', 
                          'news_story', 
                          'foi_release',
                          'guidance',
                          'national_statistics',
                          'official_statistics', 
                          'press_release',
                          'transparency'
 ]]



In [None]:
ut = top_pct.plot(kind='area', stacked=True)
ut.set_xlim(pd.Timestamp('2009-12-31'), pd.Timestamp('2018-03-12'))
ut.legend(loc='center left', bbox_to_anchor=(1, 0.5))