In [10]:
import pandas as pd
import numpy as np
import os

from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score 
from sklearn.metrics import precision_recall_fscore_support, classification_report

import matplotlib.pyplot as plt
%matplotlib inline

import functools

import h5py

In [11]:
P_THRESHOLD=0.5

### Which data

In [12]:
DATADIR = os.getenv('DATADIR')

In [13]:
print("model v2.0.0 was run on data dated {}".format(DATADIR))

model v2.0.0 was run on data dated /data/2018-03-22


### Get full content to enable total counts

In [14]:
full_content = pd.read_csv(
    os.path.join(DATADIR, 'full_content.csv.gz'),
    dtype=object,
    compression='gzip'
)

In [15]:
full_content.shape

(197951, 25)

In [16]:
full_content.columns

Index(['base_path', 'content_id', 'content_purpose_document_supertype',
       'content_purpose_subgroup', 'content_purpose_supergroup', 'description',
       'details', 'document_type', 'email_document_supertype',
       'first_published_at', 'government_document_supertype', 'locale',
       'navigation_document_supertype', 'public_updated_at', 'publishing_app',
       'publishing_scheduled_at', 'search_user_need_document_supertype',
       'title', 'updated_at', 'user_journey_document_supertype',
       'document_type_gp', 'taxons', 'primary_publishing_organisation', 'body',
       'combined_text'],
      dtype='object')

In [17]:
full_content.content_id.nunique()

197951

### Get mappings of taxon2 code to taxon2 string

In [18]:
labelled_level2 = pd.read_csv(
    os.path.join(DATADIR, 'labelled_level2.csv.gz'),
    dtype=object,
    compression='gzip'
)# Create World taxon in case any items not identified 
# through doc type in clean_content are still present
labelled_level2.loc[labelled_level2['level1taxon'] == 'World', 'level2taxon'] = 'world_level1'

# creating categorical variable for level2taxons from values
labelled_level2['level2taxon'] = labelled_level2['level2taxon'].astype('category')

# Add 1 because of zero-indexing to get 1-number of level2taxons as numerical targets
labelled_level2['level2taxon_code'] = labelled_level2.level2taxon.astype('category').cat.codes + 1

# create dictionary of taxon category code to string label for use in model evaluation
labels_index = dict(zip((labelled_level2['level2taxon_code']),
                        labelled_level2['level2taxon']))

In [19]:
labelled_level2.shape

(180391, 33)

In [20]:
labelled_level2.taxon_id.nunique()

2725

In [21]:
labelled_level2.content_id.nunique()

129611

### read in predictions & content_id array

#### new

In [22]:
new_pred = pd.read_csv(
    os.path.join(DATADIR, 'new_predictions_1226_2203_.csv.gz'),
    dtype=object,
    compression='gzip'
)

In [23]:
new_arrays = np.load(os.path.join(DATADIR, 'new_arrays.npz'))

In [24]:
if new_arrays['content_id'].shape[0] == new_pred.shape[0]:
    new_pred['content_id'] = new_arrays['content_id']
else:
    print("warning: true_train and content_id may not originate from same data")
    
    

In [25]:
new_pred.shape

(56568, 218)

In [26]:
new_pred.columns

Index(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
       ...
       '209', '210', '211', '212', '213', '214', '215', '216', '217',
       'content_id'],
      dtype='object', length=218)

#### level1

In [27]:
level1_pred = pd.read_csv(
    os.path.join(DATADIR, 'level1_predictions_1226_2203_.csv.gz'),
    dtype=object,
    compression='gzip'
)

In [28]:
level1_arrays = np.load(os.path.join(DATADIR, 'level1_arrays.npz'))

In [29]:
if level1_arrays['content_id'].shape[0] == level1_pred.shape[0]:
    level1_pred['content_id'] = level1_arrays['content_id']
else:
    print("warning: true_train and content_id may not originate from same data")
    
    

In [30]:
level1_pred.shape

(17937, 218)

#### Predictions on all unlabelled

In [31]:
unlabelled_pred = pd.concat([new_pred, level1_pred])

In [32]:
unlabelled_pred.shape

(74505, 218)

### Read in and prepare metadata

#### new

In [33]:
new_content = pd.read_csv(
    os.path.join(DATADIR, 'new_content.csv.gz'),
    dtype=object,
    compression='gzip'
)

In [34]:
new_content.shape

(56568, 27)

In [35]:
new_content.content_id.nunique()

56568

In [36]:
new_content.columns

Index(['base_path', 'body', 'combined_text', 'content_id',
       'content_purpose_document_supertype', 'content_purpose_subgroup',
       'content_purpose_supergroup', 'description', 'details', 'document_type',
       'document_type_gp', 'email_document_supertype', 'first_published_at',
       'government_document_supertype', 'locale',
       'navigation_document_supertype', 'primary_publishing_organisation',
       'public_updated_at', 'publishing_app', 'publishing_scheduled_at',
       'search_user_need_document_supertype', 'taxon_id', 'taxons', 'title',
       'untagged_type', 'updated_at', 'user_journey_document_supertype'],
      dtype='object')

In [37]:
new_content.drop('taxons', axis=1, inplace=True)

In [38]:
new_content.columns

Index(['base_path', 'body', 'combined_text', 'content_id',
       'content_purpose_document_supertype', 'content_purpose_subgroup',
       'content_purpose_supergroup', 'description', 'details', 'document_type',
       'document_type_gp', 'email_document_supertype', 'first_published_at',
       'government_document_supertype', 'locale',
       'navigation_document_supertype', 'primary_publishing_organisation',
       'public_updated_at', 'publishing_app', 'publishing_scheduled_at',
       'search_user_need_document_supertype', 'taxon_id', 'title',
       'untagged_type', 'updated_at', 'user_journey_document_supertype'],
      dtype='object')

In [39]:
new_content['level1taxon'] = ''

#### level1 

In [40]:
level1_content = pd.read_csv(
    os.path.join(DATADIR, 'labelled_level1.csv.gz'),
    dtype=object,
    compression='gzip'
)

In [41]:
level1_content.shape

(17937, 32)

In [42]:
level1_content.content_id.nunique()

17012

In [43]:
level1_content['untagged_type'] = 'level1only'

In [44]:
level1_content.columns

Index(['base_path', 'content_id', 'content_purpose_document_supertype',
       'content_purpose_subgroup', 'content_purpose_supergroup', 'description',
       'details', 'document_type', 'email_document_supertype',
       'first_published_at', 'government_document_supertype', 'locale',
       'navigation_document_supertype', 'public_updated_at', 'publishing_app',
       'publishing_scheduled_at', 'search_user_need_document_supertype',
       'title', 'updated_at', 'user_journey_document_supertype',
       'document_type_gp', 'primary_publishing_organisation', 'body',
       'combined_text', 'taxon_id', 'taxon_base_path', 'taxon_name',
       'level1taxon', 'level2taxon', 'level3taxon', 'level4taxon',
       'level5taxon', 'untagged_type'],
      dtype='object')

In [45]:
level1_content = level1_content[['base_path', 'body', 'combined_text', 'content_id',
       'content_purpose_document_supertype', 'content_purpose_subgroup',
       'content_purpose_supergroup', 'description', 'details', 'document_type',
       'document_type_gp', 'email_document_supertype', 'first_published_at',
       'government_document_supertype', 'locale',
       'navigation_document_supertype', 'primary_publishing_organisation',
       'public_updated_at', 'publishing_app', 'publishing_scheduled_at',
       'search_user_need_document_supertype', 'taxon_id',  'title',
       'untagged_type', 'updated_at', 'user_journey_document_supertype', 'level1taxon']]

In [46]:
content = pd.concat([new_content, level1_content])

In [47]:
content.shape

(74505, 27)

In [48]:
content.content_id.nunique()

73580

In [49]:
content.groupby('untagged_type').size().sort_values()

untagged_type
level1only    17937
untagged      56568
dtype: int64

In [50]:
content.groupby('publishing_app').size().sort_values()

publishing_app
frontend                    1
smartanswers                6
manuals-publisher          36
policy-publisher          204
hmrc-manuals-api          220
publisher                 722
specialist-publisher     2867
whitehall               70449
dtype: int64

### Combine predictions with metadata

In [51]:
unlabelled_prob_by_id = pd.melt(unlabelled_pred, 
                        id_vars=['content_id'], var_name='taxon2', value_name='prob')

In [52]:
unlabelled_prob_by_id .shape

(16167585, 3)

In [53]:
#convert from string to numeric
unlabelled_prob_by_id ['prob'] = pd.to_numeric(unlabelled_prob_by_id ['prob'])
unlabelled_prob_by_id ['taxon2'] = pd.to_numeric(unlabelled_prob_by_id ['taxon2'])

In [54]:
#get the label for the level2 taxon code
unlabelled_prob_by_id ['taxon2label'] = unlabelled_prob_by_id ['taxon2'].map(labels_index)

In [55]:
unlabelled_meta = pd.merge(
    left=unlabelled_prob_by_id ,
    right=content,
    on='content_id',
    how='left',
    indicator=True, 
    validate='m:m'
)

In [56]:
unlabelled_meta.columns

Index(['content_id', 'taxon2', 'prob', 'taxon2label', 'base_path', 'body',
       'combined_text', 'content_purpose_document_supertype',
       'content_purpose_subgroup', 'content_purpose_supergroup', 'description',
       'details', 'document_type', 'document_type_gp',
       'email_document_supertype', 'first_published_at',
       'government_document_supertype', 'locale',
       'navigation_document_supertype', 'primary_publishing_organisation',
       'public_updated_at', 'publishing_app', 'publishing_scheduled_at',
       'search_user_need_document_supertype', 'taxon_id', 'title',
       'untagged_type', 'updated_at', 'user_journey_document_supertype',
       'level1taxon', '_merge'],
      dtype='object')

In [57]:
unlabelled_meta.shape

(16579017, 31)

In [58]:
unlabelled_meta.content_id.nunique()

73580

##### categorical var for probaiblity

In [59]:
unlabelled_meta['prob_cat'] = '>=0.8'
unlabelled_meta.loc[unlabelled_meta['prob']<0.01, 'prob_cat'] = '<0.01'
unlabelled_meta.loc[(unlabelled_meta['prob']>=0.01) & (unlabelled_meta['prob']<0.1), 'prob_cat'] = '0.01-0.09'
unlabelled_meta.loc[(unlabelled_meta['prob']>=0.1) & (unlabelled_meta['prob']<0.2), 'prob_cat'] = '0.1-0.19'
unlabelled_meta.loc[(unlabelled_meta['prob']>=0.2) & (unlabelled_meta['prob']<0.3), 'prob_cat'] = '0.2-0.29'
unlabelled_meta.loc[(unlabelled_meta['prob']>=0.3) & (unlabelled_meta['prob']<0.4), 'prob_cat'] = '0.3-0.39'
unlabelled_meta.loc[(unlabelled_meta['prob']>=0.4) & (unlabelled_meta['prob']<0.5), 'prob_cat'] = '0.4-0.49'
unlabelled_meta.loc[(unlabelled_meta['prob']>=0.5) & (unlabelled_meta['prob']<0.6), 'prob_cat'] = '0.5-0.59'
unlabelled_meta.loc[(unlabelled_meta['prob']>=0.6) & (unlabelled_meta['prob']<0.7), 'prob_cat'] = '0.6-0.69'
unlabelled_meta.loc[(unlabelled_meta['prob']>=0.7) & (unlabelled_meta['prob']<0.8), 'prob_cat'] = '0.7-0.79'

In [60]:
unlabelled_meta.groupby(['prob_cat']).content_id.nunique()

prob_cat
0.01-0.09    50593
0.1-0.19     27283
0.2-0.29     18070
0.3-0.39     13592
0.4-0.49      9952
0.5-0.59      8180
0.6-0.69      7399
0.7-0.79      7378
<0.01        73576
>=0.8        42580
Name: content_id, dtype: int64

## Analyses
### count content items

In [61]:
unlabelled_meta.groupby('untagged_type')['content_id'].nunique()

untagged_type
level1only    17012
untagged      56568
Name: content_id, dtype: int64

### predict taxon at 50%

In [62]:
predictions = unlabelled_meta[unlabelled_meta['prob'] > 0.5]

In [63]:
print("{} out of {} ({}%) unlabelled content items were tagged at 50%".format(
    predictions.content_id.nunique(),
    unlabelled_meta.content_id.nunique(),
    predictions.content_id.nunique()/unlabelled_meta.content_id.nunique()*100))

55579 out of 73580 (75.53547159554228%) unlabelled content items were tagged at 50%


In [64]:
print("There would be a residual {} items that are hard-to-tag".format(
    unlabelled_meta.content_id.nunique() - predictions.content_id.nunique()))

There would be a residual 18001 items that are hard-to-tag


In [65]:
predictions.groupby('untagged_type')['content_id'].nunique()

untagged_type
level1only    12968
untagged      42611
Name: content_id, dtype: int64

In [66]:
print("the total number of items tagged to level 2 or lower would be {}".format(
labelled_level2.content_id.nunique() + predictions.content_id.nunique() ))

the total number of items tagged to level 2 or lower would be 185190


In [67]:
print("{} items would be tagged to level 2 or lower out of {} ({}%) in-scope items".format(
labelled_level2.content_id.nunique() + predictions.content_id.nunique(), 
full_content.content_id.nunique(),
(labelled_level2.content_id.nunique() + predictions.content_id.nunique())/full_content.content_id.nunique() * 100
))

185190 items would be tagged to level 2 or lower out of 197951 (93.55345514799117%) in-scope items


In [69]:
print("the ML would have tagged {}  out of {} ({}%) in-scope items".format(
predictions.content_id.nunique(), 
full_content.content_id.nunique(),
predictions.content_id.nunique()/full_content.content_id.nunique() * 100
))

the ML would have tagged 55579  out of 197951 (28.077150405908537%) in-scope items


In [None]:
full_content.content_id.nunique()

### constrain predictions to well-performng taxons

those with an F1 score of at least 0.8

In [73]:
dev_metrics = pd.read_csv(
    os.path.join(DATADIR, "dev_metrics.csv.gz"),
    dtype=object,
    compression='gzip'
)

In [74]:
dev_metrics['f1'] = pd.to_numeric(dev_metrics['f1'])

In [75]:
good_taxons = dev_metrics[dev_metrics['f1']>=0.8]['taxon2label']

In [76]:
lowrisk_predictions = predictions.loc[predictions['taxon2label'].isin(good_taxons)]

In [77]:
lowrisk_predictions.shape

(45997, 32)

In [78]:
print("{} out of {} ({}%) items tagged at 50% to low risk taxons".format(
      lowrisk_predictions.content_id.nunique(),
      unlabelled_meta.content_id.nunique(),
      lowrisk_predictions.content_id.nunique()/unlabelled_meta.content_id.nunique()*100))

38419 out of 73580 (52.213916825224246%) items tagged at 50% to low risk taxons


In [80]:
print("{} items would be tagged to level 2 or lower out of {} ({}%) in-scope items".format(
labelled_level2.content_id.nunique() + lowrisk_predictions.content_id.nunique(), 
full_content.content_id.nunique(),
(labelled_level2.content_id.nunique() + lowrisk_predictions.content_id.nunique())/full_content.content_id.nunique() * 100
))

168030 items would be tagged to level 2 or lower out of 197951 (84.88464316926915%) in-scope items


In [79]:
print("the ML would have tagged {}  out of {} ({}%) in-scope items".format(
lowrisk_predictions.content_id.nunique(), 
full_content.content_id.nunique(),
lowrisk_predictions.content_id.nunique()/full_content.content_id.nunique() * 100
))

the ML would have tagged 38419  out of 197951 (19.408338427186525%) in-scope items


In [81]:
print("There would be a residual {} items that are hard-to-tag".format(
    unlabelled_meta.content_id.nunique() - lowrisk_predictions.content_id.nunique()))

There would be a residual 35161 items that are hard-to-tag


exclude UK economy but keep others, based on content strategists' appraisal of predcitions

In [83]:
no_uk_economy = predictions.loc[predictions['taxon2label'] != 'UK economy']

In [84]:
print("{} out of {} ({}%) items tagged at 50% excluding UK economy".format(
      no_uk_economy.content_id.nunique(),
      unlabelled_meta.content_id.nunique(),
      no_uk_economy.content_id.nunique()/unlabelled_meta.content_id.nunique()*100))

53906 out of 73580 (73.2617559119326%) items tagged at 50% excluding UK economy


In [85]:
print("{} out of {} ({}%) items tagged at 50% excluding UK economy".format(
      no_uk_economy.content_id.nunique(),
      unlabelled_meta.content_id.nunique(),
      no_uk_economy.content_id.nunique()/unlabelled_meta.content_id.nunique()*100))

53906 out of 73580 (73.2617559119326%) items tagged at 50% excluding UK economy


In [86]:
print("{} items would be tagged to level 2 or lower out of {} ({}%) in-scope items".format(
labelled_level2.content_id.nunique() + no_uk_economy.content_id.nunique(), 
full_content.content_id.nunique(),
(labelled_level2.content_id.nunique() + no_uk_economy.content_id.nunique())/full_content.content_id.nunique() * 100
))

183517 items would be tagged to level 2 or lower out of 197951 (92.7082964976181%) in-scope items


In [87]:
print("There would be a residual {} items that are hard-to-tag".format(
    unlabelled_meta.content_id.nunique() - no_uk_economy.content_id.nunique()))

There would be a residual 19674 items that are hard-to-tag


In [88]:
print("the ML would have tagged {}  out of {} ({}%) in-scope items".format(
no_uk_economy.content_id.nunique(), 
full_content.content_id.nunique(),
no_uk_economy.content_id.nunique()/full_content.content_id.nunique() * 100
))

the ML would have tagged 53906  out of 197951 (27.23199175553546%) in-scope items


### taxons with no content in dev set so F1=0

In [None]:
unknown_performance = dev_metrics[dev_metrics['f1']==0]['taxon2label']

In [None]:
highrisk_predictions = predictions.loc[predictions['taxon2label'].isin(unknown_performance)]

In [None]:
highrisk_predictions.shape

In [None]:
highrisk_predictions.content_id.nunique()

In [None]:
highrisk_predictions[['taxon2label', 'title', 'description', 'body']]

In [None]:
highrisk_predictions[['taxon2label', 'title', 'description', 'body']].to_csv(os.path.join(DATADIR, 'zerof1predictions.csv.gz'),compression='gzip',index=False)

### taxons with 0.8>F1>0

In [None]:
dev_metrics[(dev_metrics['f1']<0.8) & (dev_metrics['f1']>0)]['taxon2label'].shape

In [None]:
list(dev_metrics[(dev_metrics['f1']<0.8) & (dev_metrics['f1']>0)]['taxon2label'])

In [None]:
list(dev_metrics[dev_metrics['f1']==0]['taxon2label'])

In [None]:
atleast1pct = unlabelled_meta[unlabelled_meta['prob']>=0.01]
atleast1pct.shape

In [None]:
atleast1pct['prob'].hist(range=(0, 1), figsize=(30, 10), bins=500)

In [None]:
atleast1pct['prob'].hist(by=atleast1pct['untagged_type'], range=(0, 1), figsize=(20, 10), bins=500)

In [None]:
#TODO: work out how to standardise ylim to 0, 3000 
atleast1pct['prob'].hist(by=atleast1pct['taxon2label'], range=(0, 1), figsize=(50, 50), bins=50)

In [None]:
78696/101912

In [None]:
0.77*0.35

In [None]:
untagged_predictions = predictions[predictions['untagged_type']=='untagged']
oldtaxons_predictions = predictions[predictions['untagged_type']!='untagged']

In [None]:
untagged_predictions.groupby(['content_id']).size().sort_values(ascending=True).describe()

In [None]:
unlabelled_meta['prob'].hist(by=unlabelled_meta['taxon2label'], figsize=(50, 50), bins=50)

In [None]:
labelled_level2.groupby(['content_id']).size().sort_values(ascending=True).describe()

In [None]:
oldtaxons_predictions.groupby(['content_id']).size().sort_values(ascending=True).describe()

In [None]:
predictions.groupby(['content_id']).size().sort_values(ascending=True).describe()

In [None]:
predictions[['base_path', 'taxon2label', 'title', 'description']][100:150]


In [None]:
predictions[predictions['content_id']=='5c91ea82-7631-11e4-a3cb-005056011aef']


In [None]:
predictions[predictions['content_id']=='5d87c00c-7631-11e4-a3cb-005056011aef']


In [None]:
predictions[['base_path', 'taxon2label', 'title', 'description']].to_csv(os.path.join(DATADIR, 'new_predictions50.csv.gz'),compression='gzip',index=False)

In [None]:
taxons = predictions['taxon2label'].unique()
subsample = pd.DataFrame(columns = predictions.columns)

for taxon in taxons:
    taxon_spec = predictions.loc[predictions['taxon2label']==taxon]
    sample_size = int(round(taxon_spec.shape[0]*0.1))
    if not sample_size == 0 and taxon_spec.shape[0] > sample_size:
        print(taxon,": SAMPLING AT:",sample_size)
        subsample = subsample.append(taxon_spec.sample(n=sample_size), ignore_index=True)

In [None]:
subsample[['base_path', 'taxon2label', 'title', 'description']].to_csv(os.path.join(DATADIR, 'new_predictions_subsample.csv.gz'),compression='gzip',index=False)

### 40% threshold

In [None]:
predictions40 = newprob_meta[newprob_meta['prob'] > 0.4]

predictions40.content_id.nunique()

In [None]:
predictions40.content_id.nunique()/newprob_meta.content_id.nunique()

In [None]:
predictions40.groupby('untagged_type')['content_id'].nunique()

### hard to tag

In [None]:
who_ls

In [None]:
reset_selective -f labelled_level2

In [None]:
reset_selective -f highrisk_predictions

In [None]:
reset_selective -f highrisk_predictions

In [None]:
predictions = predictions.drop(labels='_merge', axis=1).copy()

In [None]:
unlabelled_meta = unlabelled_meta.drop(labels='_merge', axis=1).copy()

In [None]:
no_predictions = pd.merge(
    left=predictions,
    right=unlabelled_meta,
    on='content_id',
    how='outer',
    indicator=True
)

In [None]:
no_predictions.groupby('_merge').size()

In [None]:
no_predictions[no_predictions['document_type_y']=='guidance'].content_id.nunique()

In [None]:
predictions[predictions['document_type']=='guidance'].content_id.nunique()

In [None]:
6029-1033

In [None]:
4996/6029

In [None]:
no_predictions = no_predictions[no_predictions['_merge']=='right_only']

In [None]:
predictions.content_id.nunique()

In [None]:
no_predictions.content_id.nunique() + predictions.content_id.nunique()

In [None]:
no_predictions.content_id.nunique()

In [None]:
newprob_meta.content_id.nunique()

### recency

In [None]:
predictions['first_published_at'] = pd.to_datetime(predictions['first_published_at'])
predictions.index = predictions['first_published_at']

In [None]:
predictions.sort_values(['prob', 'content_id'], ascending=False)

In [None]:
predictions_dedup = predictions.drop_duplicates('content_id')
predicted = predictions_dedup['first_published_at'].resample('Y').count().plot()
predicted.set_xlim(pd.Timestamp('2000-12-31'), pd.Timestamp('2017-12-31'))
predicted.set_ylim([0, 30000])

In [None]:
no_predictions['first_published_at_y'] = pd.to_datetime(no_predictions['first_published_at_y'])
no_predictions.index = no_predictions['first_published_at_y']

In [None]:
no_predictions['content_id'].nunique()

In [None]:
no_predictions.sort_values(['prob_y', 'content_id'], ascending=False)

In [None]:
no_predictions_dedup = no_predictions.drop_duplicates('content_id')

In [None]:
no_predictions_dedup.columns

In [None]:
cols =np.arange(1, 29)
cols

In [None]:
hardtotag_guidance = no_predictions[no_predictions['document_type_y']=='guidance'].drop(no_predictions.columns[cols], axis=1 ).sort_values('prob_y')

In [None]:
hardtotag_guidance.drop_duplicates('content_id', inplace=True)

In [None]:
tagged_guidance = predictions[predictions['document_type']=='guidance']

In [None]:
fig = plt.figure(figsize=(15, 7))
ax = hardtotag_guidance['first_published_at_y'].resample('Y').count().plot()
ax.set_xlim(pd.Timestamp('2000-12-31'), pd.Timestamp('2018-03-12'))


In [None]:
fig = plt.figure(figsize=(15, 7))
ay = tagged_guidance['first_published_at'].resample('Y').count().plot()
ay.set_xlim(pd.Timestamp('2000-12-31'), pd.Timestamp('2018-03-12'))

In [None]:
hardtotag_guidance[['content_id', 'title_y', 'description_y', 'body_y', 'prob_y', 'taxon2label_y' ]].to_csv(os.path.join(DATADIR, 'hardtotag_guidance.csv.gz'), compression='gzip')

In [None]:
not_predicted = no_predictions_dedup['first_published_at_y'].resample('Y').count().plot()
not_predicted.set_xlim(pd.Timestamp('2000-12-31'), pd.Timestamp('2017-12-31'))

In [None]:

fig = plt.figure(figsize=(15, 7))

plt.subplot(2, 1, 1)
plt.title('predictions at 50% threshold')
predicted = predictions_dedup['first_published_at'].resample('Y').count().plot()
predicted.set_xlim(pd.Timestamp('2000-12-31'), pd.Timestamp('2018-03-12'))
predicted.set_ylim([0, 10000])
predicted.axes.get_xaxis().set_ticklabels([])
predicted.set_xlabel('')

plt.subplot(2, 1, 2)
not_predicted = no_predictions_dedup['first_published_at_y'].resample('Y').count().plot()
not_predicted.set_xlim(pd.Timestamp('2000-12-31'), pd.Timestamp('2018-03-12'))
plt.title('No predictions above 50%')
plt.ylabel('Count')
not_predicted.set_ylim([0, 10000])






fig.tight_layout()

In [None]:

doctype_grouped = predictions_dedup.groupby(['document_type', pd.Grouper(freq='Y')])['first_published_at'].count()
count_by_year = doctype_grouped.unstack('document_type', fill_value=0)

In [None]:
top_count = count_by_year.loc[:,count_by_year.max() > 500]
ax = top_count.plot()
ax.set_xlim(pd.Timestamp('2009-12-31'), pd.Timestamp('2018-03-12'))
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

In [None]:
lab = count_by_year.stack().reset_index()
lab.columns = ['date', 'document_type', 'percent']
bydoctype_year = lab.groupby(['date', 'document_type']).agg({'percent': 'sum'})
# Change: groupby state_office and divide by sum
bydoctype_pcts = bydoctype_year.groupby(level=0).apply(lambda x:
                                                 100 * x / float(x.sum())).unstack('document_type', fill_value=0)

bydoctype_pcts.columns = bydoctype_pcts.columns.droplevel(0)

top_pct = bydoctype_pcts[['organisation', 
                          'person', 
                          'placeholder_person', 
                          'regulation', 
                          'world_news_story', 
                          'news_story', 
                          'foi_release',
                          'guidance',
                          'national_statistics',
                          'official_statistics', 
                          'press_release',
                          'transparency'
 ]]

In [None]:
ut = top_pct.plot(kind='area', stacked=True)
ut.set_xlim(pd.Timestamp('2009-12-31'), pd.Timestamp('2018-03-12'))
ut.legend(loc='center left', bbox_to_anchor=(1, 0.5))

In [None]:
doctype_grouped = no_predictions_dedup.groupby(['document_type_y', pd.Grouper(freq='Y')])['first_published_at_y'].count()
count_by_year = doctype_grouped.unstack('document_type_y', fill_value=0)

top_count = count_by_year.loc[:,count_by_year.max() > 300]
ax = top_count.plot()
ax.set_xlim(pd.Timestamp('2009-12-31'), pd.Timestamp('2018-03-12'))
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

In [None]:
ax = predictions_dedup['first_published_at'].resample('Y').count().plot()
ax.set_xlim(pd.Timestamp('2000-12-31'), pd.Timestamp('2018-03-12'))

In [None]:
lab = count_by_year.stack().reset_index()
lab.columns = ['date', 'document_type', 'percent']
bydoctype_year = lab.groupby(['date', 'document_type']).agg({'percent': 'sum'})
# Change: groupby state_office and divide by sum
bydoctype_pcts = bydoctype_year.groupby(level=0).apply(lambda x:
                                                 100 * x / float(x.sum())).unstack('document_type', fill_value=0)

bydoctype_pcts.columns = bydoctype_pcts.columns.droplevel(0)

top_pct = bydoctype_pcts[['organisation', 
                          'person', 
                          'placeholder_person', 
                          'regulation', 
                          'world_news_story', 
                          'news_story', 
                          'foi_release',
                          'guidance',
                          'national_statistics',
                          'official_statistics', 
                          'press_release',
                          'transparency'
 ]]



In [None]:
ut = top_pct.plot(kind='area', stacked=True)
ut.set_xlim(pd.Timestamp('2009-12-31'), pd.Timestamp('2018-03-12'))
ut.legend(loc='center left', bbox_to_anchor=(1, 0.5))