In [1]:
import pandas as pd
import numpy as np
import os

from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score 
from sklearn.metrics import precision_recall_fscore_support, classification_report

import matplotlib.pyplot as plt
%matplotlib inline

import functools

import h5py

### Environmental vars

In [2]:
P_THRESHOLD=0.8

In [3]:
DATADIR=os.getenv('DATADIR')

### get some data about taxons/content

In [4]:
labelled_level2 = pd.read_csv(
    os.path.join(DATADIR, 'labelled_level2.csv.gz'),
    dtype=object,
    compression='gzip'
)

In [5]:
# Create World taxon in case any items not identified 
# through doc type in clean_content are still present
labelled_level2.loc[labelled_level2['level1taxon'] == 'World', 'level2taxon'] = 'world_level1'

In [6]:
# creating categorical variable for level2taxons from values
labelled_level2['level2taxon'] = labelled_level2['level2taxon'].astype('category')

# Add 1 because of zero-indexing to get 1-number of level2taxons as numerical targets
labelled_level2['level2taxon_code'] = labelled_level2.level2taxon.astype('category').cat.codes + 1

# create dictionary of taxon category code to string label for use in model evaluation
labels_index = dict(zip((labelled_level2['level2taxon_code']),
                        labelled_level2['level2taxon']))

### Load in data

In [7]:
true_train = pd.read_csv(os.path.join('/data', 'true_train_1149_0603_.csv.gz'), dtype=float, compression='gzip')
pred_train = pd.read_csv(os.path.join(DATADIR, 'train_results_1149_0603_.csv.gz'), dtype=float, compression='gzip')

FileNotFoundError: [Errno 2] No such file or directory: '/data/2018-03-05/true_train_1149_0603_.csv.gz'

In [None]:
true_dev = pd.read_csv(os.path.join(DATADIR, 'true_dev_1149_0603_.csv.gz'), dtype=float, compression='gzip')
prob_dev = pd.read_csv(os.path.join(DATADIR, 'dev_results_1149_0603_.csv.gz'), dtype=float, compression='gzip')

In [None]:
train = np.load(os.path.join(DATADIR, 'train_arrays.npz'))

In [None]:
dev = np.load(os.path.join(DATADIR, 'dev_arrays.npz'))

In [None]:
train.files

In [None]:
train['content_id']

In [None]:
train['content_id'].shape

In [None]:
true_train.shape[0]

### Evaluate model

#### Training metrics

In [None]:
y_pred = pred_train.copy()

In [None]:
y_pred[y_pred>=P_THRESHOLD] = 1
y_pred[y_pred<P_THRESHOLD] = 0

In [None]:
print('micro: {}'.format(precision_recall_fscore_support(true_train, y_pred, average='micro', sample_weight=None)))
print('macro: {}'.format(precision_recall_fscore_support(true_train, y_pred, average='macro', sample_weight=None)))
print('weightedmacro: {}'.format(precision_recall_fscore_support(true_train, y_pred, average='weighted', sample_weight=None)))

In [None]:
eval_metrics = precision_recall_fscore_support(true_train, y_pred, average=None, sample_weight=None)
eval_metrics_df = pd.DataFrame(list(eval_metrics))

In [None]:
train_metrics = eval_metrics_df.transpose()
train_metrics.columns = ['precision', 'recall', 'f1', 'support']
train_metrics['level2code'] = train_metrics.index +1
train_metrics['level2label'] = train_metrics['level2code'].map(labels_index)

#### Development set metrics

In [None]:
pred_dev = prob_dev.copy()

In [None]:
pred_dev[pred_dev>=P_THRESHOLD] = 1
pred_dev[pred_dev<P_THRESHOLD] = 0

In [None]:
print('micro: {}'.format(precision_recall_fscore_support(true_dev, pred_dev, average='micro', sample_weight=None)))
print('macro: {}'.format(precision_recall_fscore_support(true_dev, pred_dev, average='macro', sample_weight=None)))
print('weightedmacro: {}'.format(precision_recall_fscore_support(true_dev, pred_dev, average='weighted', sample_weight=None)))

In [None]:
eval_metrics = precision_recall_fscore_support(true_dev, pred_dev, average=None, sample_weight=None)
eval_metrics_df = pd.DataFrame(list(eval_metrics))

In [None]:
dev_metrics = eval_metrics_df.transpose()
dev_metrics.columns = ['precision', 'recall', 'f1', 'support']
dev_metrics['taxon2code'] = dev_metrics.index +1
dev_metrics['taxon2label'] = dev_metrics['taxon2code'].map(labels_index)

In [None]:
dev_metrics[dev_metrics['f1']==0].shape[0]

In [None]:
dev_metrics[dev_metrics['f1']==1].shape[0]

In [None]:
dev_metrics[dev_metrics['f1']==1]

In [None]:
high = dev_metrics[dev_metrics['f1']>0.9]

In [None]:
morethan80 = dev_metrics[dev_metrics['f1']>0.8]

In [None]:
dev_metrics[dev_metrics['f1']<0.6].shape[0]

## support and performance

The support is the number of occurrences of each class in y_true

In [None]:
dev_metrics.plot.scatter(x='support', y='f1', marker='o', alpha=.5, figsize=(20, 20))


In [None]:
morethan80.plot.scatter(x='support', y='f1', figsize=(20, 10))

for label, x, y in zip(morethan80['taxon2label'], morethan80['support'], morethan80['f1']):
    plt.annotate(
        label,
        xy=(x, y), xytext=(-1, 1),
        textcoords='offset points', ha='right', va='bottom')
plt.show()

### look at content predicted in taxons with f1=1

In [None]:
prob_dev.head()

In [None]:
dev_metrics[dev_metrics['f1']==1].head()

In [None]:
perfect_performance = dev_metrics[dev_metrics['f1']==1].taxon2code.unique()
perfect_performance = perfect_performance.astype(str)

In [None]:
keep_perfect_performing_taxons = prob_dev[perfect_performance].copy()

In [None]:
keep_perfect_performing_taxons.shape

In [None]:
if dev['content_id'].shape[0] == keep_perfect_performing_taxons.shape[0]:
    keep_perfect_performing_taxons['content_id'] = dev['content_id']
else:
    print("warning: true_train and content_id may not originate from same data")

In [None]:
keep_perfect_performing_taxons.head()

In [None]:
# armed forces

keep_perfect_performing_taxons[keep_perfect_performing_taxons['5'] > 0.5].shape

In [None]:
perfect_by_id = pd.melt(keep_perfect_performing_taxons, 
                        id_vars=['content_id'], var_name='taxon2', value_name='prob')

In [None]:
perfect_by_id.shape

In [None]:
positive_perfect = perfect_by_id[perfect_by_id['prob']>0.5].copy()

In [None]:
positive_perfect.shape

In [None]:
positive_perfect_meta = pd.merge(
    left=positive_perfect,
    right=labelled_level2,
    on='content_id',
    how='left',
    indicator=True, 
    validate='m:m'
)

In [None]:
positive_perfect_meta['taxon2label'] = pd.to_numeric(positive_perfect_meta['taxon2']).map(labels_index)

In [None]:
positive_perfect_meta

In [None]:
print("{}% of content has been tagged to taxons which have perfect performance".format(
    positive_perfect.content_id.nunique()/keep_perfect_performing_taxons.shape[0]*100)
)

quick browse but this looks good

###  80 < taxon F1 <1 = good performance!

In [None]:
good_performance = dev_metrics[(dev_metrics['f1']<1) & (dev_metrics['f1']>0.8)].taxon2code.unique().astype(str)

In [None]:
keep_good_performing_taxons = prob_dev[good_performance].copy()

In [None]:
keep_good_performing_taxons.shape

In [None]:
if dev['content_id'].shape[0] == keep_good_performing_taxons.shape[0]:
    keep_good_performing_taxons['content_id'] = dev['content_id']
else:
    print("warning: true_train and content_id may not originate from same data")

In [None]:
good_by_id = pd.melt(keep_good_performing_taxons, 
                        id_vars=['content_id'], var_name='taxon2', value_name='prob')

In [None]:
good_by_id.shape

In [None]:
positive_good = good_by_id[good_by_id['prob']>0.5].copy()

In [None]:
positive_good.shape

In [None]:
positive_good.content_id.nunique()

In [None]:
print("{}% of content has been tagged to taxons which have good performance".format(
    positive_good.content_id.nunique()/keep_good_performing_taxons.shape[0]*100)
)

In [None]:
positive_good_meta = pd.merge(
    left=positive_good,
    right=labelled_level2,
    on='content_id',
    how='left',
    indicator=True, 
    validate='m:m'
)

In [None]:
positive_good_meta['taxon2label'] = pd.to_numeric(positive_good_meta['taxon2']).map(labels_index)

In [None]:
positive_good_meta

In [None]:
if dev['content_id'].shape[0] == keep_perfect_performing_taxons.shape[0]:
    keep_perfect_performing_taxons['content_id'] = dev['content_id']
else:
    print("warning: true_train and content_id may not originate from same data")

In [None]:
no_auto_content['support'].sort_values().plot( kind = 'barh', figsize=(20, 20))

In [None]:
f1_byclass = pd.DataFrame((a)[2], columns=['f1'])

support_byclass = pd.DataFrame((a)[3], columns=['support'])

f1_byclass = pd.merge(
    left=f1_byclass, 
    right=support_byclass, 
    left_index=True,
    right_index=True,
    how='outer', 
    validate='one_to_one'
)

f1_byclass['index_col'] = f1_byclass.index

f1_byclass['level2taxon'] = f1_byclass['index_col'].map(labels_index).copy()

print("At p_threshold of {}, there were {} out of {} ({})% taxons with auto-tagged content in the training data"
      .format(P_THRESHOLD, 
              f1_byclass.loc[f1_byclass['f1'] > 0].shape[0], 
              y_pred.shape[1], 
              (f1_byclass.loc[f1_byclass['f1'] > 0].shape[0]/y_pred.shape[1])*100 ))

In [None]:
no_auto_content = f1_byclass.loc[f1_byclass['f1'] == 0]
no_auto_content = no_auto_content.set_index('level2taxon')

In [None]:
classes_predictedto = f1_byclass.loc[f1_byclass['f1'] > 0]
classes_predictedto = classes_predictedto.set_index('level2taxon') 

In [None]:
classes_predictedto.plot.scatter(x='support', y='f1', figsize=(20, 10), xticks=np.arange(0, 9700, 100))

In [None]:
classes_predictedto['f1'].sort_values().plot( kind = 'barh', figsize=(20, 20))

In [None]:
#Calculate globally by counting the total true positives, false negatives and false positives.
precision_recall_fscore_support(y_train, y_pred, average='micro', sample_weight=None) 

In [None]:
#Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account
precision_recall_fscore_support(y_train, y_pred, average='macro', sample_weight=None)

In [None]:
#Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account
precision_recall_fscore_support(y_train, y_pred, average='weighted', sample_weight=None)

In [None]:
#Calculate globally by counting the total true positives, false negatives and false positives.
precision_recall_fscore_support(y_dev, y_pred_dev, average='micro', sample_weight=None) 

In [None]:
#Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account
precision_recall_fscore_support(y_dev, y_pred_dev, average='macro', sample_weight=None)

In [None]:
#Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account
precision_recall_fscore_support(y_dev, y_pred_dev, average='weighted', sample_weight=None)

## LOAD More data

In [None]:
if train['content_id'].shape[0] == true_train.shape[0]:
    true_train['content_id'] = train['content_id']
else:
    print("warning: true_train and content_id may not originate from same data")

In [None]:
if dev['content_id'].shape[0] == true_dev.shape[0]:
    true_dev['content_id'] = dev['content_id']
else:
    print("warning: true_dev and content_id may not originate from same data")

## Taxons not represented in training/dev

In [None]:
train_by_id = pd.melt(true_train, id_vars=['content_id'], var_name='taxon2', value_name='tagged')
positive_taxons_true_train = train_by_id[train_by_id['tagged']==1].copy()

In [None]:
dev_by_id = pd.melt(true_dev, id_vars=['content_id'], var_name='taxon2', value_name='tagged')
positive_taxons_true_dev = dev_by_id[dev_by_id['tagged']==1].copy()

In [None]:
positive_taxons_true_train['taxon2label'] = pd.to_numeric(positive_taxons_true_train['taxon2']).map(labels_index)
positive_taxons_true_dev['taxon2label'] = pd.to_numeric(positive_taxons_true_dev['taxon2']).map(labels_index)

In [None]:
positive_taxons_true_train.taxon2.nunique()

In [None]:
positive_taxons_true_dev.taxon2.nunique()

In [None]:
positive_taxons_true_train.taxon2.nunique() - positive_taxons_true_dev.taxon2.nunique()

In [None]:
not_in_dev = np.setdiff1d(positive_taxons_true_train.taxon2label, positive_taxons_true_dev.taxon2label)

np.setdiff1d(positive_taxons_true_train.taxon2label, positive_taxons_true_dev.taxon2label)

In [None]:
positive_taxons_true_train[positive_taxons_true_train['taxon2label'].isin(not_in_dev)].groupby('taxon2label').size().sort_values(ascending=False)

In [None]:
true_train = true_train.drop_duplicates(subset='content_id')

In [None]:
train_true = pd.merge(
    left=true_train,
    right=labelled_level2,
    on='content_id',
    how='left',
    indicator=True, 
    validate='1:m'
)

In [None]:
true_train.shape

In [None]:
train_true.groupby('_merge').size().sort_values(ascending=True)

In [None]:
train_true.shape

In [None]:
train_true.level2taxon.nunique()

In [None]:
labelled_level2.level2taxon.nunique()

In [None]:
pred_dev.shape

In [None]:
print(actual_train.shape)
actual_train.head()

In [None]:
print(pred_train.shape)
pred_train.head()

In [None]:
true_train.shape

In [None]:
pred_train.shape

### Produce some dummy results for testing.

In [None]:
array = np.random.randint(2,size=(100000,420))

In [None]:
df = pd.DataFrame(data=array.tolist(),columns=[i for i in range(1,421)])

In [None]:
df.head()

In [None]:
df.to_csv(os.path.join(DATADIR, 'dummy_results.csv.gz'),compression='gzip',index=False)