In [3]:
import pandas as pd
import numpy as np
import os

from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score 
from sklearn.metrics import precision_recall_fscore_support, classification_report

import matplotlib.pyplot as plt
%matplotlib inline

import functools

import h5py

### Environment variables

In [4]:
# Directory holding the result files; taken from the DATADIR environment
# variable. The value is None when the variable is not set.
DATADIR = os.environ.get('DATADIR')

### Load in data

In [6]:
# Training-set files: one with the actual labels, one with the model results.
# Every column is parsed as float.
actual_train_path = os.path.join(DATADIR, 'actual_train_1614_2302_results.csv.gz')
pred_train_path = os.path.join(DATADIR, 'train_1614_2302_results.csv.gz')

actual_train = pd.read_csv(actual_train_path, compression='gzip', dtype=float)
pred_train = pd.read_csv(pred_train_path, compression='gzip', dtype=float)

In [8]:
# Development-set files, read with the same options as the training files
# (gzip-compressed CSV, all columns as float).
csv_options = dict(dtype=float, compression='gzip')
actual_dev = pd.read_csv(
    os.path.join(DATADIR, 'actual_dev_1614_2302_results.csv.gz'), **csv_options)
pred_dev = pd.read_csv(
    os.path.join(DATADIR, 'dev_1614_2302_results.csv.gz'), **csv_options)

In [9]:
# Sanity check: (rows, columns) of the frame read from the dev results file.
pred_dev.shape

(11348, 210)

In [10]:
# Print the dimensions, then show the first rows of the frame read from the
# actual_train results file.
print(actual_train.shape)
actual_train.head()

(165784, 210)


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,201,202,203,204,205,206,207,208,209,210
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Print the dimensions, then show the first rows of the frame read from the
# train results file; its shape should match actual_train for the metrics below.
print(pred_train.shape)
pred_train.head()

(165784, 210)


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,201,202,203,204,205,206,207,208,209,210
0,2.748645e-16,1.091799e-14,6.177789e-16,4.100739e-15,5.872959e-16,3.013484e-13,8.50949e-11,1.260597e-10,3.919068e-12,6.756921e-17,...,8.663e-12,1.043937e-15,2.016324e-13,2.575809e-05,1.64426e-14,1.562039e-14,1.660904e-16,5.62694e-15,1.884005e-15,1.477973e-10
1,6.518997000000001e-17,1.42308e-11,1.6591040000000003e-17,3.8960539999999996e-19,2.984844e-12,2.816315e-18,6.500217e-19,1.7280530000000001e-18,2.888334e-16,9.132359e-12,...,1.864053e-15,6.683111999999999e-19,1.805826e-16,5.055893e-14,9.062707e-12,7.221974e-09,1.226184e-11,3.747836e-11,3.2821959999999996e-19,1.794174e-16
2,1.633317e-07,1.390263e-06,9.018549e-05,4.185985e-06,2.849058e-07,0.0001118842,0.0004247428,3.076397e-07,1.757552e-06,2.751224e-07,...,6.978184e-06,1.503196e-05,7.38222e-07,4.250382e-07,1.17326e-07,4.551326e-08,8.333716e-08,7.319312e-08,7.204352e-08,7.6148e-07
3,1.578195e-18,9.052343999999999e-19,9.644712000000001e-17,4.21022e-16,2.60103e-19,2.096347e-15,3.340687e-12,7.473112e-13,1.43109e-16,5.653778e-20,...,1.36927e-15,2.089519e-17,1.678481e-18,5.26905e-11,3.86438e-18,5.918924e-18,4.9092209999999996e-20,3.2785919999999995e-19,2.235022e-18,2.268445e-13
4,1.558055e-14,1.182472e-13,1.464557e-13,5.357545e-13,1.086944e-11,2.235208e-12,2.284331e-14,7.25416e-11,3.883278e-12,8.543679e-14,...,7.382635e-14,1.951983e-12,3.134882e-11,4.296812e-13,1.770841e-12,4.982089e-09,5.480763e-15,5.744015e-13,3.249712e-14,2.813652e-11


### Evaluate model

#### Training metrics

In [17]:
# Cut-off applied to the values in pred_train: entries >= this become 1,
# entries below it become 0.
P_THRESHOLD = 0.5

In [18]:
# Pull the underlying arrays out of the two training frames:
# y_train from actual_train, y_prob from pred_train.
y_train, y_prob = actual_train.values, pred_train.values

In [19]:
# Sanity check: dimensions of the array taken from actual_train.
y_train.shape

(165784, 210)

In [20]:
# Sanity check: dimensions of the array taken from pred_train (should equal y_train.shape).
y_prob.shape

(165784, 210)

In [21]:
# Work on a separate array so the in-place thresholding in the next cell
# leaves y_prob untouched.
y_pred = y_prob.copy(order='C')

In [22]:
# Binarise y_pred in place: entries at or above the threshold become 1 ...
at_or_above = y_pred >= P_THRESHOLD
y_pred[at_or_above] = 1
# ... and entries below it become 0. This mask is evaluated after the first
# assignment, exactly as in the two-statement original.
below = y_pred < P_THRESHOLD
y_pred[below] = 0

In [23]:
# Micro-averaged F1 on the training data: computed globally from the total
# true positives, false negatives and false positives over all label columns.
f1_score(y_train, y_pred, average='micro')

0.87770572023685001

In [24]:
# With average=None no averaging is done: the result is a tuple of four arrays
# (precision, recall, fscore, support), each holding one value per class.
precision_recall_fscore_support(y_train, y_pred, average=None, sample_weight=None)

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


(array([ 1.        ,  0.98681055,  0.96958525,  0.94776119,  1.        ,
         0.95205479,  0.98259188,  0.96659919,  0.98825832,  1.        ,
         1.        ,  0.93650794,  1.        ,  1.        ,  0.96341463,
         0.99814126,  0.96880416,  1.        ,  1.        ,  0.91696044,
         0.93473594,  0.90320856,  0.94989562,  1.        ,  1.        ,
         0.9905303 ,  1.        ,  0.99379845,  0.99631676,  0.98211091,
         0.97638889,  0.97830018,  0.9960396 ,  0.94198473,  0.9030224 ,
         0.93067591,  0.91590457,  0.8745098 ,  0.96785714,  0.96666667,
         1.        ,  0.86652079,  0.99820467,  0.99853372,  0.91120219,
         0.99722992,  0.9980198 ,  0.97001764,  0.88741093,  1.        ,
         1.        ,  0.95933014,  1.        ,  1.        ,  1.        ,
         1.        ,  1.        ,  0.93645084,  0.98564593,  0.97889182,
         0.99406688,  0.99240987,  0.98811545,  0.9760274 ,  0.99414062,
         0.95021512,  1.        ,  0.98780488,  0.8

In [None]:
# Per-class training metrics. With average=None the call returns a tuple of
# four arrays -- (precision, recall, fscore, support) -- with one entry per
# label column of y_train / y_pred.
a = precision_recall_fscore_support(y_train, y_pred, average=None, sample_weight=None)

# Kept as its own frame in case other cells refer to it.
support_byclass = pd.DataFrame(a[3], columns=['support'])

# The f1 and support arrays are aligned by class position, so the table can be
# built in one step instead of merging two single-column frames on their index.
f1_byclass = pd.DataFrame({'f1': a[2], 'support': a[3]}, columns=['f1', 'support'])

# Positional class index as a regular column, used as the lookup key below.
f1_byclass['index_col'] = f1_byclass.index

# NOTE(review): `labels_index` is not defined in any cell visible in this
# notebook; it must already exist in the kernel (presumably a mapping from
# class position to level-2 taxon name -- confirm and add the cell that loads it).
f1_byclass['level2taxon'] = f1_byclass['index_col'].map(labels_index)

n_classes = y_pred.shape[1]
n_classes_with_f1 = f1_byclass.loc[f1_byclass['f1'] > 0].shape[0]

# float() keeps the percentage correct even where `/` is integer division.
print("At p_threshold of {}, there were {} out of {} ({})% taxons with auto-tagged content in the training data"
      .format(P_THRESHOLD,
              n_classes_with_f1,
              n_classes,
              (float(n_classes_with_f1) / n_classes) * 100))

In [None]:
# Classes whose training F1 is exactly zero, re-indexed by taxon name.
zero_f1_rows = f1_byclass['f1'] == 0
no_auto_content = f1_byclass.loc[zero_f1_rows].set_index('level2taxon')

In [None]:
# Horizontal bar chart of support for the zero-F1 classes, smallest first.
support_sorted = no_auto_content['support'].sort_values()
support_sorted.plot.barh(figsize=(20, 20))

In [None]:
# Classes with a non-zero training F1, re-indexed by taxon name.
positive_f1_rows = f1_byclass['f1'] > 0
classes_predictedto = f1_byclass.loc[positive_f1_rows].set_index('level2taxon')

In [None]:
# Support vs. F1 for the classes with non-zero F1; x ticks every 100 from 0 up to 9600.
support_ticks = np.arange(0, 9700, 100)
classes_predictedto.plot.scatter(
    x='support',
    y='f1',
    figsize=(20, 10),
    xticks=support_ticks,
)

In [None]:
# Horizontal bar chart of F1 for the classes with non-zero F1, lowest first.
f1_sorted = classes_predictedto['f1'].sort_values()
f1_sorted.plot.barh(figsize=(20, 20))

In [None]:
# Micro average (training data): calculate metrics globally by counting the
# total true positives, false negatives and false positives.
precision_recall_fscore_support(y_train, y_pred, average='micro', sample_weight=None) 

In [None]:
# Macro average (training data): calculate metrics for each label and take
# their unweighted mean. This does not take label imbalance into account.
precision_recall_fscore_support(y_train, y_pred, average='macro', sample_weight=None)

In [None]:
# Weighted average (training data): calculate metrics for each label and
# average them weighted by support (the number of true instances of each label).
# Unlike 'macro', this does take label imbalance into account.
precision_recall_fscore_support(y_train, y_pred, average='weighted', sample_weight=None)

#### Development set metrics

In [25]:
# Cut-off used for the development-set values (re-assigned here with the same
# value as in the training section).
P_THRESHOLD = 0.5

In [26]:
# Array taken from the actual_dev frame.
y_dev = actual_dev.values

# Take a copy, as the training section does for y_pred. `.values` can hand back
# a view onto the DataFrame's own data, and the next cell thresholds this array
# in place. Without the copy that would overwrite the values stored in
# `pred_dev` (so re-running with a different P_THRESHOLD would start from 0/1
# values), or fail outright where pandas returns a read-only array.
y_pred_dev = pred_dev.values.copy()

In [27]:
# Binarise y_pred_dev in place: entries at or above the threshold become 1 ...
dev_at_or_above = y_pred_dev >= P_THRESHOLD
y_pred_dev[dev_at_or_above] = 1
# ... and entries below it become 0 (mask evaluated after the first assignment,
# matching the original statement order).
dev_below = y_pred_dev < P_THRESHOLD
y_pred_dev[dev_below] = 0

In [28]:
# With average=None no averaging is done: the result is a tuple of four arrays
# (precision, recall, fscore, support), each holding one value per class.
precision_recall_fscore_support(y_dev, y_pred_dev, average=None, sample_weight=None)

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


(array([ 0.        ,  0.75      ,  0.98360656,  0.5       ,  1.        ,
         0.81818182,  0.33333333,  0.90196078,  0.        ,  1.        ,
         0.        ,  0.75      ,  1.        ,  0.        ,  0.76470588,
         0.        ,  0.88888889,  0.        ,  1.        ,  0.80116959,
         0.87663755,  0.80991736,  0.82938389,  0.        ,  0.        ,
         1.        ,  1.        ,  1.        ,  1.        ,  0.8       ,
         0.85714286,  0.        ,  0.        ,  0.66666667,  0.88358779,
         0.66666667,  0.78731343,  0.5       ,  0.85714286,  1.        ,
         0.        ,  0.82352941,  1.        ,  1.        ,  0.45      ,
         0.33333333,  0.        ,  0.85714286,  0.77070064,  0.        ,
         1.        ,  0.25      ,  1.        ,  0.        ,  0.5       ,
         0.        ,  0.        ,  0.91566265,  0.75      ,  0.85869565,
         0.5       ,  0.8       ,  0.7       ,  0.92857143,  0.        ,
         0.88571429,  0.        ,  0.875     ,  0.5

In [29]:
# Micro average (development data): calculate metrics globally by counting the
# total true positives, false negatives and false positives.
precision_recall_fscore_support(y_dev, y_pred_dev, average='micro', sample_weight=None) 

(0.85503646125490829, 0.6401871962560749, 0.73217594181019696, None)

In [30]:
# Macro average (development data): calculate metrics for each label and take
# their unweighted mean. This does not take label imbalance into account.
precision_recall_fscore_support(y_dev, y_pred_dev, average='macro', sample_weight=None)

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


(0.64408617075606467, 0.42344314263419258, 0.48711680563937582, None)

In [31]:
# Weighted average (development data): calculate metrics for each label and
# average them weighted by support (the number of true instances of each label).
# Unlike 'macro', this does take label imbalance into account.
precision_recall_fscore_support(y_dev, y_pred_dev, average='weighted', sample_weight=None)

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


(0.85322542554856917, 0.6401871962560749, 0.71782038259077197, None)

### Produce some dummy results for testing.

In [45]:
# Dummy 0/1 matrix (100000 rows x 420 columns) for testing.
# Drawn from a locally seeded generator so the fixture is identical on every
# run and the global NumPy random state is left untouched.
array = np.random.RandomState(42).randint(2, size=(100000, 420))

In [46]:
# Wrap the dummy matrix in a DataFrame with 1-based integer column labels.
# The array is passed directly: converting it to nested Python lists first
# would create one Python object per element only for pandas to turn them
# back into an integer array. Column labels are derived from the array width.
# Note: the frame may share memory with `array`; nothing here mutates either.
df = pd.DataFrame(data=array, columns=list(range(1, array.shape[1] + 1)))

In [47]:
# Preview the first rows of the dummy frame.
df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,411,412,413,414,415,416,417,418,419,420
0,1,1,1,1,0,1,0,1,0,0,...,0,1,0,0,0,1,1,1,1,0
1,1,0,0,0,1,1,0,0,1,0,...,0,1,0,0,1,0,0,1,1,0
2,0,1,1,0,0,1,0,1,0,0,...,0,0,1,1,0,0,1,0,0,0
3,0,1,1,1,0,1,0,1,1,0,...,0,0,1,0,0,1,0,1,0,0
4,0,0,0,0,1,0,1,0,1,1,...,1,1,0,0,1,0,0,0,1,0


In [48]:
# Write the dummy frame next to the real result files as a gzip-compressed CSV,
# without the row index.
dummy_results_path = os.path.join(DATADIR, 'dummy_results.csv.gz')
df.to_csv(dummy_results_path, index=False, compression='gzip')