In [51]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.utils import to_categorical, layer_utils, plot_model

from keras.layers import (Embedding, Input, Dense, Dropout, 
                          Activation, Conv1D, MaxPooling1D, Flatten, concatenate, Reshape)
from keras.models import Model, Sequential
from keras.optimizers import rmsprop
from keras.callbacks import TensorBoard, Callback, ModelCheckpoint
import keras.backend as K
from keras.losses import binary_crossentropy

from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score 
from sklearn.metrics import precision_recall_fscore_support, classification_report
from sklearn.utils import class_weight

import tensorflow as tf

import matplotlib.pyplot as plt
%matplotlib inline

import functools

import h5py


### Environmental vars

In [52]:
DATADIR=os.getenv('DATADIR')

### Load in data

In [73]:
actual_train = pd.read_csv(os.path.join(DATADIR, 'true_train_1601_1902__results.csv.gz'), dtype=float, compression='gzip')
pred_train = pd.read_csv(os.path.join(DATADIR, 'train_1601_1902__results.csv.gz'), dtype=float, compression='gzip')

In [71]:
actual_dev = pd.read_csv(os.path.join(DATADIR, 'true_dev_1601_1902__results.csv.gz'), dtype=float, compression='gzip')
pred_dev = pd.read_csv(os.path.join(DATADIR, 'dev_1601_1902__results.csv.gz'), dtype=float, compression='gzip')

In [72]:
pred_dev.shape

(150870, 210)

In [54]:
print(actual_train.shape)
actual_train.head()

(150870, 210)


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,201,202,203,204,205,206,207,208,209,210
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
print(pred_train.shape)
pred_train.head()

(150870, 210)


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,201,202,203,204,205,206,207,208,209,210
0,8.183496935654233e-11,1.6218313358962178e-09,1.4613300153287767e-11,1.667332014676504e-07,1.4085937277696914e-12,1.9712855348075212e-08,7.860534445569556e-09,4.547738683413627e-07,7.371689925150803e-10,2.664921451914215e-12,...,9.483711593816224e-09,1.1044076764221702e-09,1.3592635639270156e-07,0.0009853303199633,8.153528519017073e-09,2.245691479996204e-11,5.032197805121541e-07,5.086257193198662e-09,2.4391811237478578e-08,9.235110098870793e-08
1,6.143640707279019e-09,1.609409605585199e-09,2.219857151430915e-06,5.186715412719423e-08,3.629608169575249e-09,2.2545789235550728e-08,7.666207113565803e-10,0.0128247449174523,7.678839786251501e-09,5.356260857070083e-08,...,0.0013700525742024,1.0863221433510262e-08,1.7943040830914473e-08,0.206178531050682,4.368725381453942e-09,1.7673301710985356e-10,3.350633051013574e-07,1.1168582148002316e-07,2.081082861593586e-09,2.8059798928836703e-09
2,1.0556624019386618e-08,4.336910830460283e-09,5.643264344712407e-08,8.805247489362955e-06,2.3180539709954932e-09,0.0444044359028339,9.200866770697758e-07,3.712892748808372e-06,1.616360517653348e-11,4.941894875092058e-13,...,6.534188656814878e-08,4.596435536872999e-13,3.5774463946758317e-10,1.2296267115630144e-08,3.9188927103417365e-10,1.9014106555914625e-06,2.175161743431242e-12,2.597419612726526e-09,7.26168707387842e-07,7.992358632691321e-07
3,4.480507300048118e-16,6.254949900044715e-13,3.4723674682025975e-19,3.1109181009014726e-08,6.305453549192902e-16,2.384962886903264e-13,4.766586007598798e-15,3.923000591044001e-09,1.6157330485526865e-17,1.264357069267696e-19,...,4.63853626005569e-19,6.688855153362768e-25,6.600055528308568e-21,1.5691751216006904e-15,2.174176991281185e-21,1.3444495300117849e-18,3.959817195008705e-15,4.8388504530945925e-19,1.8224492137619563e-08,5.474849740227938e-11
4,6.014463451720076e-07,1.473561203613194e-11,5.43975708922062e-09,2.1254138449222637e-09,3.68723277688332e-07,1.1986040817646426e-06,3.938664860925201e-07,0.0001575220085214,5.660174995369971e-09,6.379683981094787e-08,...,1.1958728691752183e-06,4.683892829149272e-09,1.925659915968936e-07,0.0001293623790843,2.859527228338266e-07,1.6746717790283583e-08,8.0787249112646e-10,1.1590937276650949e-07,2.180911267551933e-12,7.039235327965798e-08


### Evaluate model

#### Training metrics

In [74]:
y_train = actual_train.values
y_prob = pred_train.values

In [75]:
y_train.shape

(150870, 210)

In [76]:
y_prob.shape

(150870, 210)

In [81]:
y_pred = y_prob.copy()

In [82]:
y_pred[y_pred>=P_THRESHOLD] = 1
y_pred[y_pred<P_THRESHOLD] = 0

In [83]:
f1_score(y_train, y_pred, average='micro')

0.9630404578233203

In [84]:
#average= None, the scores for each class are returned.
precision_recall_fscore_support(y_train, y_pred, average=None, sample_weight=None)

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


(array([1.        , 0.98719317, 0.99543379, 1.        , 1.        ,
        0.97099622, 1.        , 0.95905989, 1.        , 1.        ,
        1.        , 0.95866667, 1.        , 1.        , 0.95441595,
        0.99815157, 0.98888889, 1.        , 1.        , 0.94387755,
        0.95744853, 0.95354082, 0.95501022, 1.        , 1.        ,
        0.9981203 , 1.        , 0.9972973 , 0.99580713, 0.9862543 ,
        0.98790323, 0.99603175, 0.99407115, 0.97691373, 0.93063335,
        0.94373402, 0.97570779, 0.97379913, 0.98256735, 0.96759777,
        1.        , 0.96271186, 0.99643494, 0.99894515, 0.96767001,
        1.        , 1.        , 0.98060942, 0.89180713, 1.        ,
        1.        , 0.96920583, 1.        , 1.        , 1.        ,
        1.        , 0.        , 0.95      , 0.98697068, 0.9823049 ,
        0.99841354, 0.99811321, 0.98918919, 0.96768848, 0.99722607,
        0.97195544, 1.        , 0.99491525, 0.99260355, 1.        ,
        1.        , 1.        , 0.95259096, 0.96

In [85]:
a = precision_recall_fscore_support(y_train, y_pred, average=None, sample_weight=None)
pd.DataFrame(list(a))
f1_byclass = pd.DataFrame((a)[2], columns=['f1'])

support_byclass = pd.DataFrame((a)[3], columns=['support'])

f1_byclass = pd.merge(
    left=f1_byclass, 
    right=support_byclass, 
    left_index=True,
    right_index=True,
    how='outer', 
    validate='one_to_one'
)

f1_byclass['index_col'] = f1_byclass.index

f1_byclass['level2taxon'] = f1_byclass['index_col'].map(labels_index).copy()

print("At p_threshold of {}, there were {} out of {} ({})% taxons with auto-tagged content in the training data"
      .format(P_THRESHOLD, 
              f1_byclass.loc[f1_byclass['f1'] > 0].shape[0], 
              y_pred.shape[1], 
              (f1_byclass.loc[f1_byclass['f1'] > 0].shape[0]/y_pred.shape[1])*100 ))

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


NameError: name 'labels_index' is not defined

In [None]:
no_auto_content = f1_byclass.loc[f1_byclass['f1'] == 0]
no_auto_content = no_auto_content.set_index('level2taxon')

In [None]:
no_auto_content['support'].sort_values().plot( kind = 'barh', figsize=(20, 20))

In [None]:
classes_predictedto = f1_byclass.loc[f1_byclass['f1'] > 0]
classes_predictedto = classes_predictedto.set_index('level2taxon') 

In [None]:
classes_predictedto.plot.scatter(x='support', y='f1', figsize=(20, 10), xticks=np.arange(0, 9700, 100))

In [None]:
classes_predictedto['f1'].sort_values().plot( kind = 'barh', figsize=(20, 20))

In [None]:
#Calculate globally by counting the total true positives, false negatives and false positives.
precision_recall_fscore_support(y_train, y_pred, average='micro', sample_weight=None) 

In [None]:
#Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account
precision_recall_fscore_support(y_train, y_pred, average='macro', sample_weight=None)

In [None]:
#Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account
precision_recall_fscore_support(y_train, y_pred, average='weighted', sample_weight=None)

#### Development set metrics

In [65]:
P_THRESHOLD=0.5

In [66]:
y_dev = actual_dev.values
y_pred_dev = pred_dev.values

In [67]:
y_pred_dev[y_pred_dev>=P_THRESHOLD] = 1
y_pred_dev[y_pred_dev<P_THRESHOLD] = 0

In [68]:
#average= None, the scores for each class are returned.
precision_recall_fscore_support(y_dev, y_pred_dev, average=None, sample_weight=None)

ValueError: Found input variables with inconsistent numbers of samples: [9234, 150870]

In [None]:
#Calculate globally by counting the total true positives, false negatives and false positives.
precision_recall_fscore_support(y_dev, y_pred_dev, average='micro', sample_weight=None) 

In [None]:
#Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account
precision_recall_fscore_support(y_dev, y_pred_dev, average='macro', sample_weight=None)

In [None]:
#Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account
precision_recall_fscore_support(y_dev, y_pred_dev, average='weighted', sample_weight=None)

### Produce some dummy results for testing.

In [45]:
array = np.random.randint(2,size=(100000,420))

In [46]:
df = pd.DataFrame(data=array.tolist(),columns=[i for i in range(1,421)])

In [47]:
df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,411,412,413,414,415,416,417,418,419,420
0,1,1,1,1,0,1,0,1,0,0,...,0,1,0,0,0,1,1,1,1,0
1,1,0,0,0,1,1,0,0,1,0,...,0,1,0,0,1,0,0,1,1,0
2,0,1,1,0,0,1,0,1,0,0,...,0,0,1,1,0,0,1,0,0,0
3,0,1,1,1,0,1,0,1,1,0,...,0,0,1,0,0,1,0,1,0,0
4,0,0,0,0,1,0,1,0,1,1,...,1,1,0,0,1,0,0,0,1,0


In [48]:
df.to_csv(os.path.join(DATADIR, 'dummy_results.csv.gz'),compression='gzip',index=False)