# Setup

In [1]:
import sys
import os

import re
import collections
import itertools
import bcolz
import pickle
sys.path.append('../../lib')

import numpy as np
import pandas as pd
import gc
import random
import smart_open
import h5py
import csv
import json
import functools
import time
import string

import datetime as dt
from tqdm import tqdm_notebook as tqdm

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

random_state_number = 967898

In [2]:
import tensorflow as tf
from tensorflow.python.client import device_lib
def get_available_gpus():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos if x.device_type == 'GPU']

config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)
get_available_gpus()

['/gpu:0', '/gpu:1']

In [3]:
%pylab
%matplotlib inline
%load_ext line_profiler
%load_ext memory_profiler
%load_ext autoreload

Using matplotlib backend: TkAgg
Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [4]:
pd.options.mode.chained_assignment = None
pd.options.display.max_columns = 999
color = sns.color_palette()

# Data

In [5]:
store = pd.HDFStore('../data_prep/processed/stage1/data_frames.h5')
train_df = store['train_df']
test_df = store['test_df']

In [6]:
display(train_df.head())
display(test_df.head())

Unnamed: 0,ID,Gene,Variation,Class,Sentences
0,0,[fam58a],"[truncating, mutations]",1,"[[cyclin-dependent, kinases, , cdks, , regulat..."
1,1,[cbl],[w802*],2,"[[abstract, background, non-small, cell, lung,..."
2,2,[cbl],[q249e],2,"[[abstract, background, non-small, cell, lung,..."
3,3,[cbl],[n454d],3,"[[recent, evidence, has, demonstrated, that, a..."
4,4,[cbl],[l399v],4,"[[oncogenic, mutations, in, the, monomeric, ca..."


Unnamed: 0,ID,Gene,Variation,Sentences
0,0,[acsl4],[r570s],"[[2, this, mutation, resulted, in, a, myelopro..."
1,1,[naglu],[p521l],"[[abstract, the, large, tumor, suppressor, 1, ..."
2,2,[pah],[l333f],"[[vascular, endothelial, growth, factor, recep..."
3,3,[ing1],[a148d],"[[inflammatory, myofibroblastic, tumor, , imt,..."
4,4,[tmem216],[g77a],"[[abstract, retinoblastoma, is, a, pediatric, ..."


In [7]:
with open('../data_prep/processed/stage1/vocab_words_wordidx.pkl', 'rb') as f:
    (vocab_words, vocab_wordidx) = pickle.load(f)
vocab_size = len(vocab_words)
vocab_size, len(vocab_wordidx)

(352220, 352220)

## Train-Test Split and Data Prep

In [8]:
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.metrics import make_scorer, f1_score, precision_score, accuracy_score, log_loss
f1_scorer = make_scorer(f1_score, average="macro")
precision_scorer = make_scorer(precision_score, average="macro")
accuracy_scorer = make_scorer(accuracy_score, average="macro")
log_loss_scorer = make_scorer(log_loss)

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [9]:
train_df.Sentences = train_df.Sentences.apply(lambda ll: list(itertools.chain.from_iterable(ll)))
all_text_train_df = pd.DataFrame()
all_text_train_df["Text"] = train_df.Gene + train_df.Variation + train_df.Sentences
all_text_train_df["Class"] = train_df.Class
display(all_text_train_df.head())

Unnamed: 0,Text,Class
0,"[fam58a, truncating, mutations, cyclin-depende...",1
1,"[cbl, w802*, abstract, background, non-small, ...",2
2,"[cbl, q249e, abstract, background, non-small, ...",2
3,"[cbl, n454d, recent, evidence, has, demonstrat...",3
4,"[cbl, l399v, oncogenic, mutations, in, the, mo...",4


In [10]:
x_train, x_test, y_train, y_test = train_test_split(all_text_train_df.Text,all_text_train_df.Class,
                                                   test_size=0.10, random_state=random_state_number,
                                                   stratify=all_text_train_df.Class)

print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(2988,) (2988,)
(333,) (333,)


In [11]:
del all_text_train_df
del train_df
#del test_df

In [12]:
cvec = CountVectorizer(vocabulary=vocab_wordidx)
tfidf = TfidfTransformer()

In [13]:
x_train = x_train.str.join(" ")
x_train_counts = cvec.fit_transform(x_train, y_train)
print(x_train_counts.shape)
x_train_tf = tfidf.fit_transform(x_train_counts)
print(x_train_tf.shape)

(2988, 352220)
(2988, 352220)


In [14]:
x_test = x_test.str.join(" ")
x_test_counts = cvec.fit_transform(x_test, y_test)
print(x_test_counts.shape)
x_test_tf = tfidf.fit_transform(x_test_counts)
print(x_test_tf.shape)

(333, 352220)
(333, 352220)


In [15]:
len(x_test)

333

In [16]:
gc.collect()

197

In [17]:
def evaluate_model(model, sparse=True, predict_proba=True):
    if sparse:
        x_train_loc = x_train_tf
        y_train_loc = y_train
        x_test_loc  = x_test_tf
        y_test_loc  = y_test
    else:
        x_train_loc = x_train_tf.toarray()
        y_train_loc = y_train.values
        x_test_loc  = x_test_tf.toarray()
        y_test_loc  = y_test.values
        
    model.fit(x_train_loc, y_train_loc)        
    if predict_proba and 'predict_proba' in dir(model):
        predicted_prob = model.predict_proba(x_test_loc)
        print("log_loss\n", log_loss(y_test_loc, predicted_prob, labels=range(1,10)))
        
    y_pred = model.predict(x_test_loc)
    print("f1_score\n", f1_score(y_test_loc, y_pred, average="macro"))
    print("accuracy_score\n", accuracy_score(y_test_loc, y_pred))
    print("\nclassification_report\n",classification_report(y_test_loc, y_pred))
    print("\nconfusion_matrix\n",confusion_matrix(y_test_loc, y_pred))

# Models

did not run GaussianProcessClassifier, GaussianMixture since they took more than 64G of ram

## Multinomial NB

In [18]:
from sklearn.naive_bayes import MultinomialNB
nb_model = MultinomialNB(alpha=0.001)

In [19]:
evaluate_model(nb_model)

log_loss
 1.7626723557
f1_score
 0.581177607313
accuracy_score
 0.663663663664

classification_report
              precision    recall  f1-score   support

          1       0.61      0.58      0.59        57
          2       0.64      0.64      0.64        45
          3       0.45      0.56      0.50         9
          4       0.63      0.59      0.61        69
          5       0.50      0.62      0.56        24
          6       0.84      0.57      0.68        28
          7       0.75      0.83      0.79        95
          8       0.00      0.00      0.00         2
          9       1.00      0.75      0.86         4

avg / total       0.67      0.66      0.66       333


confusion_matrix
 [[33  0  2 17  4  0  1  0  0]
 [ 0 29  0  1  1  0 14  0  0]
 [ 0  0  5  1  1  0  2  0  0]
 [16  0  1 41  6  2  3  0  0]
 [ 1  1  0  3 15  1  3  0  0]
 [ 3  4  0  1  2 16  2  0  0]
 [ 1 10  3  1  1  0 79  0  0]
 [ 0  1  0  0  0  0  1  0  0]
 [ 0  0  0  0  0  0  1  0  3]]


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [19]:
parameters = { 'alpha': [0.0005, 0.001,0.002,0.01,0.02]  }

nb_gs = GridSearchCV(nb_model, parameters)

In [None]:
evaluate_model(nb_gs)

In [None]:
pd.DataFrame(nb_gs.cv_results_).transpose()

## Support Vector Machine

In [20]:
from sklearn.svm import SVC
svc_model = SVC(probability=True)

In [21]:
evaluate_model(svc_model)

log_loss
 1.88764903973
f1_score
 0.0493250259605
accuracy_score
 0.285285285285

classification_report
              precision    recall  f1-score   support

          1       0.00      0.00      0.00        57
          2       0.00      0.00      0.00        45
          3       0.00      0.00      0.00         9
          4       0.00      0.00      0.00        69
          5       0.00      0.00      0.00        24
          6       0.00      0.00      0.00        28
          7       0.29      1.00      0.44        95
          8       0.00      0.00      0.00         2
          9       0.00      0.00      0.00         4

avg / total       0.08      0.29      0.13       333


confusion_matrix
 [[ 0  0  0  0  0  0 57  0  0]
 [ 0  0  0  0  0  0 45  0  0]
 [ 0  0  0  0  0  0  9  0  0]
 [ 0  0  0  0  0  0 69  0  0]
 [ 0  0  0  0  0  0 24  0  0]
 [ 0  0  0  0  0  0 28  0  0]
 [ 0  0  0  0  0  0 95  0  0]
 [ 0  0  0  0  0  0  2  0  0]
 [ 0  0  0  0  0  0  4  0  0]]


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [22]:
parameters = {'kernel':('linear', 'rbf'), 
              'C':[1,2,3,4,5,6,7,8,9,10], 
              'gamma':[0.01,0.02,0.03,0.04,0.1,0.2],
              'degree':[3,4,5,6]}

svc_rs = RandomizedSearchCV(svc_model, parameters, n_iter=20, n_jobs=10)

In [None]:
evaluate_model(svc_rs)

In [None]:
pd.DataFrame(svc_rs.cv_results_).transpose()

## Softmax Regression

In [22]:
from sklearn.linear_model import LogisticRegression
smreg_model = LogisticRegression(multi_class="multinomial", solver='lbfgs')

In [23]:
evaluate_model(smreg_model)

log_loss
 1.11772512579
f1_score
 0.457444634881
accuracy_score
 0.633633633634

classification_report
              precision    recall  f1-score   support

          1       0.58      0.54      0.56        57
          2       0.85      0.38      0.52        45
          3       0.00      0.00      0.00         9
          4       0.63      0.70      0.66        69
          5       0.55      0.25      0.34        24
          6       0.78      0.50      0.61        28
          7       0.61      0.98      0.75        95
          8       0.00      0.00      0.00         2
          9       1.00      0.50      0.67         4

avg / total       0.64      0.63      0.60       333


confusion_matrix
 [[31  0  0 16  2  1  7  0  0]
 [ 0 17  0  2  0  0 26  0  0]
 [ 0  0  0  2  1  0  6  0  0]
 [11  0  0 48  2  3  5  0  0]
 [ 4  1  0  6  6  0  7  0  0]
 [ 7  1  0  1  0 14  5  0  0]
 [ 0  1  0  1  0  0 93  0  0]
 [ 0  0  0  0  0  0  2  0  0]
 [ 0  0  0  0  0  0  2  0  2]]


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [25]:
parameters ={
#                 'penalty' : ['l1','l2'],
                'tol' : [1e-5, 1e-4, 1e-3],
                'class_weight' : [None, 'balanced'],
                'max_iter' : [100,150,200,250,300],
#                 'solver' : ['newton-cg', 'lbfgs'], 
                'C':[1,2,3,4,5,6,7,8,9,10], 
            }

smreg_rs = RandomizedSearchCV(smreg_model, parameters, n_jobs=10)

In [None]:
evaluate_model(smreg_rs)

In [None]:
pd.DataFrame(smreg_rs.cv_results_).transpose()

## K Nearest Neighbour

In [24]:
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier(n_neighbors=3)

In [25]:
evaluate_model(knn_model)

log_loss
 8.1949303197
f1_score
 0.477345011858
accuracy_score
 0.612612612613

classification_report
              precision    recall  f1-score   support

          1       0.54      0.47      0.50        57
          2       0.54      0.49      0.51        45
          3       0.00      0.00      0.00         9
          4       0.64      0.77      0.70        69
          5       0.36      0.21      0.26        24
          6       0.94      0.61      0.74        28
          7       0.65      0.81      0.72        95
          8       0.00      0.00      0.00         2
          9       1.00      0.75      0.86         4

avg / total       0.60      0.61      0.60       333


confusion_matrix
 [[27  4  1 19  3  0  3  0  0]
 [ 2 22  1  1  1  0 18  0  0]
 [ 0  0  0  1  2  0  6  0  0]
 [11  0  2 53  1  0  2  0  0]
 [ 5  1  0  6  5  0  7  0  0]
 [ 3  2  0  1  2 17  3  0  0]
 [ 2 12  2  2  0  0 77  0  0]
 [ 0  0  0  0  0  1  1  0  0]
 [ 0  0  0  0  0  0  1  0  3]]


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [28]:
parameters = {'n_neighbors':[3,4,5], 
              'algorithm':['auto', 'ball_tree', 'kd_tree', 'brute'], 
              'leaf_size':[30,35,40,45,50]
             }

knn_rs = RandomizedSearchCV(knn_model, parameters, n_jobs=10, n_iter=30)


In [None]:
evaluate_model(knn_rs)

In [None]:
pd.DataFrame(knn_rs.cv_results_).transpose()

## Passive Aggresive Classifier

In [26]:
from sklearn.linear_model import PassiveAggressiveClassifier
pag_model = PassiveAggressiveClassifier()



In [27]:
evaluate_model(pag_model, predict_proba=False)

f1_score
 0.534271866501
accuracy_score
 0.657657657658

classification_report
              precision    recall  f1-score   support

          1       0.61      0.61      0.61        57
          2       0.61      0.60      0.61        45
          3       0.12      0.11      0.12         9
          4       0.64      0.67      0.65        69
          5       0.56      0.42      0.48        24
          6       0.78      0.64      0.71        28
          7       0.73      0.83      0.78        95
          8       0.00      0.00      0.00         2
          9       1.00      0.75      0.86         4

avg / total       0.65      0.66      0.65       333


confusion_matrix
 [[35  0  3 15  2  1  1  0  0]
 [ 0 27  1  2  1  1 13  0  0]
 [ 0  0  1  1  1  0  6  0  0]
 [15  0  1 46  3  1  3  0  0]
 [ 4  1  0  5 10  1  3  0  0]
 [ 3  5  0  1  0 18  1  0  0]
 [ 0 11  2  2  1  0 79  0  0]
 [ 0  0  0  0  0  1  1  0  0]
 [ 0  0  0  0  0  0  1  0  3]]


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [31]:
parameters = {
              'C':[1,2,3,4,5,6,7,8,9,10], 
              'tol' : [1e-5, 1e-4, 1e-3],
              'max_iter' : [800, 1000,1200],
              'loss':['hinge', 'squared_hinge']}

pag_rs = RandomizedSearchCV(pag_model, parameters, n_iter=100, n_jobs=-1)


In [None]:
evaluate_model(pag_rs, predict_proba=False)

In [None]:
pd.DataFrame(pag_rs.cv_results_).transpose()

## Quadratic Discriminant Analysis

In [28]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
qda_model = QuadraticDiscriminantAnalysis()

In [29]:
evaluate_model(qda_model, sparse=False)



log_loss
 29.4564939824
f1_score
 0.19833149647
accuracy_score
 0.147147147147

classification_report
              precision    recall  f1-score   support

          1       0.45      0.09      0.15        57
          2       0.31      0.20      0.24        45
          3       0.03      0.67      0.05         9
          4       0.00      0.00      0.00        69
          5       0.07      0.04      0.05        24
          6       0.45      0.36      0.40        28
          7       0.73      0.17      0.27        95
          8       0.20      0.50      0.29         2
          9       0.50      0.25      0.33         4

avg / total       0.38      0.15      0.18       333


confusion_matrix
 [[ 5  5 39  0  3  2  0  3  0]
 [ 0  9 30  0  2  1  3  0  0]
 [ 0  1  6  0  0  1  1  0  0]
 [ 2  1 62  0  1  0  1  1  1]
 [ 4  3 11  0  1  4  1  0  0]
 [ 0  5 10  0  3 10  0  0  0]
 [ 0  5 66  0  5  3 16  0  0]
 [ 0  0  0  0  0  1  0  1  0]
 [ 0  0  3  0  0  0  0  0  1]]


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


## Decision Trees Classifier

In [30]:
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier()

In [31]:
evaluate_model(dt_model)

log_loss
 14.2096467451
f1_score
 0.453594264867
accuracy_score
 0.588588588589

classification_report
              precision    recall  f1-score   support

          1       0.65      0.60      0.62        57
          2       0.41      0.40      0.40        45
          3       0.00      0.00      0.00         9
          4       0.64      0.70      0.67        69
          5       0.39      0.38      0.38        24
          6       0.67      0.50      0.57        28
          7       0.64      0.74      0.68        95
          8       0.00      0.00      0.00         2
          9       0.75      0.75      0.75         4

avg / total       0.57      0.59      0.58       333


confusion_matrix
 [[34  2  0 15  2  1  3  0  0]
 [ 4 18  0  1  4  2 15  0  1]
 [ 0  0  0  1  2  0  6  0  0]
 [ 9  1  1 48  3  1  5  1  0]
 [ 1  0  0  6  9  1  7  0  0]
 [ 1  6  0  2  2 14  3  0  0]
 [ 2 16  2  2  1  2 70  0  0]
 [ 0  1  0  0  0  0  1  0  0]
 [ 1  0  0  0  0  0  0  0  3]]


In [36]:
parameters = {
              'criterion':['gini','entropy'], 
              'splitter':['best','random'], 
              'max_depth':[None, 4, 6, 8, 10], 
              'min_samples_split':[2,4,6,8],
              'min_samples_leaf':[1, 3, 5, 8, 10], 
              'min_weight_fraction_leaf':[0, 0.1, 0.01, 0.5, 0.001], 
              'max_features':['log2','auto'], 
              'max_leaf_nodes':[None, 4, 8, 16, 32],
              'min_impurity_decrease':[0, 1e-3, 1e-4, 1e-5], 
              'class_weight':['balanced',None],
#               'presort':[True, False] 
}
dt_rs = RandomizedSearchCV(dt_model, parameters, n_iter=100, n_jobs=-1)


In [None]:
evaluate_model(dt_rs)

In [None]:
pd.DataFrame(dt_rs.cv_results_).transpose()

## AdaBoost Classifier

In [32]:
from sklearn.ensemble import AdaBoostClassifier
adab_model = AdaBoostClassifier()

In [33]:
evaluate_model(adab_model)

log_loss
 2.03696523324
f1_score
 0.171401646976
accuracy_score
 0.408408408408

classification_report
              precision    recall  f1-score   support

          1       0.00      0.00      0.00        57
          2       0.00      0.00      0.00        45
          3       0.00      0.00      0.00         9
          4       0.31      0.84      0.45        69
          5       0.00      0.00      0.00        24
          6       0.00      0.00      0.00        28
          7       0.54      0.80      0.65        95
          8       0.00      0.00      0.00         2
          9       0.40      0.50      0.44         4

avg / total       0.22      0.41      0.28       333


confusion_matrix
 [[ 0  0  0 52  0  0  5  0  0]
 [ 0  0  0 20  0  0 25  0  0]
 [ 0  0  0  3  0  0  6  0  0]
 [ 0  0  0 58  0  0  8  0  3]
 [ 0  0  0 16  0  0  8  0  0]
 [ 0  0  0 19  0  0  9  0  0]
 [ 0  0  0 19  0  0 76  0  0]
 [ 0  0  0  0  0  0  2  0  0]
 [ 0  0  0  1  0  0  1  0  2]]


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [39]:
parameters = {
              # adaboost
              'n_estimators' :[50, 100, 150, 200, 500],
              'learning_rate':[1e-4, 1e-3, 1e-2, 1e-1],
              'algorithm' : ['SAMME', 'SAMME.R'],
}
adab_rs = RandomizedSearchCV(adab_model, parameters, n_iter=25, n_jobs=-1)


In [None]:
evaluate_model(adab_rs)

In [None]:
pd.DataFrame(adab_rs.cv_results_).transpose()

## Random Forest Classifier

In [34]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier()

In [35]:
evaluate_model(rf_model)

log_loss
 3.54088132013
f1_score
 0.496682792489
accuracy_score
 0.633633633634

classification_report
              precision    recall  f1-score   support

          1       0.64      0.63      0.64        57
          2       0.64      0.51      0.57        45
          3       0.00      0.00      0.00         9
          4       0.68      0.71      0.70        69
          5       0.41      0.29      0.34        24
          6       0.84      0.57      0.68        28
          7       0.60      0.81      0.69        95
          8       0.00      0.00      0.00         2
          9       1.00      0.75      0.86         4

avg / total       0.62      0.63      0.62       333


confusion_matrix
 [[36  4  0 10  2  0  5  0  0]
 [ 3 23  0  1  2  0 16  0  0]
 [ 0  0  0  1  1  1  6  0  0]
 [ 8  0  0 49  2  1  9  0  0]
 [ 4  0  0  4  7  0  9  0  0]
 [ 3  2  0  3  1 16  3  0  0]
 [ 2  7  2  4  2  1 77  0  0]
 [ 0  0  0  0  0  0  2  0  0]
 [ 0  0  0  0  0  0  1  0  3]]


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [42]:
parameters = {
              'n_estimators': [10, 50, 100, 200, 250, 500],
              'criterion':['gini','entropy'], 
              'max_depth':[None, 4, 6, 8, 10], 
              'min_samples_split':[2,4,6,8],
              'min_samples_leaf':[1, 3, 5, 8, 10], 
              'min_weight_fraction_leaf':[0, 0.1, 0.01, 0.5, 0.001], 
              'max_features':['log2','auto'], 
              'max_leaf_nodes':[None, 4, 8, 16, 32],
              'min_impurity_decrease':[0, 1e-3, 1e-4, 1e-5], 
              'class_weight':['balanced',None],
#               'bootstrap':[True, False],
#               'oob_score':[True, False]
#               'presort':[True, False] 
}
rf_rs = RandomizedSearchCV(rf_model, parameters, n_iter=100, n_jobs=-1)


In [None]:
evaluate_model(rf_rs)

In [None]:
pd.DataFrame(rf_rs.cv_results_).transpose()

## Extreme Randomization Trees

In [36]:
from sklearn.ensemble import ExtraTreesClassifier
xtr_model = ExtraTreesClassifier()

In [37]:
evaluate_model(xtr_model)

log_loss
 4.28472314063
f1_score
 0.467986422835
accuracy_score
 0.597597597598

classification_report
              precision    recall  f1-score   support

          1       0.55      0.56      0.56        57
          2       0.56      0.44      0.49        45
          3       0.17      0.11      0.13         9
          4       0.68      0.64      0.66        69
          5       0.21      0.12      0.16        24
          6       0.68      0.61      0.64        28
          7       0.63      0.83      0.71        95
          8       0.00      0.00      0.00         2
          9       1.00      0.75      0.86         4

avg / total       0.58      0.60      0.58       333


confusion_matrix
 [[32  4  0 12  2  2  5  0  0]
 [ 0 20  0  2  2  0 21  0  0]
 [ 0  0  1  1  1  1  5  0  0]
 [12  2  0 44  2  0  9  0  0]
 [ 6  1  1  4  3  5  4  0  0]
 [ 4  2  0  0  3 17  2  0  0]
 [ 4  5  4  2  1  0 79  0  0]
 [ 0  2  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  1  0  3]]


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [45]:
parameters = {
              'n_estimators': [10, 50, 100, 200, 250, 500],
              'criterion':['gini','entropy'], 
              'max_depth':[None, 4, 6, 8, 10], 
              'min_samples_split':[2,4,6,8],
              'min_samples_leaf':[1, 3, 5, 8, 10], 
              'min_weight_fraction_leaf':[0, 0.1, 0.01, 0.5, 0.001], 
              'max_features':['log2','auto'], 
              'max_leaf_nodes':[None, 4, 8, 16, 32],
              'min_impurity_decrease':[0, 1e-3, 1e-4, 1e-5], 
              'class_weight':['balanced',None],
#               'bootstrap':[True, False],
#               'oob_score':[True, False]
#               'presort':[True, False] 
}
xtr_rs = RandomizedSearchCV(xtr_model, parameters, n_iter=100, n_jobs=-1)


In [None]:
evaluate_model(xtr_rs)

In [None]:
pd.DataFrame(xtr_rs.cv_results_).transpose()

## Saving models run

### stage1

In [38]:
pickle.dump(nb_model, open('stage1/nb_model', 'wb'))
pickle.dump(svc_model, open('stage1/svc_model', 'wb'))
pickle.dump(smreg_model, open('stage1/smreg_model', 'wb'))
pickle.dump(knn_model, open('stage1/knn_model', 'wb'))
pickle.dump(pag_model, open('stage1/pag_model', 'wb'))
pickle.dump(qda_model, open('stage1/qda_model', 'wb'))
pickle.dump(dt_model, open('stage1/qda_model', 'wb'))
pickle.dump(adab_model, open('stage1/adab_model', 'wb'))
pickle.dump(rf_model, open('stage1/rf_model', 'wb'))
pickle.dump(xtr_model, open('stage1/xtr_model', 'wb'))

In [39]:
from sklearn.ensemble import VotingClassifier

## Load classifiers

In [40]:
nb_model    = pickle.load(open('stage1/nb_model', 'rb'))
svc_model   = pickle.load(open('stage1/svc_model', 'rb'))
smreg_model = pickle.load(open('stage1/smreg_model', 'rb'))
knn_model   = pickle.load(open('stage1/knn_model', 'rb'))
pag_model   = pickle.load(open('stage1/pag_model', 'rb'))
qda_model   = pickle.load(open('stage1/qda_model', 'rb'))
dt_model    = pickle.load(open('stage1/qda_model', 'rb'))
adab_model  = pickle.load(open('stage1/adab_model', 'rb'))
rf_model    = pickle.load(open('stage1/rf_model', 'rb'))
xtr_model   = pickle.load(open('stage1/xtr_model', 'rb'))

## Average on probabilities

In [41]:
estimators_returning_probabilities = [
                    ("NB", nb_model),           
                    ("SVM", svc_model),         
                    ("Softmax", smreg_model),  
                    ("KNN", knn_model),         
#                     ("PasAgg", pag_model),      
#                     ("QDA", qda_model),         
                    ("DecisionTree", dt_model), 
                    ("ADABoost", adab_model),   
                    ("RandomForest", rf_model),
                    ("ExtremeRand", xtr_model)
                 ]

In [42]:
def average_predicted_probabilities(local_data):
    nof_data = local_data.shape[0]
    mean_probs = np.zeros((nof_data, 9))
    for name, estimator in estimators_returning_probabilities:
        mean_probs += estimator.predict_proba(local_data)
    return mean_probs/nof_data

In [43]:
y_pred = average_predicted_probabilities(x_test_tf)
print("log_loss\n", log_loss(y_test, y_pred, labels=range(1,10)))

log_loss
 1.01227591243


# Submission

## Test Predictions

### Data Prep

In [44]:
test_df.Sentences = test_df.Sentences.apply(lambda ll: list(itertools.chain.from_iterable(ll)))
all_text_test_df = pd.DataFrame()
all_text_test_df["Text"] = test_df.Gene + test_df.Variation + test_df.Sentences
display(all_text_test_df.head())

Unnamed: 0,Text
0,"[acsl4, r570s, 2, this, mutation, resulted, in..."
1,"[naglu, p521l, abstract, the, large, tumor, su..."
2,"[pah, l333f, vascular, endothelial, growth, fa..."
3,"[ing1, a148d, inflammatory, myofibroblastic, t..."
4,"[tmem216, g77a, abstract, retinoblastoma, is, ..."


In [45]:
test_data = all_text_test_df.Text.str.join(" ")
test_data_counts = cvec.fit_transform(test_data)
print(test_data_counts.shape)
test_data_tf = tfidf.fit_transform(test_data_counts)
print(test_data_tf.shape)

(5668, 352220)
(5668, 352220)


### Run Model

#### average probs

In [46]:
y_pred = average_predicted_probabilities(test_data_tf)

## formating final output

In [47]:
""" Submission """
submission = pd.DataFrame(y_pred)
submission['id'] = test_df.ID
submission.columns = ['class1', 'class2', 'class3', 'class4', 'class5', 'class6', 'class7', 'class8', 'class9', 'id']
submission.to_csv('sklearn_classifiers_submission_{}.csv'.format(dt.datetime.now().strftime('%Y%m%d_%H%M%S')), index=False)

## submitted results

with data from python2

one hot representations: from vcs_all_models

sklearn_classifiers_submission_20170922_114137.csv : 11.77458

soft probabilities: from average_predicted_probabilities

sklearn_classifiers_submission_20170922_155723: 0.97035

with python3 data cleanup:

sklearn_classifiers_submission_20170923_162728: 0.97116