NOTE: Naive Bayes (NB) models do not expose `feature_importances_`. For finding the most contributing features of an SVM classifier, see:
https://stackoverflow.com/questions/41592661/determining-the-most-contributing-features-for-svm-classifier-in-sklearn

NOTE: We could use ELI5 to explain the predictors:
https://github.com/TeamHG-Memex/eli5

NOTE: There are other explanation-oriented libraries as well, e.g. Yellowbrick:
https://github.com/DistrictDataLabs/yellowbrick


In [1]:
# Import the usual suspects.

from __future__ import print_function
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix




import warnings
# Ignore every Python warning raised from here on.
# NOTE(review): this blanket filter also hides warnings emitted by the models
# fitted below (e.g. solver convergence warnings) — confirm that is intended.
warnings.filterwarnings('ignore')
# Seaborn appearance: white background with grid lines, 'paper' scaling context.
sns.set_style('whitegrid')
sns.set_context('paper')



def print_ln():
    """Print a separator: 80 dashes, a space, then an empty line."""
    rule = '-' * 80
    print('{} \n'.format(rule))


In [2]:

def model_performance_metrics(model, X, X_test, X_train, y, y_test, y_pred,
                              detailed= False, show_feature_importances= True,
                              scoring= None):
    """Print evaluation metrics for an already fitted classifier.

    Parameters
    ----------
    model : estimator
        Fitted model. Its ``feature_importances_`` attribute is read when
        ``show_feature_importances`` is true, and the model is handed to
        ``cross_val_score`` when ``detailed`` is true.
    X, y :
        Full feature matrix and labels; only used for cross-validation
        (``detailed=True``).
    X_test :
        Not used by this function; kept so existing calls keep working.
    X_train :
        Object with a ``.columns`` attribute; the column names label the
        feature-importance table.
    y_test, y_pred :
        True and predicted labels the printed metrics are computed from.
    detailed : bool, default False
        Also print the confusion matrix, the classification report and
        10-fold cross-validation scores.
    show_feature_importances : bool, default True
        Print the importance table, sorted in descending order. When the model
        has no ``feature_importances_`` a short notice is printed instead of
        raising ``AttributeError``.
    scoring : str, callable or None, default None
        Forwarded to ``cross_val_score``. ``None`` keeps the estimator's
        default scorer, which is what this function always used.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
    mse = metrics.mean_squared_error(y_test, y_pred)
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', np.sqrt(mse))
    print_ln()

    if show_feature_importances:
        print_feature_importances = getattr(model, 'feature_importances_', None)

        if print_feature_importances is None:
            # Models such as LinearSVC, GaussianNB or MLPClassifier do not
            # provide this attribute; report it rather than crash.
            print("=== Feature Importances ===")
            print("Not available: {} has no feature_importances_ attribute.".format(
                type(model).__name__))
        else:
            feature_importances = pd.DataFrame(print_feature_importances,
                                               index=X_train.columns,
                                               columns=['importance']
                                               ).sort_values('importance', ascending=False)

            print("=== Feature Importances ===")
            print(feature_importances)

    if detailed:
        # cross_val_score refits clones of `model` on 10 folds of the full X, y.
        model_score = cross_val_score(model, X, y, cv=10, scoring=scoring)
        # The headers used to say "AUC", but with no scoring argument the
        # values come from the estimator's default scorer. Name the scorer
        # that was really used so the numbers are not misread.
        score_label = scoring if scoring is not None else "estimator default scorer"

        print("=== Confusion Matrix ===")
        print(confusion_matrix(y_test, y_pred))
        print_ln()

        print("=== Classification Report ===")
        print(classification_report(y_test, y_pred))
        print_ln()

        print("=== All Cross-Validation Scores ({}) ===".format(score_label))
        print(model_score)

        print_ln()

        print("=== Mean Cross-Validation Score ({}) ===".format(score_label))
        print(model_score.mean())
        print_ln()





In [3]:
# Read the processed table and use the SampleID column as the row index.
mono_resistance_df_filledna = (
    pd.read_csv("../data/processed/mono_resistance_df_filledna.csv")
    .set_index('SampleID')
)

# Preview the first rows.
mono_resistance_df_filledna.head()

Unnamed: 0_level_0,NC000962_3.78,NC000962_3.80,NC000962_3.102,NC000962_3.104,NC000962_3.117,NC000962_3.120,NC000962_3.135,NC000962_3.138,NC000962_3.150,NC000962_3.155,...,NC000962_3.4409994,NC000962_3.4410001,NC000962_3.4410033,NC000962_3.4410043,NC000962_3.4410061,NC000962_3.4410065,NC000962_3.4410066,NC000962_3.4410070,NC000962_3.4411245,isResistant
SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ERR3129939,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERR3148148,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERR3148149,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
ERR3148151,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
ERR3148153,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Features: every column except the label. Target: the isResistant column.
X = mono_resistance_df_filledna.drop(columns='isResistant')
y = mono_resistance_df_filledna['isResistant']


In [13]:
# Display the feature matrix (all columns except the isResistant label).
X

Unnamed: 0_level_0,NC000962_3.78,NC000962_3.80,NC000962_3.102,NC000962_3.104,NC000962_3.117,NC000962_3.120,NC000962_3.135,NC000962_3.138,NC000962_3.150,NC000962_3.155,...,NC000962_3.4409993,NC000962_3.4409994,NC000962_3.4410001,NC000962_3.4410033,NC000962_3.4410043,NC000962_3.4410061,NC000962_3.4410065,NC000962_3.4410066,NC000962_3.4410070,NC000962_3.4411245
SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ERR3129939,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERR3148148,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERR3148149,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERR3148151,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERR3148153,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR9224981,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SRR9224985,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SRR9224986,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SRR9224992,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# NOTE: the dtype of y might be problematic, but for now this works.
# For a numeric Series, describe() returns its statistics as float64, so the
# dtype printed with the summary is not y's own dtype; inspect y.dtype for that.
# (The former trailing .T was dropped: transposing a Series returns it unchanged.)

y.describe()

count    301.000000
mean       0.800664
std        0.400166
min        0.000000
25%        1.000000
50%        1.000000
75%        1.000000
max        1.000000
Name: isResistant, dtype: float64

In [4]:

# 70/30 split, stratified on the label and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=0.7, test_size=0.3,
    random_state=100,
    stratify=y,
)


In [5]:
from sklearn.svm import LinearSVC

# Fix the seed so the fit, and therefore the metrics printed below, can be
# reproduced; the random forest and gradient boosting cells are seeded as well.
model_svm= LinearSVC(random_state=100)

model_svm.fit(X_train, y_train)

y_pred= model_svm.predict(X_test)

# LinearSVC has no feature_importances_, so that section is switched off.
model_performance_metrics(model_svm, X, X_test, X_train, y, y_test, y_pred, show_feature_importances= False)


Accuracy: 0.7362637362637363
Mean Absolute Error: 0.26373626373626374
Mean Squared Error: 0.26373626373626374
Root Mean Squared Error: 0.5135525910130955
-------------------------------------------------------------------------------- 



In [6]:
from sklearn.ensemble import RandomForestClassifier

# 100 trees, depth capped at 5, at least 50 samples required per split and per leaf.
model_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    min_samples_split=50,
    min_samples_leaf=50,
    random_state=100,
)
model_rf.fit(X_train, y_train)

# Predict the held-out rows and report metrics plus feature importances.
y_pred = model_rf.predict(X_test)
model_performance_metrics(model_rf, X, X_test, X_train, y, y_test, y_pred)

Accuracy: 0.8021978021978022
Mean Absolute Error: 0.1978021978021978
Mean Squared Error: 0.1978021978021978
Root Mean Squared Error: 0.4447495899966607
-------------------------------------------------------------------------------- 

=== Feature Importances ===
                    importance
NC000962_3.3478244      0.1875
NC000962_3.1480184      0.1250
NC000962_3.1480945      0.0625
NC000962_3.3478245      0.0625
NC000962_3.3843572      0.0625
...                        ...
NC000962_3.1398290      0.0000
NC000962_3.1398287      0.0000
NC000962_3.1398271      0.0000
NC000962_3.1398251      0.0000
NC000962_3.4411245      0.0000

[118668 rows x 1 columns]


In [7]:
from sklearn.ensemble import GradientBoostingClassifier

# 100 boosting stages with trees of depth 5; fit() returns the estimator itself.
model_gb = GradientBoostingClassifier(
    n_estimators=100, max_depth=5, random_state=100
).fit(X_train, y_train)

# Predict the held-out rows and report metrics plus feature importances.
y_pred = model_gb.predict(X_test)
model_performance_metrics(model_gb, X, X_test, X_train, y, y_test, y_pred)

Accuracy: 0.7692307692307693
Mean Absolute Error: 0.23076923076923078
Mean Squared Error: 0.23076923076923078
Root Mean Squared Error: 0.4803844614152614
-------------------------------------------------------------------------------- 

=== Feature Importances ===
                    importance
NC000962_3.1637145    0.057574
NC000962_3.3750587    0.040738
NC000962_3.3735813    0.039937
NC000962_3.1637035    0.039575
NC000962_3.3941568    0.031643
...                        ...
NC000962_3.1400440    0.000000
NC000962_3.1400437    0.000000
NC000962_3.1400436    0.000000
NC000962_3.1400423    0.000000
NC000962_3.4411245    0.000000

[118668 rows x 1 columns]


In [8]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings.
# Alternative left for later experiments: sklearn.naive_bayes.BernoulliNB.
model_nb = GaussianNB()
model_nb.fit(X_train, y_train)

# Feature importances are switched off for this model.
y_pred = model_nb.predict(X_test)
model_performance_metrics(
    model_nb, X, X_test, X_train, y, y_test, y_pred,
    show_feature_importances=False,
)

Accuracy: 0.6593406593406593
Mean Absolute Error: 0.34065934065934067
Mean Squared Error: 0.34065934065934067
Root Mean Squared Error: 0.5836602955995385
-------------------------------------------------------------------------------- 



In [9]:
from sklearn.neural_network import MLPClassifier

# Small network with hidden layers of 5 and 2 units, trained with L-BFGS.
model_mlp = MLPClassifier(
    hidden_layer_sizes=(5, 2),
    solver='lbfgs',
    alpha=1e-5,
    random_state=1,
)
model_mlp.fit(X_train, y_train)

# Feature importances are switched off for this model.
y_pred = model_mlp.predict(X_test)
model_performance_metrics(
    model_mlp, X, X_test, X_train, y, y_test, y_pred,
    show_feature_importances=False,
)

Accuracy: 0.7472527472527473
Mean Absolute Error: 0.25274725274725274
Mean Squared Error: 0.25274725274725274
Root Mean Squared Error: 0.5027397465361703
-------------------------------------------------------------------------------- 



In [10]:
from xgboost import XGBClassifier

# XGBoost with library defaults apart from the learning rate and the seed.
model_xgb = XGBClassifier(random_state=1, learning_rate=0.01)
model_xgb.fit(X_train, y_train)

# Predict the held-out rows and report metrics plus feature importances.
y_pred = model_xgb.predict(X_test)
model_performance_metrics(model_xgb, X, X_test, X_train, y, y_test, y_pred)

Accuracy: 0.7912087912087912
Mean Absolute Error: 0.2087912087912088
Mean Squared Error: 0.2087912087912088
Root Mean Squared Error: 0.4569367667316877
-------------------------------------------------------------------------------- 

=== Feature Importances ===
                    importance
NC000962_3.841353     0.056812
NC000962_3.104941     0.047203
NC000962_3.1637145    0.037451
NC000962_3.333008     0.035202
NC000962_3.1573220    0.034451
...                        ...
NC000962_3.1400077    0.000000
NC000962_3.1400074    0.000000
NC000962_3.1400072    0.000000
NC000962_3.1399635    0.000000
NC000962_3.4411245    0.000000

[118668 rows x 1 columns]


In [11]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier

# One seed for every estimator that accepts random_state, so the stacked model
# gives the same result on each run (GaussianNB takes no seed). Apart from the
# seed, the base learners keep their library defaults.
STACK_SEED = 100

estimators = [
    ('rf', RandomForestClassifier(random_state=STACK_SEED)),
    ('gb', GradientBoostingClassifier(random_state=STACK_SEED)),
    ('svm', LinearSVC(random_state=STACK_SEED)),
    ('mlp', MLPClassifier(random_state=STACK_SEED)),
    ('nb', GaussianNB()),
    ('xgb', XGBClassifier(random_state=STACK_SEED))
]

# A random forest combines the base learners' outputs into the final prediction.
model_se = StackingClassifier(
    estimators=estimators,
    final_estimator=RandomForestClassifier(random_state=STACK_SEED)
)

model_se.fit(X_train, y_train)

y_pred = model_se.predict(X_test)

# Feature importances are switched off for the stacked model.
model_performance_metrics(model_se, X, X_test, X_train, y, y_test, y_pred, show_feature_importances= False)

Accuracy: 0.7692307692307693
Mean Absolute Error: 0.23076923076923078
Mean Squared Error: 0.23076923076923078
Root Mean Squared Error: 0.4803844614152614
-------------------------------------------------------------------------------- 

