In [1]:

# Import the usual suspects.

from __future__ import print_function
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix




import warnings
warnings.filterwarnings('ignore')
sns.set_style('whitegrid')
sns.set_context('paper')



def print_ln():
    """Print an 80-dash horizontal rule followed by a blank line.

    Used as a visual separator between sections of printed reports.
    """
    rule = '-' * 80
    # print() joins the two arguments with a space and appends its own
    # newline, so the rule is followed by " \n\n".
    print(rule, '\n')


In [29]:

def model_performance_metrics(model, X, X_test, X_train, y, y_test, y_pred, detailed= False, show_feature_importances= True, cv= 10, scoring= None):
    """Print a plain-text evaluation report for a fitted classifier.

    Always prints accuracy, mean absolute error, mean squared error and root
    mean squared error of ``y_pred`` against ``y_test``. Optionally prints the
    model's feature importances and, in ``detailed`` mode, a confusion matrix,
    a classification report and cross-validation scores.

    Parameters
    ----------
    model : fitted estimator
        Read for ``feature_importances_``; passed to ``cross_val_score`` when
        ``detailed`` is true.
    X, y : array-like
        Full feature matrix and targets; only used for cross-validation.
    X_test : array-like
        Not used by this function; kept so existing callers keep working.
    X_train : pandas.DataFrame or array-like
        Training features. If it has a ``columns`` attribute those names label
        the importances; otherwise the importances are labelled by position.
    y_test, y_pred : array-like
        True and predicted targets for the held-out set. A 2-D target with
        more than one column is treated as a multi-label indicator matrix.
    detailed : bool, default False
        Also print confusion matrix, classification report and CV scores.
    show_feature_importances : bool, default True
        Print the sorted feature importances. Skipped with a notice when the
        model does not expose ``feature_importances_``.
    cv : int, default 10
        Forwarded to ``cross_val_score``.
    scoring : str, callable or None, default None
        Forwarded to ``cross_val_score``. ``None`` means the estimator's own
        default scorer, so the scores are only AUC if e.g. ``'roc_auc'`` is
        requested here.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
    # Compute the MSE once and reuse it for the RMSE line.
    mse = metrics.mean_squared_error(y_test, y_pred)
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', np.sqrt(mse))
    print_ln()

    if show_feature_importances:
        # Not every estimator (e.g. linear SVMs, naive Bayes, meta-estimators)
        # exposes feature_importances_; report that instead of crashing.
        importances = getattr(model, 'feature_importances_', None)

        if importances is None:
            print("Feature importances are not available for", type(model).__name__)
        else:
            # X_train may be a plain NumPy array without column names; in that
            # case index=None makes pandas label features by position.
            feature_names = getattr(X_train, 'columns', None)

            feature_importances = pd.DataFrame(importances,
                                               index=feature_names,
                                               columns=['importance']).sort_values('importance', ascending=False)

            print("=== Feature Importances ===")
            print(feature_importances)

    if detailed:
        model_score = cross_val_score(model, X, y, cv=cv, scoring=scoring)

        # confusion_matrix rejects multi-label indicator targets, so switch to
        # the per-label variant when y_test has several label columns.
        is_multilabel = np.ndim(y_test) == 2 and np.shape(y_test)[1] > 1

        print("=== Confusion Matrix ===")
        if is_multilabel:
            print(metrics.multilabel_confusion_matrix(y_test, y_pred))
        else:
            print(confusion_matrix(y_test, y_pred))
        print_ln()

        print("=== Classification Report ===")
        print(classification_report(y_test, y_pred))
        print_ln()

        # Name the metric that was actually computed rather than claiming AUC.
        score_name = scoring if scoring is not None else 'estimator default score'

        print("=== All Cross-Validation Scores ({}) ===".format(score_name))
        print(model_score)

        print_ln()

        print("=== Mean Cross-Validation Score ({}) ===".format(score_name))
        print(model_score.mean())
        print_ln()


In [4]:
# Read the processed multi-resistance CSV and preview its first rows.
multi_resistance_df_filledna = pd.read_csv(
    "../data/processed/multi_resistance_filledna_df.csv"
)
multi_resistance_df_filledna.head()

Unnamed: 0.1,Unnamed: 0,SampleID,NC000962_3.78,NC000962_3.80,NC000962_3.102,NC000962_3.104,NC000962_3.117,NC000962_3.120,NC000962_3.135,NC000962_3.138,...,amikacin_resistance,kanamycin_resistance,capreomycin_resistance,ethionamide_resistance,para-aminosalicylic_acid_resistance,cycloserine_resistance,linezolid_resistance,bedaquiline_resistance,clofazimine_resistance,delamanid_resistance
0,0,ERR3129939,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,ERR3148148,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,ERR3148149,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,ERR3148151,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,ERR3148153,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Drop the 'Unnamed: 0' column (presumably a positional index left over from
# a previous CSV round-trip - confirm) and index the rows by SampleID.
# The guard makes the cell safe to re-run: after the first run SampleID is
# already the index and 'Unnamed: 0' is gone, so repeating the drop/set_index
# would raise a KeyError.
if 'SampleID' in multi_resistance_df_filledna.columns:
    multi_resistance_df_filledna = (
        multi_resistance_df_filledna
        .drop(columns=['Unnamed: 0'], errors='ignore')
        .set_index('SampleID')
    )
multi_resistance_df_filledna.head()

Unnamed: 0_level_0,NC000962_3.78,NC000962_3.80,NC000962_3.102,NC000962_3.104,NC000962_3.117,NC000962_3.120,NC000962_3.135,NC000962_3.138,NC000962_3.150,NC000962_3.155,...,amikacin_resistance,kanamycin_resistance,capreomycin_resistance,ethionamide_resistance,para-aminosalicylic_acid_resistance,cycloserine_resistance,linezolid_resistance,bedaquiline_resistance,clofazimine_resistance,delamanid_resistance
SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ERR3129939,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ERR3148148,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ERR3148149,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ERR3148151,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ERR3148153,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# X = multi_resistance_df_filledna.loc[:, multi_resistance_df_filledna.columns != 'isResistant']
# y = multi_resistance_df_filledna.loc[:, 'isResistant']

# Experimentation with multi-label datasets


In [24]:
from sklearn.datasets import make_multilabel_classification as make_ml_clf

# Generate a small synthetic multi-label dataset (150 samples, 10 features,
# 3 classes). return_distributions=True makes the call return two extra
# values, kept here as p_c and p_w_c.
X, y, p_c, p_w_c = make_ml_clf(
    n_samples=150,
    n_features=10,
    n_classes=3,
    n_labels=2,
    allow_unlabeled=False,
    return_distributions=True,
    random_state=1234,
)

X



array([[ 5.,  8.,  3., ...,  3.,  3.,  6.],
       [ 5.,  8.,  6., ...,  5.,  4., 10.],
       [ 2.,  7.,  3., ..., 12.,  5., 10.],
       ...,
       [ 6.,  9.,  2., ...,  6.,  3.,  5.],
       [ 4., 12.,  6., ...,  3.,  1.,  9.],
       [ 2.,  7.,  5., ...,  8.,  4.,  7.]])

In [25]:
# Display the generated targets: one row per sample, one 0/1 column per class.
y


array([[0, 0, 1],
       [1, 1, 1],
       [0, 1, 1],
       [0, 1, 0],
       [1, 1, 0],
       [1, 0, 0],
       [1, 1, 1],
       [0, 0, 1],
       [0, 1, 1],
       [0, 1, 0],
       [1, 1, 1],
       [0, 1, 1],
       [0, 0, 1],
       [1, 0, 1],
       [1, 1, 1],
       [1, 1, 1],
       [0, 1, 1],
       [0, 1, 1],
       [1, 1, 1],
       [0, 1, 1],
       [1, 1, 1],
       [1, 1, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 1, 1],
       [0, 1, 1],
       [0, 1, 1],
       [1, 0, 1],
       [0, 1, 1],
       [0, 1, 1],
       [0, 1, 1],
       [1, 0, 0],
       [0, 1, 1],
       [0, 0, 1],
       [1, 1, 1],
       [1, 1, 1],
       [1, 1, 1],
       [0, 1, 1],
       [1, 1, 1],
       [1, 1, 1],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [1, 1, 0],
       [0, 1, 1],
       [0, 1, 0],
       [1, 0, 0],
       [1, 1, 1],
       [1, 1, 1],
       [0, 0, 1],
       [0, 1, 0],
       [1, 1, 0],
       [0, 1, 1],
       [1, 1, 1],
       [0, 1, 1],
       [0,

# Scikit-learn models

In [26]:
# 70/30 train/test split with a fixed seed; stratify=y asks the splitter to
# preserve the target distribution in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    stratify=y,
    random_state=100,
)



In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

# LinearSVC only accepts a 1-D target, but y_train from make_ml_clf is a
# multi-label indicator matrix (one column per class). OneVsRestClassifier
# fits one binary LinearSVC per label column and also still works if y is
# later swapped for a single 1-D target.
model_svm = OneVsRestClassifier(LinearSVC())

model_svm.fit(X_train, y_train)

y_pred = model_svm.predict(X_test)


# Neither LinearSVC nor the wrapper exposes feature_importances_, so skip them.
model_performance_metrics(model_svm, X, X_test, X_train, y, y_test, y_pred, show_feature_importances= False)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Shallow, heavily regularised forest: 100 trees of depth <= 5, with at least
# 50 samples required both to split a node and in every leaf.
model_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    min_samples_split=50,
    min_samples_leaf=50,
    random_state=100,
)

model_rf.fit(X_train, y_train)

y_pred = model_rf.predict(X_test)

model_performance_metrics(model_rf, X, X_test, X_train, y, y_test, y_pred)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.multiclass import OneVsRestClassifier

# GradientBoostingClassifier requires a 1-D target, but y_train here is a
# multi-label indicator matrix, so fit one boosted model per label column.
model_gb = OneVsRestClassifier(
    GradientBoostingClassifier(
        n_estimators=100,
        random_state=100,
        max_depth=5,
    )
)

model_gb.fit(X_train, y_train)

y_pred = model_gb.predict(X_test)


# The one-vs-rest wrapper has no single feature_importances_ attribute;
# per-label importances live on each fitted model in model_gb.estimators_.
model_performance_metrics(model_gb, X, X_test, X_train, y, y_test, y_pred, show_feature_importances= False)

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import GaussianNB
# from sklearn.naive_bayes import BernoulliNB

# GaussianNB only accepts a 1-D target, but y_train here is a multi-label
# indicator matrix, so fit one naive Bayes model per label column.
model_nb = OneVsRestClassifier(GaussianNB())
# model_nb = OneVsRestClassifier(BernoulliNB())

model_nb.fit(X_train, y_train)

y_pred = model_nb.predict(X_test)


# Naive Bayes exposes no feature_importances_, so skip that section.
model_performance_metrics(model_nb, X, X_test, X_train, y, y_test, y_pred, show_feature_importances= False)

In [None]:
from sklearn.neural_network import MLPClassifier

# Small two-hidden-layer network (5 and 2 units) trained with L-BFGS and a
# light L2 penalty; seeded for repeatable weights.
model_mlp = MLPClassifier(
    hidden_layer_sizes=(5, 2),
    solver='lbfgs',
    alpha=1e-5,
    random_state=1,
)

model_mlp.fit(X_train, y_train)

y_pred = model_mlp.predict(X_test)

# MLPClassifier has no feature_importances_, so that section is switched off.
model_performance_metrics(model_mlp, X, X_test, X_train, y, y_test, y_pred, show_feature_importances= False)

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees with a small learning rate and a fixed seed.
# NOTE(review): whether XGBClassifier accepts the 2-D multi-label y_train
# directly depends on the installed xgboost version - confirm before relying
# on this cell.
model_xgb = XGBClassifier(learning_rate=0.01, random_state=1)

model_xgb.fit(X_train, y_train)

y_pred = model_xgb.predict(X_test)

model_performance_metrics(model_xgb, X, X_test, X_train, y, y_test, y_pred)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import StackingClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier


# Base learners whose predictions feed the final (meta) estimator.
estimators = [
    ('rf', RandomForestClassifier()),
    ('gb', GradientBoostingClassifier()),
    ('svc', LinearSVC()),
    ('mlp', MLPClassifier()),
    ('nb', GaussianNB()),
    ('xgb', XGBClassifier())
]

# Several base learners (GradientBoostingClassifier, LinearSVC, GaussianNB)
# only accept a 1-D target, so fitting the stack directly on the multi-label
# indicator matrix y_train fails. Wrapping the whole stack in
# OneVsRestClassifier trains one independent stacked model per label column,
# each of which sees an ordinary binary target.
model_se = OneVsRestClassifier(
    StackingClassifier(
        estimators=estimators,
        final_estimator=RandomForestClassifier()
    )
)

model_se.fit(X_train, y_train)

y_pred = model_se.predict(X_test)

# The wrapped stack exposes no single feature_importances_ attribute.
model_performance_metrics(model_se, X, X_test, X_train, y, y_test, y_pred, show_feature_importances= False)