In [1]:

# Import the usual suspects.

from __future__ import print_function
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix




import warnings
warnings.filterwarnings('ignore')
sns.set_style('whitegrid')
sns.set_context('paper')



def print_ln():
    print('-' * 80, '\n')


In [2]:

def model_performance_metrics(model, X, X_test, X_train, y, y_test, y_pred, detailed= False):

    print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
    print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
    print_ln()
    
    feature_importances = pd.DataFrame(model.feature_importances_,
                                       index =  X_train.columns,
                                       columns=['importance']).sort_values('importance', ascending=False)


    print("=== Feature Importances ===")
    print(feature_importances)


    
    if detailed:
        model_score = cross_val_score(model, X, y, cv=10)


        print("=== Confusion Matrix ===")
        print(confusion_matrix(y_test, y_pred))
        print_ln()

        print("=== Classification Report ===")
        print(classification_report(y_test, y_pred))
        print_ln()

        print("=== All AUC Scores ===")
        print(model_score)

        print_ln()

        print("=== Mean AUC Score ===")
        print(model_score.mean())
        print_ln()





In [4]:
multi_resistance_df_filledna = pd.read_csv("../data/processed/multi_resistance_filledna_df.csv")

multi_resistance_df_filledna.head()

Unnamed: 0.1,Unnamed: 0,SampleID,NC000962_3.78,NC000962_3.80,NC000962_3.102,NC000962_3.104,NC000962_3.117,NC000962_3.120,NC000962_3.135,NC000962_3.138,...,amikacin_resistance,kanamycin_resistance,capreomycin_resistance,ethionamide_resistance,para-aminosalicylic_acid_resistance,cycloserine_resistance,linezolid_resistance,bedaquiline_resistance,clofazimine_resistance,delamanid_resistance
0,0,ERR3129939,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,ERR3148148,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,ERR3148149,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,ERR3148151,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,ERR3148153,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
multi_resistance_df_filledna = multi_resistance_df_filledna.drop(columns= ['Unnamed: 0']).set_index('SampleID')
multi_resistance_df_filledna.head()

Unnamed: 0_level_0,NC000962_3.78,NC000962_3.80,NC000962_3.102,NC000962_3.104,NC000962_3.117,NC000962_3.120,NC000962_3.135,NC000962_3.138,NC000962_3.150,NC000962_3.155,...,amikacin_resistance,kanamycin_resistance,capreomycin_resistance,ethionamide_resistance,para-aminosalicylic_acid_resistance,cycloserine_resistance,linezolid_resistance,bedaquiline_resistance,clofazimine_resistance,delamanid_resistance
SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ERR3129939,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ERR3148148,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ERR3148149,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ERR3148151,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ERR3148153,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# X = multi_resistance_df_filledna.loc[:, multi_resistance_df_filledna.columns != 'isResistant']
# y = multi_resistance_df_filledna.loc[:, 'isResistant']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    stratify=y,
                                                    train_size=0.7,
                                                    test_size=0.3,
                                                    random_state=100)



In [23]:
from sklearn.ensemble import RandomForestClassifier



model_rf= RandomForestClassifier(n_estimators= 100,
                                  random_state = 100,
                                  max_depth=5,
                                  min_samples_leaf=50,
                                  min_samples_split=50)

model_rf.fit(X_train, y_train)

y_pred= model_rf.predict(X_test)



In [27]:
model_performance_metrics(model_rf, X, X_test, X_train, y, y_test, y_pred)

Accuracy: 0.8021978021978022
Mean Absolute Error: 0.1978021978021978
Mean Squared Error: 0.1978021978021978
Root Mean Squared Error: 0.4447495899966607
-------------------------------------------------------------------------------- 

=== Feature Importances ===
                    importance
NC000962_3.3478244      0.1875
NC000962_3.1480184      0.1250
NC000962_3.1480945      0.0625
NC000962_3.3478245      0.0625
NC000962_3.3843572      0.0625
...                        ...
NC000962_3.1398290      0.0000
NC000962_3.1398287      0.0000
NC000962_3.1398271      0.0000
NC000962_3.1398251      0.0000
NC000962_3.4411245      0.0000

[118668 rows x 1 columns]


In [28]:
from sklearn.ensemble import GradientBoostingClassifier

model_gb= GradientBoostingClassifier(
                                     n_estimators= 100,
                                     random_state = 100,
                                     max_depth=5
                                    )

model_gb.fit(X_train, y_train)

y_pred= model_gb.predict(X_test)



In [29]:
model_performance_metrics(model_gb, X, X_test, X_train, y, y_test, y_pred)

Accuracy: 0.7692307692307693
Mean Absolute Error: 0.23076923076923078
Mean Squared Error: 0.23076923076923078
Root Mean Squared Error: 0.4803844614152614
-------------------------------------------------------------------------------- 

=== Feature Importances ===
                    importance
NC000962_3.1637145    0.057574
NC000962_3.3750587    0.040738
NC000962_3.3735813    0.039937
NC000962_3.1637035    0.039575
NC000962_3.3941568    0.031643
...                        ...
NC000962_3.1400440    0.000000
NC000962_3.1400437    0.000000
NC000962_3.1400436    0.000000
NC000962_3.1400423    0.000000
NC000962_3.4411245    0.000000

[118668 rows x 1 columns]


In [31]:
from sklearn.naive_bayes import GaussianNB
# from sklearn.naive_bayes import BernoullliNB

model_nb= GaussianNB()

model_nb.fit(X_train, y_train)

y_pred= model_nb.predict(X_test)



In [None]:
model_performance_metrics(model_nb, X, X_test, X_train, y, y_test, y_pred)

In [33]:
from sklearn.neural_network import MLPClassifier

model_mlp = MLPClassifier(
                          solver='lbfgs', 
                          alpha=1e-5,
                          hidden_layer_sizes=(5, 2), 
                          random_state=1
)



model_mlp.fit(X_train, y_train)

y_pred= model_mlp.predict(X_test)



In [34]:
model_performance_metrics(model_mlp, X, X_test, X_train, y, y_test, y_pred)

Accuracy: 0.7472527472527473
Mean Absolute Error: 0.25274725274725274
Mean Squared Error: 0.25274725274725274
Root Mean Squared Error: 0.5027397465361703
=== Confusion Matrix ===
[[ 5 13]
 [10 63]]
-------------------------------------------------------------------------------- 

=== Classification Report ===
              precision    recall  f1-score   support

           0       0.33      0.28      0.30        18
           1       0.83      0.86      0.85        73

    accuracy                           0.75        91
   macro avg       0.58      0.57      0.57        91
weighted avg       0.73      0.75      0.74        91

-------------------------------------------------------------------------------- 

=== All AUC Scores ===
[0.80645161 0.8        0.8        0.8        0.8        0.8
 0.8        0.8        0.8        0.8       ]
-------------------------------------------------------------------------------- 

=== Mean AUC Score ===
Mean AUC Score - Random Forest:  0.800645161

In [5]:
from xgboost import XGBClassifier

model_xgb = XGBClassifier(
                          learning_rate= 0.01, 
                          random_state= 1
)



model_xgb.fit(X_train, y_train)

y_pred= model_xgb.predict(X_test)


In [8]:
model_performance_metrics(model_xgb, X, X_test, X_train, y, y_test, y_pred)

Accuracy: 0.7912087912087912
Mean Absolute Error: 0.2087912087912088
Mean Squared Error: 0.2087912087912088
Root Mean Squared Error: 0.4569367667316877
=== Confusion Matrix ===
[[ 4 14]
 [ 5 68]]
-------------------------------------------------------------------------------- 

=== Classification Report ===
              precision    recall  f1-score   support

           0       0.44      0.22      0.30        18
           1       0.83      0.93      0.88        73

    accuracy                           0.79        91
   macro avg       0.64      0.58      0.59        91
weighted avg       0.75      0.79      0.76        91

-------------------------------------------------------------------------------- 

=== All AUC Scores ===
[0.70967742 0.76666667 0.8        0.8        0.86666667 0.86666667
 0.9        0.83333333 0.73333333 0.36666667]
-------------------------------------------------------------------------------- 

=== Mean AUC Score ===
Mean AUC Score - Random Forest:  0.7643

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import StackingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier




estimators = [
    ('rf', RandomForestClassifier()),
    ('gb', GradientBoostingClassifier()),
    ('svc', LinearSVC()),
    ('mlp', MLPClassifier()),
    ('nb', GaussianNB()),
    ('xgb', XGBClassifier())
]

model_se = StackingClassifier(
    estimators=estimators,
    final_estimator=RandomForestClassifier()
)

model_se.fit(X_train, y_train)

y_pred = model_se.predict(X_test)

In [12]:
model_performance_metrics(model_se, X, X_test, X_train, y, y_test, y_pred)

Accuracy: 0.7802197802197802
Mean Absolute Error: 0.21978021978021978
Mean Squared Error: 0.21978021978021978
Root Mean Squared Error: 0.4688072309384954


KeyboardInterrupt: 