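NOTE: Naive Bayes (NB) models do not expose a `feature_importances_` attribute. For SVM classifiers, see the following discussion on determining the most contributing features: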
https://stackoverflow.com/questions/41592661/determining-the-most-contributing-features-for-svm-classifier-in-sklearn

NOTE: We could use ELI5 to explain the predictions of the models:
https://github.com/TeamHG-Memex/eli5

NOTE: There are other explanation-oriented libraries as well, e.g. Yellowbrick:
https://github.com/DistrictDataLabs/yellowbrick


In [27]:
# Import the usual suspects.

from __future__ import print_function
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

from sklearn.model_selection import cross_val_score

from sklearn.metrics import classification_report, confusion_matrix




import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings from
# pandas / scikit-learn, which may be worth seeing while tuning models.
warnings.filterwarnings('ignore')
# Global seaborn look: white grid background, "paper" scaling context.
sns.set_style('whitegrid')
sns.set_context('paper')



def print_ln():
    """Print an 80-character dashed rule followed by an empty line."""
    rule = '-' * 80
    # Same bytes as print(rule, '\n'): rule, a space, then two newlines.
    print(rule, end=' \n\n')


In [28]:

def model_performance_metrics(model, X, X_test, X_train, y, y_test, y_pred, detailed= False, show_feature_importances= True):
    """Print evaluation metrics for an already-fitted classifier.

    Parameters
    ----------
    model : fitted estimator
        Used for ``feature_importances_`` and, when ``detailed`` is true,
        passed to ``cross_val_score`` for 10-fold cross-validation.
    X, y : feature matrix and labels
        Only used for the cross-validation run when ``detailed`` is true.
    X_test : unused
        Kept so existing positional calls keep working.
    X_train : pandas.DataFrame
        Its column names label the rows of the feature-importance table.
    y_test, y_pred : array-like
        True and predicted labels of the held-out set.
    detailed : bool, default False
        Also print the confusion matrix, the classification report and the
        10-fold cross-validated ROC-AUC scores.
    show_feature_importances : bool, default True
        Print the importances sorted in descending order. Models without a
        ``feature_importances_`` attribute get a short notice instead of an
        ``AttributeError``.

    Returns
    -------
    None. Everything is printed.
    """
    print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
    # Error metrics computed directly on the class labels.
    mse = metrics.mean_squared_error(y_test, y_pred)
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', np.sqrt(mse))
    print_ln()

    if show_feature_importances:
        print("=== Feature Importances ===")
        importances = getattr(model, 'feature_importances_', None)
        if importances is None:
            # Not every estimator exposes importances; report instead of crashing.
            print("{} does not expose feature_importances_".format(type(model).__name__))
        else:
            feature_importances = pd.DataFrame(importances,
                                               index=X_train.columns,
                                               columns=['importance']).sort_values('importance', ascending=False)
            print(feature_importances)

    if detailed:
        # The sections below are labelled "AUC", so score with ROC-AUC
        # explicitly; without `scoring` cross_val_score falls back to the
        # estimator's default score.
        model_score = cross_val_score(model, X, y, cv=10, scoring='roc_auc')

        print("=== Confusion Matrix ===")
        print(confusion_matrix(y_test, y_pred))
        print_ln()

        print("=== Classification Report ===")
        print(classification_report(y_test, y_pred))
        print_ln()

        print("=== All AUC Scores ===")
        print(model_score)

        print_ln()

        print("=== Mean AUC Score ===")
        print(model_score.mean())
        print_ln()





In [29]:
import json

# Read the JSON file holding the genome-ID lists (including the train/test split).
with open("../data/raw/test_train_genome_ids.json") as ids_file:
    all_genomes_ids_dict = json.loads(ids_file.read())

# Show which ID lists are available.
all_genomes_ids_dict.keys()

dict_keys(['tb_portals', 'htbc', 'final_tbportals_train_genomes', 'final_htbc_test_genomes'])

In [30]:
# Pull the two ID lists that define the manual train/test split.
final_tbportals_train_genomes_ids, final_htbc_test_genomes_ids = (
    all_genomes_ids_dict[split_key]
    for split_key in ('final_tbportals_train_genomes', 'final_htbc_test_genomes')
)


In [31]:
final_htbc_test_genomes_ids

['ERR3335735',
 'SRR8552929',
 'ERR067629',
 'ERR067714',
 'SRR5065314',
 'ERR067659',
 'ERR067590',
 'ERR688027',
 'ERR3335727',
 'ERR3335759',
 'ERR067602',
 'ERR047883',
 'ERR3335777',
 'ERR3335801',
 'SRR5065287',
 'SRR8552930',
 'ERR027467',
 'ERR3335774',
 'ERR067635',
 'ERR688020',
 'SRR5065410',
 'ERR3335756',
 'ERR3335802',
 'ERR067598',
 'ERR027465',
 'SRR8552927',
 'ERR688037',
 'SRR5065279',
 'ERR067596',
 'ERR3335772',
 'ERR3335740',
 'SRR5065376',
 'ERR067718',
 'ERR3335794',
 'ERR067738',
 'SRR5065276',
 'SRR5065230',
 'SRR5065258',
 'ERR067705',
 'SRR5065361',
 'ERR067576',
 'ERR688016',
 'ERR688022',
 'ERR067622',
 'SRR5065403',
 'ERR067586',
 'SRR11638839',
 'SRR5065231',
 'SRR5065281',
 'SRR5065214',
 'ERR067577',
 'SRR5065288',
 'SRR5065328',
 'SRR5065247',
 'ERR067703',
 'SRR5065292',
 'ERR3335780',
 'SRR5065250',
 'ERR3335798',
 'ERR067722',
 'SRR5065241',
 'ERR067653',
 'ERR3335785',
 'ERR688019',
 'ERR027459',
 'ERR047886',
 'SRR5065289',
 'ERR067715',
 'ERR0676

In [32]:
# Drug columns as they appear in the raw table.
drugs_column_names = [
    'rifampicin', 'isoniazid', 'pyrazinamide', 'ethambutol', 'streptomycin',
    'fluoroquinolones', 'moxifloxacin', 'ofloxacin', 'levofloxacin',
    'ciprofloxacin', 'aminoglycosides', 'amikacin', 'kanamycin',
    'capreomycin', 'ethionamide', 'para-aminosalicylic_acid', 'cycloserine',
    'linezolid', 'bedaquiline', 'clofazimine', 'delamanid',
]

lineage_column_names = ['main_lin', 'sublin']

resistance_status_column_names = ['drtype', 'MDR', 'XDR', 'Resistance_Status']

# Every drug column is renamed by appending "_resistance"; derive the mapping
# and the renamed list from the single list above so they cannot drift apart.
renamed_drug_columns_names_dict = {
    drug: drug + '_resistance' for drug in drugs_column_names
}

renamed_drug_columns_names = list(renamed_drug_columns_names_dict.values())

In [33]:
# Alternative input, currently unused:
# mono_resistance_df_filledna = pd.read_csv("../data/processed/mono_resistance_df_filledna.csv").set_index('SampleID')

# Load the processed multilabel table and index it by sample ID.
binarized_final_df = (
    pd.read_csv("../data/processed/final.binarized_final_multilabel_df.csv")
      .set_index('SampleID')
)

binarized_final_df.head()

Unnamed: 0_level_0,NC000962_3.22,NC000962_3.434,NC000962_3.524,NC000962_3.645,NC000962_3.648,NC000962_3.654,NC000962_3.666,NC000962_3.675,NC000962_3.678,NC000962_3.693,...,linezolid_resistance,bedaquiline_resistance,clofazimine_resistance,delamanid_resistance,main_lin,sublin,drtype,MDR,XDR,Resistance_Status
SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ERR027458,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,lineage2,lineage2.2.1,MDR,R,,Resistant
ERR027459,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,lineage2,lineage2.2.1,Drug-resistant,,,Resistant
ERR027460,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,,,Sensitive,,,Sensitive
ERR027461,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,lineage2,lineage2.2.1,Sensitive,,,Sensitive
ERR027462,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,lineage2,lineage2.2.1,Sensitive,,,Sensitive


In [34]:
binarized_final_df.columns

Index(['NC000962_3.22', 'NC000962_3.434', 'NC000962_3.524', 'NC000962_3.645',
       'NC000962_3.648', 'NC000962_3.654', 'NC000962_3.666', 'NC000962_3.675',
       'NC000962_3.678', 'NC000962_3.693',
       ...
       'linezolid_resistance', 'bedaquiline_resistance',
       'clofazimine_resistance', 'delamanid_resistance', 'main_lin', 'sublin',
       'drtype', 'MDR', 'XDR', 'Resistance_Status'],
      dtype='object', length=52709)

In [35]:
binarized_final_df.index

Index(['ERR027458', 'ERR027459', 'ERR027460', 'ERR027461', 'ERR027462',
       'ERR027463', 'ERR027464', 'ERR027465', 'ERR027466', 'ERR027467',
       ...
       'SRR9738505', 'SRR9738506', 'SRR9738521', 'SRR9738526', 'SRR9738527',
       'SRR9738535', 'SRR9738538', 'SRR9738554', 'SRR9738556', 'SRR9738557'],
      dtype='object', name='SampleID', length=1725)

In [36]:
# Remove the per-drug resistance columns, the lineage columns and the
# drtype/MDR/XDR columns; 'Resistance_Status' is kept.
binarized_final_df = binarized_final_df.drop(
    columns=renamed_drug_columns_names + lineage_column_names + ['drtype', 'MDR', 'XDR']
)

binarized_final_df.head()

Unnamed: 0_level_0,NC000962_3.22,NC000962_3.434,NC000962_3.524,NC000962_3.645,NC000962_3.648,NC000962_3.654,NC000962_3.666,NC000962_3.675,NC000962_3.678,NC000962_3.693,...,NC000962_3.4410251,NC000962_3.4410260,NC000962_3.4410272,NC000962_3.4410278,NC000962_3.4410728,NC000962_3.4410850,NC000962_3.4411016,NC000962_3.4411170,NC000962_3.4411327,Resistance_Status
SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ERR027458,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Resistant
ERR027459,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Resistant
ERR027460,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Sensitive
ERR027461,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Sensitive
ERR027462,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Sensitive


In [37]:
binarized_final_df.shape

(1725, 52683)

In [48]:
# Encode the label: 'Sensitive' -> 0.0, anything else -> 1.0.
# The dtype guard makes the cell safe to re-run: once the column is numeric,
# no value equals 'Sensitive', so an unguarded second run would turn every
# label into 1.0.
if not pd.api.types.is_numeric_dtype(binarized_final_df['Resistance_Status']):
    binarized_final_df['Resistance_Status'] = (
        binarized_final_df['Resistance_Status'].ne('Sensitive').astype(float)
    )
binarized_final_df.head()

Unnamed: 0_level_0,NC000962_3.22,NC000962_3.434,NC000962_3.524,NC000962_3.645,NC000962_3.648,NC000962_3.654,NC000962_3.666,NC000962_3.675,NC000962_3.678,NC000962_3.693,...,NC000962_3.4410251,NC000962_3.4410260,NC000962_3.4410272,NC000962_3.4410278,NC000962_3.4410728,NC000962_3.4410850,NC000962_3.4411016,NC000962_3.4411170,NC000962_3.4411327,Resistance_Status
SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ERR027458,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0
ERR027459,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0
ERR027460,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
ERR027461,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
ERR027462,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0


In [61]:


# Training rows: the samples whose IDs are in the train ID list.
train = binarized_final_df.loc[final_tbportals_train_genomes_ids, :]
train.shape


(1223, 52683)

In [77]:
# Persist the training split as TSV. The separator is passed by keyword:
# newer pandas releases deprecate positional arguments after the path.
train.to_csv("../data/processed/final.train.tsv", sep="\t")
train.head()

Unnamed: 0_level_0,NC000962_3.22,NC000962_3.434,NC000962_3.524,NC000962_3.645,NC000962_3.648,NC000962_3.654,NC000962_3.666,NC000962_3.675,NC000962_3.678,NC000962_3.693,...,NC000962_3.4410251,NC000962_3.4410260,NC000962_3.4410272,NC000962_3.4410278,NC000962_3.4410728,NC000962_3.4410850,NC000962_3.4411016,NC000962_3.4411170,NC000962_3.4411327,Resistance_Status
SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SRR10525336,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0
SRR10380004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0
SRR6807701,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0
SRR11033700,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0
SRR1163101,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0


In [60]:


# Test rows: the samples whose IDs are in the test ID list.
test = binarized_final_df.loc[final_htbc_test_genomes_ids, :]
test.shape


(501, 52683)

In [78]:
# Persist the test split as TSV. The separator is passed by keyword:
# newer pandas releases deprecate positional arguments after the path.
test.to_csv("../data/processed/final.test.tsv", sep="\t")
test.head()

Unnamed: 0_level_0,NC000962_3.22,NC000962_3.434,NC000962_3.524,NC000962_3.645,NC000962_3.648,NC000962_3.654,NC000962_3.666,NC000962_3.675,NC000962_3.678,NC000962_3.693,...,NC000962_3.4410251,NC000962_3.4410260,NC000962_3.4410272,NC000962_3.4410278,NC000962_3.4410728,NC000962_3.4410850,NC000962_3.4411016,NC000962_3.4411170,NC000962_3.4411327,Resistance_Status
SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ERR3335735,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0
SRR8552929,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0
ERR067629,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0
ERR067714,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0
SRR5065314,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0


In [63]:
# Separate the label column from the feature columns of the training frame.
y_train = train['Resistance_Status']
X_train = train.drop(columns='Resistance_Status')


In [64]:
X_train

Unnamed: 0_level_0,NC000962_3.22,NC000962_3.434,NC000962_3.524,NC000962_3.645,NC000962_3.648,NC000962_3.654,NC000962_3.666,NC000962_3.675,NC000962_3.678,NC000962_3.693,...,NC000962_3.4410242,NC000962_3.4410251,NC000962_3.4410260,NC000962_3.4410272,NC000962_3.4410278,NC000962_3.4410728,NC000962_3.4410850,NC000962_3.4411016,NC000962_3.4411170,NC000962_3.4411327
SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SRR10525336,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SRR10380004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SRR6807701,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SRR11033700,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SRR1163101,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR10379920,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SRR11033653,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SRR1163029,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SRR10379888,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [65]:
# Separate the label column from the feature columns of the test frame.
y_test = test['Resistance_Status']
X_test = test.drop(columns='Resistance_Status')


In [69]:
# NOTE: The labels are stored as float64 (0.0 / 1.0). An integer dtype would be more natural for class labels, but this works for now.

y_test

SampleID
ERR3335735    1.0
SRR8552929    1.0
ERR067629     1.0
ERR067714     1.0
SRR5065314    1.0
             ... 
SRR5065255    0.0
SRR5065290    0.0
SRR5065284    0.0
ERR067693     0.0
SRR5065384    0.0
Name: Resistance_Status, Length: 501, dtype: float64

In [22]:

# We have relied on the manual split (the train/test ID lists loaded from the
# JSON file) instead of a random stratified split:

#X_train, X_test, y_train, y_test = train_test_split(X, y,
#                                                    stratify=y,
#                                                    train_size=0.7,
#                                                    test_size=0.3,
#                                                    random_state=100)

# Every model cell below passes `X` and `y` to model_performance_metrics()
# (they feed the cross-validation run when detailed=True), but nothing in the
# notebook defines them, so a fresh-kernel run fails with NameError.
# NOTE(review): assumes X / y are meant to be all samples in the manual split
# - confirm.
X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])


In [70]:
from sklearn.svm import LinearSVC

# Linear SVM with library-default hyper-parameters, fitted on the training split.
model_svm = LinearSVC().fit(X_train, y_train)

# Predict on the held-out split.
y_pred = model_svm.predict(X_test)

# The feature-importance section is switched off for this model.
model_performance_metrics(
    model_svm, X, X_test, X_train, y, y_test, y_pred,
    show_feature_importances=False,
)


Accuracy: 0.6606786427145709
Mean Absolute Error: 0.3393213572854291
Mean Squared Error: 0.3393213572854291
Root Mean Squared Error: 0.5825129674826383
-------------------------------------------------------------------------------- 



In [71]:
from sklearn.ensemble import RandomForestClassifier

# Shallow random forest with a fixed seed.
model_rf = RandomForestClassifier(
    n_estimators=100,
    random_state=100,
    max_depth=5,
    min_samples_leaf=50,
    min_samples_split=50,
).fit(X_train, y_train)

# Predict on the held-out split and report metrics plus feature importances.
y_pred = model_rf.predict(X_test)

model_performance_metrics(model_rf, X, X_test, X_train, y, y_test, y_pred)

Accuracy: 0.6726546906187625
Mean Absolute Error: 0.3273453093812375
Mean Squared Error: 0.3273453093812375
Root Mean Squared Error: 0.5721409873285058
-------------------------------------------------------------------------------- 

=== Feature Importances ===
                    importance
NC000962_3.1189561    0.023672
NC000962_3.3502231    0.019193
NC000962_3.2866569    0.019173
NC000962_3.1638235    0.016459
NC000962_3.2592310    0.016125
...                        ...
NC000962_3.1439819    0.000000
NC000962_3.1439822    0.000000
NC000962_3.1439834    0.000000
NC000962_3.1439836    0.000000
NC000962_3.4411327    0.000000

[52682 rows x 1 columns]


In [72]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees with a fixed seed.
model_gb = GradientBoostingClassifier(
    n_estimators=100, random_state=100, max_depth=5
).fit(X_train, y_train)

# Predict on the held-out split and report metrics plus feature importances.
y_pred = model_gb.predict(X_test)

model_performance_metrics(model_gb, X, X_test, X_train, y, y_test, y_pred)

Accuracy: 0.6726546906187625
Mean Absolute Error: 0.3273453093812375
Mean Squared Error: 0.3273453093812375
Root Mean Squared Error: 0.5721409873285058
-------------------------------------------------------------------------------- 

=== Feature Importances ===
                    importance
NC000962_3.2439519    0.109826
NC000962_3.2591829    0.052245
NC000962_3.1533004    0.051178
NC000962_3.2300546    0.046073
NC000962_3.1090789    0.033321
...                        ...
NC000962_3.1441426    0.000000
NC000962_3.1441429    0.000000
NC000962_3.1441533    0.000000
NC000962_3.1441545    0.000000
NC000962_3.4411327    0.000000

[52682 rows x 1 columns]


In [73]:
from sklearn.naive_bayes import GaussianNB
# Alternative kept for reference:
# from sklearn.naive_bayes import BernoulliNB
# model_nb= BernoulliNB()

# Gaussian naive Bayes with library-default settings.
model_nb = GaussianNB().fit(X_train, y_train)

y_pred = model_nb.predict(X_test)

# The feature-importance section is switched off for this model.
model_performance_metrics(
    model_nb, X, X_test, X_train, y, y_test, y_pred,
    show_feature_importances=False,
)

Accuracy: 0.5209580838323353
Mean Absolute Error: 0.47904191616766467
Mean Squared Error: 0.47904191616766467
Root Mean Squared Error: 0.6921285402059827
-------------------------------------------------------------------------------- 



In [74]:
from sklearn.neural_network import MLPClassifier

# Small multi-layer perceptron (hidden layers of 5 and 2 units), L-BFGS solver, fixed seed.
model_mlp = MLPClassifier(
    solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1
).fit(X_train, y_train)

y_pred = model_mlp.predict(X_test)

# The feature-importance section is switched off for this model.
model_performance_metrics(
    model_mlp, X, X_test, X_train, y, y_test, y_pred,
    show_feature_importances=False,
)

Accuracy: 0.6606786427145709
Mean Absolute Error: 0.3393213572854291
Mean Squared Error: 0.3393213572854291
Root Mean Squared Error: 0.5825129674826383
-------------------------------------------------------------------------------- 



In [75]:
from xgboost import XGBClassifier

# XGBoost classifier with a small learning rate and a fixed seed.
model_xgb = XGBClassifier(learning_rate=0.01, random_state=1)
model_xgb.fit(X_train, y_train)

# Predict on the held-out split and report metrics plus feature importances.
y_pred = model_xgb.predict(X_test)

model_performance_metrics(
    model_xgb, X, X_test, X_train, y, y_test, y_pred,
)

Accuracy: 0.6227544910179641
Mean Absolute Error: 0.3772455089820359
Mean Squared Error: 0.3772455089820359
Root Mean Squared Error: 0.6142031496028296
-------------------------------------------------------------------------------- 

=== Feature Importances ===
                    importance
NC000962_3.2439519    0.082726
NC000962_3.1090783    0.045700
NC000962_3.1090789    0.041056
NC000962_3.1533004    0.038588
NC000962_3.1093928    0.031613
...                        ...
NC000962_3.1439767    0.000000
NC000962_3.1439768    0.000000
NC000962_3.1439776    0.000000
NC000962_3.1439777    0.000000
NC000962_3.4411327    0.000000

[52682 rows x 1 columns]


In [76]:
from sklearn.ensemble import (
    GradientBoostingClassifier,
    RandomForestClassifier,
    StackingClassifier,
)
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier

# Base learners of the stack, all with library-default hyper-parameters.
estimators = [
    ('rf', RandomForestClassifier()),
    ('gb', GradientBoostingClassifier()),
    ('svm', LinearSVC()),
    ('mlp', MLPClassifier()),
    ('nb', GaussianNB()),
    ('xgb', XGBClassifier()),
]

# A random forest combines the base learners' outputs.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so
# fitting is presumably very slow on this feature matrix.
model_se = StackingClassifier(
    estimators=estimators,
    final_estimator=RandomForestClassifier(),
)
model_se.fit(X_train, y_train)

y_pred = model_se.predict(X_test)

model_performance_metrics(
    model_se, X, X_test, X_train, y, y_test, y_pred,
    show_feature_importances=False,
)

KeyboardInterrupt: 

## Grid search for each model