In [1]:
import sys
import pandas as pd
import numpy as np

# Define a random seed for reproducibility: this seeds NumPy's legacy global
# RNG, which the np.random.permutation calls later in the notebook draw from.
seed = 0
np.random.seed(seed)

import sklearn
import xgboost
import keras
import tensorflow as tf

import os
import pickle


# Report the interpreter and library versions in use, one per line
print('\n'.join([
    'Python: {}'.format(sys.version),
    'Pandas: {}'.format(pd.__version__),
    'Numpy: {}'.format(np.__version__),
    'Sklearn: {}'.format(sklearn.__version__),
    'XGBoost: {}'.format(xgboost.__version__),
    'Keras: {}'.format(keras.__version__),
]))

                the kernel may be left running.  Please let us know
                about your system (bitness, Python, etc.) at
                ipython-dev@scipy.org
  ipython-dev@scipy.org""")
Using TensorFlow backend.


Python: 3.6.10 |Anaconda, Inc.| (default, May  7 2020, 19:46:08) [MSC v.1916 64 bit (AMD64)]
Pandas: 1.0.3
Numpy: 1.18.1
Sklearn: 0.22.1
XGBoost: 1.1.1
Keras: 2.3.1


In [2]:
def load_train_and_test(path):

    """Loads training features, training labels, testing features, and testing labels

    Parameters:
        path (str) -- a single directory path containing all four datasets
            ('train_features.csv', 'train_labels.csv', 'test_features.csv',
            'test_labels.csv'); a trailing separator is optional

    Returns:
        tuple -- (train_features, train_labels, test_features, test_labels).
            Features are DataFrames indexed by the first CSV column; labels
            are flattened to 1-D NumPy arrays.
    """

    def read_split(filename):
        # os.path.join only inserts a separator when `path` lacks one, so
        # '../data/split' and '../data/split/' resolve to the same file
        return pd.read_csv(os.path.join(path, filename), index_col=0)

    train_features = read_split('train_features.csv')
    train_labels = read_split('train_labels.csv')
    test_features = read_split('test_features.csv')
    test_labels = read_split('test_labels.csv')

    # .values.ravel() turns each single-column label frame into a flat array
    return train_features, train_labels.values.ravel(), test_features, test_labels.values.ravel()

# Read the pre-split train/test sets from the relative ../data/split/ directory
X_train, Y_train, X_test, Y_test = load_train_and_test(path='../data/split/')

In [3]:
from sklearn.metrics import roc_auc_score

def permutation_importance(X, y, model):
    """Score each column of X by the ROC AUC change caused by shuffling it.

    Parameters:
        X -- DataFrame of features; its columns are shuffled one at a time
        y -- labels passed to roc_auc_score as the ground truth
        model -- object exposing predict_proba; column 1 of its output is
            used as the predicted score

    Returns:
        dict -- column name -> (ROC AUC with that column shuffled) minus
            (baseline ROC AUC on the unshuffled X)
    """
    reference_auc = roc_auc_score(y, model.predict_proba(X)[:, 1])
    print('Baseline ROC AUC score: {}'.format(reference_auc))

    auc_deltas = {}
    for column in X.columns:
        # Start from a fresh copy so only this one column differs from X
        shuffled = X.copy()
        shuffled.loc[:, column] = np.random.permutation(shuffled.loc[:, column])
        shuffled_scores = model.predict_proba(shuffled)[:, 1]
        auc_deltas[column] = roc_auc_score(y, shuffled_scores) - reference_auc

    return auc_deltas

In [4]:
# load all optimized models from the models folder

dirName = '../models/'

fileList = list()
dirList = list()  # never populated in this cell; kept in case other cells reference it

for (dirpath, dirnames, filenames) in os.walk(dirName):
    for file in filenames:
        # match on the extension itself; a substring test would also pick up
        # names such as 'model.sav.bak' or 'notes.save'
        if file.endswith('.sav'):
            fileList.append(os.path.join(dirpath, file))

modelList = list()

for file in fileList:
    # NOTE(review): pickle.load can execute arbitrary code -- only load .sav
    # files that come from a trusted source.
    # The `with` block closes each file handle once the model is deserialized.
    with open(file, 'rb') as model_file:
        model = pickle.load(model_file)
    modelList.append(model)

In [5]:
# run permutation analysis for each variable and each model

permutations = []

for model in modelList:
    perms = permutation_importance(X_train.copy(), Y_train.copy(), model)
    permutations.append(perms)

# Label each row with the model file's base name minus its extension.
# str.rstrip('.sav') removes any trailing '.', 's', 'a' or 'v' characters rather
# than the literal suffix (e.g. 'NaiveBayes.sav' -> 'NaiveBaye'), and
# split('/') misses backslash separators, so use os.path helpers instead.
scores = pd.DataFrame(
    permutations,
    index=[os.path.splitext(os.path.basename(file))[0] for file in fileList],
)
scores

Baseline ROC AUC score: 0.637720083557147
Baseline ROC AUC score: 0.7054878375379718
Baseline ROC AUC score: 0.5729671204156432
Baseline ROC AUC score: 1.0
Baseline ROC AUC score: 0.77980549242859
Baseline ROC AUC score: 0.7178951556749229
Baseline ROC AUC score: 0.9966332285043118


Unnamed: 0,SEX_male,RACE_NEW_American_Indian_or_Alaska_Native,RACE_NEW_Asian,RACE_NEW_Black_or_African_American,RACE_NEW_Native_Hawaiian_or_Pacific_Islander,RACE_NEW_Unknown_Not_Reported,RACE_NEW_White,ETHNICITY_HISPANIC_N,ETHNICITY_HISPANIC_U,ETHNICITY_HISPANIC_Y,...,TOTHLOS,PRSODM,PRBUN,PRCREAT,PRALBUM,PRPLATE,PRPTT,PRINR,PRPT,BMI
AdaBoost,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.015487,-0.047391,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DecisionTree,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.003772,-0.074758,0.0,0.0,0.0,0.0,-0.061141,0.0,0.0,0.0
GradientBoosting,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.070817,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MLP,-0.014776,-0.000604,-0.000245,-0.013528,0.0,-0.001415574,-0.024157,-0.007032,-0.001132,-0.001446,...,-0.025381,-0.091997,-0.032505,-0.073319,-0.030332,-0.033033,-0.019412,-0.011791,-0.009833,-0.058788
RandomForest,-0.003244,0.0,0.000482,0.000191,0.0,1.110223e-16,-0.001905,-0.000482,0.0,0.0,...,0.007216,-0.03982,-0.016176,-0.001569,-0.005555,-0.013919,-0.019443,-0.007254,0.0,-0.012342
SVC,0.00176,-0.000765,0.005127,-0.001201,-0.000145,-0.002815845,-8.4e-05,0.004752,-0.001201,0.002173,...,-0.005724,-0.008011,-0.034632,0.001087,0.001064,-0.003068,-0.015571,-0.005746,-0.000421,0.002318
XGBoost,-0.003397,-8e-06,-0.000551,-0.001645,0.0,-0.0007345683,-0.000298,-0.00039,-0.000551,3.8e-05,...,-0.014844,-0.019481,-0.012801,-0.013804,-0.002487,-0.032956,-0.018448,-0.006175,-3.1e-05,-0.065813


In [6]:
# Total the permutation scores of every optimized model, per variable.

# Note - each score is (ROC AUC with the column shuffled) - (baseline ROC AUC), so a more negative total means the models were hurt more by shuffling this column (i.e. this column is an important variable)
# Note - a positive total means the models scored higher with the column shuffled, which may indicate that including this variable decreases performance.

totals = scores.sum(axis='index')
totals.sort_values(ascending=True)

RETURNOR_Yes                                             -0.384822
PRSODM                                                   -0.281458
BMI                                                      -0.134625
PRPTT                                                    -0.134017
TOTHLOS                                                  -0.128810
PRBUN                                                    -0.096114
PRCREAT                                                  -0.087605
PRPLATE                                                  -0.082976
OPTIME                                                   -0.063349
AGE                                                      -0.037792
PRALBUM                                                  -0.037310
PRINR                                                    -0.030967
RACE_NEW_White                                           -0.026444
ASACLAS_3_Severe_Disturb                                 -0.023277
HYPERMED_Yes                                             -0.02

In [7]:
# Show the 10 most important variables, i.e. the 10 most negative totals
totals.sort_values().head(10)

RETURNOR_Yes   -0.384822
PRSODM         -0.281458
BMI            -0.134625
PRPTT          -0.134017
TOTHLOS        -0.128810
PRBUN          -0.096114
PRCREAT        -0.087605
PRPLATE        -0.082976
OPTIME         -0.063349
AGE            -0.037792
dtype: float64