In [51]:
import sys
import pandas as pd
import numpy as np

# Define a random seed for reproducibility.
# This seeds NumPy's global random number generator only, and does so before
# the remaining libraries below are imported.
seed = 0
np.random.seed(seed)

import sklearn
import xgboost
import keras
import tensorflow as tf
import ast
import os
from joblib import dump, load


# Report the interpreter and library versions this notebook runs against,
# one "<name>: <version>" line per entry, in the order listed here.
_version_report = (
    ('Python', sys.version),
    ('Pandas', pd.__version__),
    ('Numpy', np.__version__),
    ('Sklearn', sklearn.__version__),
    ('XGBoost', xgboost.__version__),
    ('Keras', keras.__version__),
)
for _package, _version in _version_report:
    print('{}: {}'.format(_package, _version))

Python: 3.6.10 |Anaconda, Inc.| (default, May  7 2020, 19:46:08) [MSC v.1916 64 bit (AMD64)]
Pandas: 1.0.3
Numpy: 1.18.1
Sklearn: 0.22.1
XGBoost: 1.1.1
Keras: 2.3.1


In [52]:
# Import the processed dataset; `df` keeps the frame exactly as read from disk.
df = pd.read_csv('../data/processed/NSQIP_Clean2.csv')

# Work on a separate frame without the 'index.1', 'index' and 'Unnamed: 0'
# columns. drop() returns a new DataFrame, so `df` itself is left unchanged.
data = df.drop(columns=['index.1', 'index', 'Unnamed: 0'])

# Replace missing AGE values with the median age.
# The filled column is assigned back explicitly: calling
# replace(..., inplace=True) on the data['AGE'] selection is not guaranteed to
# write through to `data` (it does not under pandas Copy-on-Write), in which
# case the dropna() below would silently discard those rows instead.
data['AGE'] = data['AGE'].fillna(data['AGE'].median())

# Drop every row that still has a missing value in any column.
data = data.dropna()

In [53]:
# Split the cleaned data into training and held-out testing sets
from sklearn.model_selection import train_test_split

# READMISSION1 is the target; every other column is used as a predictor
labels = data['READMISSION1']
features = data.drop(columns=['READMISSION1'])

# An integer test_size is an absolute number of test rows (300), not a fraction;
# random_state is fixed so the split is the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    features,
    labels,
    test_size=300,
    random_state=0,
)

In [54]:
def find_csv_files(root):
    """Return the paths of all CSV files found under `root`, recursively.

    Parameters
    ----------
    root : str
        Directory to walk. A directory that does not exist yields an empty list.

    Returns
    -------
    list of str
        One `os.path.join(dirpath, filename)` entry per file whose name ends
        with '.csv', in sorted traversal order.
    """
    matches = []
    for dirpath, dirnames, filenames in os.walk(root):
        # Sorting in place makes os.walk descend in a fixed order, so the
        # result does not depend on the filesystem's listing order.
        dirnames.sort()
        for file in sorted(filenames):
            # Match on the extension only; a substring test would also pick up
            # names such as 'results.csv.bak'.
            if file.endswith('.csv'):
                matches.append(os.path.join(dirpath, file))
    return matches


# load a list of all optimization results files
dirName = '../reports/optimization/'

fileList = find_csv_files(dirName)
dirList = list()

In [55]:
def load_model(name, hyperparameters):
    """Build an unfitted classifier from a model name and its hyperparameters.

    Parameters
    ----------
    name : str
        One of 'AdaBoost', 'DecisionTree', 'KMeans', 'MLP', 'RandomForest',
        'SVC' or 'XGBoost'. Note that 'KMeans' selects KNeighborsClassifier.
    hyperparameters : dict
        Keyword arguments forwarded to the classifier's constructor. The
        dictionary itself is not modified.

    Returns
    -------
    object
        The instantiated classifier. For 'SVC', `probability=True` is always
        set, overriding any 'probability' entry in `hyperparameters`.

    Raises
    ------
    ValueError
        If `name` is not one of the supported model names.
    """
    import importlib

    # Model name -> (module, class). The module is imported only once the name
    # has been validated, so only the library that is actually needed is loaded.
    registry = {
        'AdaBoost': ('sklearn.ensemble', 'AdaBoostClassifier'),
        'DecisionTree': ('sklearn.tree', 'DecisionTreeClassifier'),
        'KMeans': ('sklearn.neighbors', 'KNeighborsClassifier'),
        'MLP': ('sklearn.neural_network', 'MLPClassifier'),
        'RandomForest': ('sklearn.ensemble', 'RandomForestClassifier'),
        'SVC': ('sklearn.svm', 'SVC'),
        'XGBoost': ('xgboost', 'XGBClassifier'),
    }

    # Fail with a clear error instead of printing a message and then hitting
    # an unbound `model` variable at the return statement.
    if name not in registry:
        raise ValueError(
            'Unknown model name {!r}; expected one of {}'.format(name, sorted(registry))
        )

    module_name, class_name = registry[name]
    model_class = getattr(importlib.import_module(module_name), class_name)

    # Copy so the caller's dictionary is left untouched.
    params = dict(hyperparameters)
    if name == 'SVC':
        # Merging through the dict avoids a duplicate-keyword TypeError when
        # the stored hyperparameters already contain 'probability'.
        params['probability'] = True

    return model_class(**params)

In [56]:
import pickle

# Directory that receives one pickled model per model name; created on demand
# so the first run does not fail when it is missing.
new_dir = '../models/'
os.makedirs(new_dir, exist_ok=True)

for file in fileList:
    # load hyperparameter optimization files
    results = pd.read_csv(file)

    # The model name is the first path component below dirName, i.e. the
    # sub-directory of the optimization folder that holds this file. Using
    # relpath avoids depending on how many '/' separators dirName contains
    # or on which separator the operating system uses.
    model_name = os.path.relpath(file, dirName).split(os.sep)[0]

    new_results = results.copy()

    # The 'hyperparameters' column stores dictionaries as text; parse them back
    new_results['hyperparameters'] = new_results['hyperparameters'].map(ast.literal_eval)

    # Sort with best values on top
    new_results = new_results.sort_values('score', ascending=False).reset_index(drop=True)

    # Use best hyperparameters to create a model
    hyperparameters = new_results.loc[0, 'hyperparameters']

    # load the appropriate model and fit on training data
    model = load_model(model_name, hyperparameters)
    model.fit(X_train, Y_train)

    # The output name depends only on model_name, so when several result files
    # share a model name the one processed last overwrites the earlier ones.
    filename = new_dir + model_name + '.sav'

    # save the trained model; the with-block closes the file even if
    # pickling fails part-way through
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)