# Analysis: Machine Learning Approaches
**Primary Analyst:** Alvin Jeffery

In [159]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from sklearn import preprocessing, model_selection
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import make_scorer, confusion_matrix, f1_score, roc_auc_score

from hpsklearn import HyperoptEstimator, standard_scaler
from hpsklearn import random_forest, extra_trees, gradient_boosting, xgboost_classification
from hyperopt import tpe, hp

### Load Data

In [103]:
train = pd.read_csv('../data/train_imputed.csv').drop(columns='Unnamed: 0')
valid = pd.read_csv('../data/valid_imputed.csv').drop(columns='Unnamed: 0')
test = pd.read_csv('../data/test_imputed.csv').drop(columns='Unnamed: 0')

In [104]:
y_train = train.pop('readmit_30d')
X_train = train.copy()
X_train.shape

(12912, 208)

In [105]:
y_valid = valid.pop('readmit_30d')
X_valid = valid.copy()
X_valid.shape

(3992, 208)

In [106]:
y_test = test.pop('readmit_30d')
X_test = test.copy()
X_test.shape

(4229, 208)

### Coerce Categories (`sex` and `race`)

In [107]:
X_train = pd.get_dummies(X_train, columns=['sex', 'race'])
X_train.head()

Unnamed: 0,stay_length,n_transfers,cpt_anesthesia,cpt_eval_manage,cpt_expired,cpt_medicine,cpt_modifier,cpt_path_lab,cpt_radiology,cpt_surgery,...,sex_F,sex_M,sex_U,race_A,race_B,race_H,race_I,race_N,race_U,race_W
0,4,2,0,6,4,16,9,24,9,2,...,1,0,0,0,0,0,0,0,0,1
1,3,3,0,2,1,16,15,39,1,0,...,1,0,0,0,0,0,0,0,0,1
2,1,1,0,3,0,7,4,13,1,0,...,1,0,0,0,0,0,0,0,0,1
3,4,2,0,6,1,4,8,47,2,2,...,1,0,0,0,0,0,0,0,0,1
4,1,1,0,1,1,2,3,21,1,0,...,1,0,0,0,0,0,0,0,0,1


In [108]:
X_valid = pd.get_dummies(X_valid, columns=['sex', 'race'])
X_test = pd.get_dummies(X_test, columns=['sex', 'race'])

## Hyperparameter Tuning with Hyperopt

In [109]:
# convert to numpy arrays
X_train = X_train.values
y_train = y_train.values

In [124]:
X_valid = X_valid.values
y_valid = y_valid.values

AttributeError: 'numpy.ndarray' object has no attribute 'values'

In [161]:
seed = 42
jobs = 3

clf = hp.pchoice('readmissions', 
          [(0.25, random_forest('readmissions.random_forest', 
                                n_jobs=jobs, random_state=seed)),
           (0.25, gradient_boosting('readmission.gbc',
                                random_state=seed)), # n_jobs not an argument
           (0.25, xgboost_classification('readmissions.xgb',
                                random_state=seed)), # n_jobs not an argument in hyperopt (unlike sklearn)
           (0.25, extra_trees('readmissions.extra_trees',
                                n_jobs=jobs, random_state=seed))])

In [198]:
timeout = None

estim = HyperoptEstimator(algo=tpe.suggest,
                         preprocessing=[standard_scaler('standard_scaler')],
                         classifier=clf,
                         max_evals=3,
                         trial_timeout=timeout)

In [None]:
estim.fit(X_train, y_train)

In [None]:
model = estim.best_model()
model

In [None]:
predictions = estim.predict(X_valid)

score = estim.score(X_valid, y_valid)
score

In [None]:
f1_score(y_valid, predictions)

In [None]:
roc_auc_score(y_valid, predictions)

In [None]:
confusion_matrix(y_valid, predictions)

## Test Set Performance (Performed Only Once)

In [None]:
## refit using best model but on entire train & validation sets together

## Interpretation

In [None]:
def feature_importance(rf, train, pred_cols):
    importances = rf.feature_importances_
    std = np.std([tree.feature_importances_ for tree in rf.estimators_],
                 axis=0)
    indices = np.argsort(importances)[::-1]
    predictors = train[pred_cols].columns
    X = train[pred_cols]

    # Print the feature ranking
    print("Feature ranking:")
    #for f in range(X.shape[1]): # all features
    for f in range(5): # top 5 only
        print("%d. %s (%f)" % (f + 1, predictors[indices[f]], importances[indices[f]]))
    
    return X, importances, std, predictors, indices

In [None]:
x, importances, std, predictors, indices = feature_importance(rf=rf, train=X_train, pred_cols=X_train.columns)

plt.figure(figsize=(20,10)); plt.title("Feature importances")
plt.bar(range(x.shape[1]), importances[indices], color="r", yerr=std[indices], align="center")
plt.xticks(range(x.shape[1]), predictors[indices], rotation=80); plt.xlim([-1, x.shape[1]])
plt.show()

In [None]:
from sklearn.ensemble.partial_dependence import partial_dependence, plot_partial_dependence

In [None]:
gbc = GradientBoostingClassifier(n_estimators=1000, 
                                 max_features=100, 
                                 random_state=42,
                                 warm_start=True)
gbc.fit(X_train, y_train)

In [None]:
predictions = gbc.predict(X_valid)
f1_score(y_valid, predictions)

In [None]:
roc_auc_score(y_valid, predictions)

In [None]:
plt.rcParams["figure.figsize"] = (15, 15)

var = 'age'
idx = X_train.columns.get_loc(var)

my_plot, ax = plot_partial_dependence(gbc, features=[idx], X=X_train)
axes = plt.gca()
axes.set_xlim([min(X_train[var]), max(X_train[var])])
my_plot.suptitle('Partial dependence on ' + var)