# The ML Pipeline for a Model to Predict Length of Hospital Delivery Stay

In [11]:
import pandas as pd
import numpy as np
import math
from matplotlib import pyplot as plt
import matplotlib
from math import ceil
import pickle

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import PredefinedSplit

from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import ParameterGrid

from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import xgboost

# Gather Data

In [12]:
# specifying data types for the columns to maintain formatting from original data
data_types = {
    'hospital_service_area': object, 
    'hospital_county': object,
    'operating_certificate_number': object, 
    'permanent_facility_id': object,
    'facility_name': object, 
    'age_group': object, 
    'zip_code_3_digits': object, 
    'gender': object, 
    'race': object,
    'ethnicity': object, 
    'payment_typology_1': object, 
    'payment_typology_2': object,
    'payment_typology_3': object, 
    'length_of_stay': int
}

In [13]:
all_visits = pd.read_csv('../data/planned_deliveries.csv', dtype=data_types)
all_visits = all_visits.loc[:, all_visits.columns != 'Unnamed: 0']
y = all_visits['length_of_stay']
X = all_visits.loc[:, all_visits.columns != 'length_of_stay']

# Compute Baseline Score

In [14]:
mean_length_of_stay = np.mean(y)
median_length_of_stay = np.median(y)
print('Mean length of stay:', mean_length_of_stay)
print('Median length of stay:', median_length_of_stay)

Mean length of stay: 2.3057860029780897
Median length of stay: 2.0


#### R^2 [dimensionless]  
**Pretty sure this is unnecessary to do though since by definition R^2 should = 0 for the expected (average) guess of y**

In [15]:
y_pred_mean = pd.Series([mean_length_of_stay]*len(y))

In [16]:
baseline_r2 = r2_score(y, y_pred_mean)
print('Baseline R^2 [dimensionless]:', baseline_r2)

Baseline R^2 [dimensionless]: 0.0


# Split, Train, and Cross Validate - R^2 Evaluation Metric

### Set up functions for automated pipeline

In [17]:
def stratified_continuous_split(X:pd.DataFrame, y:pd.Series, train_size:float, val_size:float, test_size:float, random_state:int):
    '''
    Performs a stratified split of inputted data (with respect to y) into a training set, validation set, and test set to specified percentages 
    of the data using verstack's scsplit and performs basic error checking.

    Parameters:
    - X: a 2D pandas DataFrame, the feature matrix
    - y: a 1D pandas Series, the target variable matrix matching X
    - train_size: a float between 0 and 1, the percentage of X which should be training data
    - val_size: a float between 0 and 1, the percentage of X which should be reserved for validation
    - test_size: a float between 0 and 1, the percentage of X which should be reserved for final testing
    - random_state: an int, the random state to split with
    Note: The sum of train_size + val_size + test_size must be 1.0 (100% of X).

    Returns:
    - (X_train) a 2D pandas DataFrame, the feature matrix of training data
    - (y_train) a 1D pandas Series, the target variable matrix for training data
    - (X_val) a 2D pandas DataFrame, the feature matrix of validation data
    - (y_val) a 1D pandas Series, the target variable matrix for validation data
    - (X_test) a 2D pandas DataFrame, the feature matrix of testing data
    - (y_test) a 1D pandas Series, the target variable matrix for testing data

    Raises:
    - ValueError for invalid input
    '''
    from verstack.stratified_continuous_split import scsplit
    
    if ((train_size + val_size + test_size) != 1):
        raise ValueError('Your train_size + val_size + test_size must add up to 1 (100%)!')
    if (not isinstance(random_state, int)):
        raise ValueError('Your random_state must be an int!')

    X_train, X_other, y_train, y_other = scsplit(X, y, stratify=y, test_size=(1-train_size), random_state=random_state)
    
    X_len = X.shape[0]
    test_percent_of_other = (test_size * X_len)/(X_len - (train_size * X_len))
    X_other = X_other.reset_index(drop=True)
    y_other = y_other.reset_index(drop=True)
    
    X_val, X_test, y_val, y_test = scsplit(X_other, y_other, stratify=y_other, test_size=test_percent_of_other, random_state=random_state)

    # basic error checking to check that split returned train, val, and test of expected sizes
    train_count_low = (int)(train_size * X_len)
    train_count_high = ceil(train_size * X_len)
    val_count_low = (int)(val_size * X_len)
    val_count_high = ceil(val_size * X_len)
    test_count_low = (int)(test_size * X_len)
    test_count_high = ceil(test_size * X_len)
    
    Xtrain_fin = X_train.shape[0]
    ytrain_fin = y_train.shape[0]
    Xval_fin = X_val.shape[0]
    yval_fin = y_val.shape[0]
    Xtest_fin = X_test.shape[0]
    ytest_fin = y_test.shape[0]
    
    if not (((Xtrain_fin == train_count_low) or (Xtrain_fin == train_count_high)) and ((ytrain_fin == train_count_low) or (ytrain_fin == train_count_high))):
        raise ValueError(f'Training set size should be approx. {train_size * X_len}, instead is: {X_train.shape[0]}')
    if not (((Xval_fin == val_count_low) or (Xval_fin == val_count_high)) and ((yval_fin == val_count_low) or (yval_fin == val_count_high))):
        raise ValueError(f'Validation set size should be approx. {val_size * X_len}, instead is: {X_val.shape[0]}')
    if not (((Xtest_fin == test_count_low) or (Xtest_fin == test_count_high)) and ((ytest_fin == test_count_low) or (ytest_fin == test_count_high))):
        raise ValueError(f'Test set size should be approx. {test_size * X_len}, instead is: {X_test.shape[0]}')

    return X_train, y_train, X_val, y_val, X_test, y_test

In [18]:
def MLpipe_Stratified_Continous_r2(X, y, preprocessor, ML_algo, param_grid, xgb=False):
    '''
    This function splits the data to train, validation, and test (60/20/20).
    The R^2 is maximized in cross-validation.
    
    This function:
    1. Loops through 10 different random states
    2. Splits the data 60/20/20.
    3. Fits a model with the predefined Preprocessor, trains the model with each hyperparameter combination in param_grid
    4. Calculates the model's error on the test set on the model with the best hyperparameter combinations in param_grid
    5. Returns a list of 10 test scores and 10 best models
    '''
    
    # lists to be returned
    test_scores = []
    best_models = []
    test_Xs = []
    test_ys = []

    nr_states = 10
    for i in range(nr_states):
        rs = 28 * i
        print('Random State:', rs)

        # split
        X_train, y_train, X_val, y_val, X_test, y_test = stratified_continuous_split(X, y, train_size=0.6, val_size=0.2, test_size=0.2, random_state=rs)

        # preprocess
        X_train_prep = preprocessor.fit_transform(X_train)
        X_val_prep = preprocessor.transform(X_val)
        X_test_prep = preprocessor.transform(X_test)

        # final preprocess with Standard Scaler so that I can use the coefficients of linear models as global importance metrics
        final_scaler = StandardScaler()
        X_train_prep = final_scaler.fit_transform(X_train_prep)
        X_val_prep = final_scaler.transform(X_val_prep)
        X_test_prep = final_scaler.transform(X_test_prep)

        test_Xs.append(X_test_prep)
        test_ys.append(y_test)

        # train and perform cross-validation        
        models = []
        val_scores = []
        for p in range(len(ParameterGrid(param_grid))):
            params = ParameterGrid(param_grid)[p]
            # print(' ',params) # TEMPORARY

            if (xgb):
                clf = ML_algo.fit(X_train_prep, y_train, early_stopping_rounds=50, eval_set=[(X_val_prep, y_val)], verbose=False)
            else:
                clf = ML_algo.fit(X_train_prep, y_train)
            models.append(clf)
            y_val_pred = clf.predict(X_val_prep)
            val_scores.append(r2_score(y_val, y_val_pred))
            # print(' Validation R^2:', val_scores[-1]) # TEMPORARY

        # save results
        print('    Best Model Parameters:', ParameterGrid(param_grid)[np.argmax(val_scores)])
        print('    Validation Set R^2:', np.max(val_scores))
        best_model_this_rs = models[np.argmax(val_scores)]
        best_models.append(best_model_this_rs)
        y_test_pred = best_model_this_rs.predict(X_test_prep)
        test_score = r2_score(y_test, y_test_pred)
        test_scores.append(test_score)
        print('    Baseline R^2 (test set):', r2_score(y_test, pd.Series([2]*len(y_test))))
        print('    Test Set R^2:', test_score)
        
    return test_scores, best_models, test_Xs, test_ys

### Run and Cross Validate Several Models - R^2

In [19]:
def unique_non_nans(X, ftr):
    '''
    Determines the unique, non-nan values of a certain feature in a feature matrix.
    Does not error check that ftr is in fact a column in X; this is the responsibility of the caller.
    
    - X: a 2D DataFrame feature matrix containing the feature of interest as a column
    - ftr: a string, the name of the feature to explore in X
    
    Returns:
    - (unique_vals_nonull) a list of the unique, non-null values of ftr in X, replacing a nan value with 'not reported'
    '''
    unique_vals = X[ftr].unique()
    unique_vals_nonull = ['not reported' if isinstance(x, float) and np.isnan(x) else x for x in unique_vals]

    return unique_vals_nonull

In [20]:
random_state = 42

# categorizing the columns in my dataset by how they should be encoded
onehot_ftrs = ['hospital_service_area', 'hospital_county', 'operating_certificate_number', 'permanent_facility_id', \
               'facility_name', 'zip_code_3_digits', 'gender', 'race', 'ethnicity', 'payment_typology_1', \
               'payment_typology_2', 'payment_typology_3']
onehot_cats = [unique_non_nans(X, ftr) for ftr in onehot_ftrs]
ordinal_ftrs = ['age_group']
ordinal_cats = [['0 to 17', '18 to 29', '30 to 49', '50 to 69', '70 or Older']]

# replace missing values in categorical columns with 'not reported'
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='not reported')),
    ('onehot', OneHotEncoder(categories=onehot_cats, sparse=False, handle_unknown='ignore'))])

# my data has no missing values in its ordinal column, so only encoding is necessary
ordinal_transformer = Pipeline(steps=[
    ('ordinal', OrdinalEncoder(categories = ordinal_cats))])

preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', categorical_transformer, onehot_ftrs),
        ('ordinal', ordinal_transformer, ordinal_ftrs)])

In [21]:
# keeping track of scores
models_r2s = pd.DataFrame(columns=['R^2', 'l1', 'l2', 'elastic net', 'random forest', 'SVR', 'XGBoost'])
models_r2s['R^2'] = pd.Series(['mean', 'std dev'])
models_r2s = models_r2s.set_index('R^2')

#### (1) Linear Regression with l1 Regularization

In [23]:
lin_reg_l1 = Lasso(random_state=random_state)
l1_params = {
    'lasso__alpha': np.logspace(math.exp(-2), math.exp(2), 21)
}
l1_test_scores_r2, l1_best_models_r2, l1_test_Xs_r2, l1_test_ys_r2 = MLpipe_Stratified_Continous_r2(X, y, preprocessor=preprocessor, ML_algo=lin_reg_l1, param_grid=l1_params)

Random State: 0
    Best Model Parameters: {'lasso__alpha': 1.3656370263406605}
    Validation Set R^2: -1.7243285062029656e-06
    Baseline R^2 (test set): -0.11625610687625354
    Test Set R^2: -4.902548941032592e-06
Random State: 28
    Best Model Parameters: {'lasso__alpha': 1.3656370263406605}
    Validation Set R^2: -4.886252826352688e-06
    Baseline R^2 (test set): -0.1161236477394243
    Test Set R^2: -1.7334713271477398e-06
Random State: 56
    Best Model Parameters: {'lasso__alpha': 1.3656370263406605}
    Validation Set R^2: -5.33421638748699e-06
    Baseline R^2 (test set): -0.11647686627314813
    Test Set R^2: -2.9463824837705488e-06
Random State: 84
    Best Model Parameters: {'lasso__alpha': 1.3656370263406605}
    Validation Set R^2: -2.946382483548504e-06
    Baseline R^2 (test set): -0.11586967058288655
    Test Set R^2: -5.33421638748699e-06
Random State: 112
    Best Model Parameters: {'lasso__alpha': 1.3656370263406605}
    Validation Set R^2: -1.0705787165665726

In [24]:
print('****Metrics with l1 Linear Regression:****')
mean = np.mean(l1_test_scores_r2)
std = np.std(l1_test_scores_r2)
models_r2s['l1']['mean'] = mean
models_r2s['l1']['std dev'] = std
print('Mean R^2:', mean)
print('Std. Deviation of R^2:', std)

****Metrics with l1 Linear Regression:****
Mean R^2: -2.526256760426726e-06
Std. Deviation of R^2: 1.7039580156994611e-06


In [25]:
file = open('../results/l1_r2.save', 'wb')
best_model_index = np.argmax(l1_test_scores_r2)
pickle.dump((l1_best_models_r2[best_model_index], l1_test_Xs_r2[best_model_index], l1_test_ys_r2[best_model_index]), file)
file.close()

#### (2) Linear Regression with l2 Regularization

In [26]:
lin_reg_l2 = Ridge(random_state=random_state)
l2_params = {
    'ridge__alpha': np.logspace(math.exp(-2), math.exp(2), 21)
}
l2_test_scores_r2, l2_best_models_r2, l2_test_Xs_r2, l2_test_ys_r2 = MLpipe_Stratified_Continous_r2(X, y, preprocessor=preprocessor, ML_algo=lin_reg_l2, param_grid=l2_params)

Random State: 0
    Best Model Parameters: {'ridge__alpha': 1.3656370263406605}
    Validation Set R^2: 0.06471723236822136
    Baseline R^2 (test set): -0.11434093353819197
    Test Set R^2: 0.06164090353931151
Random State: 28
    Best Model Parameters: {'ridge__alpha': 1.3656370263406605}
    Validation Set R^2: 0.061598832799795944
    Baseline R^2 (test set): -0.1161236477394243
    Test Set R^2: 0.0785706797893061
Random State: 56
    Best Model Parameters: {'ridge__alpha': 1.3656370263406605}
    Validation Set R^2: 0.06993506278140049
    Baseline R^2 (test set): -0.11647686627314813
    Test Set R^2: 0.05839005069610581
Random State: 84
    Best Model Parameters: {'ridge__alpha': 1.3656370263406605}
    Validation Set R^2: 0.07857378109507407
    Baseline R^2 (test set): -0.11586967058288655
    Test Set R^2: 0.07372872914355821
Random State: 112
    Best Model Parameters: {'ridge__alpha': 1.3656370263406605}
    Validation Set R^2: 0.05884575800942471
    Baseline R^2 (test s

In [27]:
print('****Metrics with l2 Linear Regression:****')
mean = np.mean(l2_test_scores_r2)
std = np.std(l2_test_scores_r2)
models_r2s['l2']['mean'] = mean
models_r2s['l2']['std dev'] = std
print('Mean R^2:', mean)
print('Std. Deviation of R^2:', std)

****Metrics with l2 Linear Regression:****
Mean R^2: 0.0679131157829063
Std. Deviation of R^2: 0.006687032990911035


In [28]:
file = open('../results/l2_r2.save', 'wb')
best_model_index = np.argmax(l2_test_scores_r2)
pickle.dump((l2_best_models_r2[best_model_index], l2_test_Xs_r2[best_model_index], l2_test_ys_r2[best_model_index]), file)
file.close()

#### (3) Linear Regression with Elastic Net Regularization

In [29]:
lin_reg_elastic = ElasticNet(random_state=random_state)
elastic_params = {
    'elasticnet__alpha': np.logspace(math.exp(-2), math.exp(2), 21),
    'elasticnet__l1_ratio': np.linspace(0, 1, 21)
}
elastic_test_scores_r2, elastic_best_models_r2, elastic_test_Xs_r2, elastic_test_ys_r2 = MLpipe_Stratified_Continous_r2(X, y, preprocessor=preprocessor, ML_algo=lin_reg_elastic, param_grid=elastic_params)

Random State: 0
    Best Model Parameters: {'elasticnet__l1_ratio': 0.0, 'elasticnet__alpha': 1.3656370263406605}
    Validation Set R^2: -6.4004051214539e-06
    Baseline R^2 (test set): -0.11434093353819197
    Test Set R^2: -1.0221970248025514e-06
Random State: 28
    Best Model Parameters: {'elasticnet__l1_ratio': 0.0, 'elasticnet__alpha': 1.3656370263406605}
    Validation Set R^2: -4.886252826352688e-06
    Baseline R^2 (test set): -0.1161236477394243
    Test Set R^2: -1.7334713271477398e-06
Random State: 56
    Best Model Parameters: {'elasticnet__l1_ratio': 0.0, 'elasticnet__alpha': 1.3656370263406605}
    Validation Set R^2: -5.33421638748699e-06
    Baseline R^2 (test set): -0.11647686627314813
    Test Set R^2: -2.9463824837705488e-06
Random State: 84
    Best Model Parameters: {'elasticnet__l1_ratio': 0.0, 'elasticnet__alpha': 1.3656370263406605}
    Validation Set R^2: -2.946382483548504e-06
    Baseline R^2 (test set): -0.11586967058288655
    Test Set R^2: -5.3342163874

In [30]:
print('****Metrics with Elastic Net Linear Regression:****')
mean = np.mean(elastic_test_scores_r2)
std = np.std(elastic_test_scores_r2)
models_r2s['elastic net']['mean'] = mean
models_r2s['elastic net']['std dev'] = std
print('Mean R^2:', mean)
print('Std. Deviation of R^2:', std)

****Metrics with Elastic Net Linear Regression:****
Mean R^2: -2.138221568803722e-06
Std. Deviation of R^2: 1.553848359847341e-06


In [31]:
file = open('../results/elastic_net_r2.save', 'wb')
best_model_index = np.argmax(elastic_test_scores_r2)
pickle.dump((elastic_best_models_r2[best_model_index], elastic_test_Xs_r2[best_model_index], elastic_test_ys_r2[best_model_index]), file)
file.close()

#### (4) Random Forest Regressor

In [32]:
random_forest_reg = RandomForestRegressor(n_jobs=-1, n_estimators=100, random_state=random_state)
rf_params = {
    'randomforestregressor__max_features': [1, 3, 10, 30],
    'randomforestregressor__max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
}
rf_test_scores_r2, rf_best_models_r2, rf_test_Xs_r2, rf_test_ys_r2 = MLpipe_Stratified_Continous_r2(X, y, preprocessor=preprocessor, ML_algo=random_forest_reg, param_grid=rf_params)

Random State: 0
    Best Model Parameters: {'randomforestregressor__max_features': 3, 'randomforestregressor__max_depth': 1}
    Validation Set R^2: -0.13413756568442725
    Baseline R^2 (test set): -0.11434093353819197
    Test Set R^2: -0.09077444588413597
Random State: 28
    Best Model Parameters: {'randomforestregressor__max_features': 1, 'randomforestregressor__max_depth': 1}
    Validation Set R^2: -0.07591650909353254
    Baseline R^2 (test set): -0.1161236477394243
    Test Set R^2: -0.10016610743719312
Random State: 56
    Best Model Parameters: {'randomforestregressor__max_features': 1, 'randomforestregressor__max_depth': 1}
    Validation Set R^2: -0.07357441639835116
    Baseline R^2 (test set): -0.11647686627314813
    Test Set R^2: -0.09258926039786441
Random State: 84
    Best Model Parameters: {'randomforestregressor__max_features': 30, 'randomforestregressor__max_depth': 1}
    Validation Set R^2: -0.08381318868856558
    Baseline R^2 (test set): -0.11586967058288655


In [33]:
print('****Metrics with Random Forest Regressor:****')
mean = np.mean(rf_test_scores_r2)
std = np.std(rf_test_scores_r2)
models_r2s['random forest']['mean'] = mean
models_r2s['random forest']['std dev'] = std
print('Mean R^2:', mean)
print('Std. Deviation of R^2:', std)

****Metrics with Random Forest Regressor:****
Mean R^2: -0.08788367780531763
Std. Deviation of R^2: 0.013756327613600423


In [34]:
file = open('../results/random_forest_regressor_r2.save', 'wb')
best_model_index = np.argmax(rf_test_scores_r2)
pickle.dump((rf_best_models_r2[best_model_index], rf_test_Xs_r2[best_model_index], rf_test_ys_r2[best_model_index]), file)
file.close()

#### (5) SVR

In [35]:
svr = SVR()
svr_params = {
    'svr__gamma': [1e-3, 1e-1, 1e1, 1e3, 1e5],
    'svr__C': [1e-1, 1e0, 1e1]
}
svr_test_scores_r2, svr_best_models_r2, svr_test_Xs_r2, svr_test_ys_r2 = MLpipe_Stratified_Continous_r2(X, y, preprocessor=preprocessor, ML_algo=svr, param_grid=svr_params)

Random State: 0
    Best Model Parameters: {'svr__gamma': 0.001, 'svr__C': 0.1}
    Validation Set R^2: 0.015209648358919248
    Baseline R^2 (test set): -0.11434093353819197
    Test Set R^2: -0.000870916134469546
Random State: 28
    Best Model Parameters: {'svr__gamma': 0.001, 'svr__C': 0.1}
    Validation Set R^2: -0.0004941910910329916
    Baseline R^2 (test set): -0.1161236477394243
    Test Set R^2: 0.023954448373224357
Random State: 56
    Best Model Parameters: {'svr__gamma': 0.001, 'svr__C': 0.1}
    Validation Set R^2: 0.026278906862495677
    Baseline R^2 (test set): -0.11647686627314813
    Test Set R^2: 0.01030462361864104
Random State: 84
    Best Model Parameters: {'svr__gamma': 0.001, 'svr__C': 0.1}
    Validation Set R^2: 0.03619946561726639
    Baseline R^2 (test set): -0.11586967058288655
    Test Set R^2: 0.02183229113960694
Random State: 112
    Best Model Parameters: {'svr__gamma': 0.001, 'svr__C': 0.1}
    Validation Set R^2: 0.019699290944369285
    Baseline R^

In [36]:
print('****Metrics with SVR:****')
mean = np.mean(svr_test_scores_r2)
std = np.std(svr_test_scores_r2)
models_r2s['SVR']['mean'] = mean
models_r2s['SVR']['std dev'] = std
print('Mean R^2:', mean)
print('Std. Deviation of R^2:', std)

****Metrics with SVR:****
Mean R^2: 0.01398217343523428
Std. Deviation of R^2: 0.009454333984253098


In [37]:
file = open('../results/svr_r2.save', 'wb')
best_model_index = np.argmax(svr_test_scores_r2)
pickle.dump((svr_best_models_r2[best_model_index], svr_test_Xs_r2[best_model_index], svr_test_ys_r2[best_model_index]), file)
file.close()

#### (6) XGBoost

In [38]:
xgb = xgboost.XGBRegressor(seed=0, n_estimators=10000, learning_rate=0.03, colsample_bytree=0.9, subsample=0.66)
xgb_params = {
    'xgbregressor__reg_alpha': [0e0, 1e-2, 1e-1, 1e0, 1e1, 1e2],
    'xgbregressor__lambda': [0e0, 1e-2, 1e-1, 1e0, 1e1, 1e2],
    'xgbregressor__max_depth': [1, 3, 10, 30, 100]
}
xgb_test_scores_r2, xgb_best_models_r2, xgb_test_Xs_r2, xgb_test_ys_r2 = MLpipe_Stratified_Continous_r2(X, y, preprocessor=preprocessor, ML_algo=xgb, param_grid=xgb_params, xgb=True)

Random State: 0
    Best Model Parameters: {'xgbregressor__reg_alpha': 0.0, 'xgbregressor__max_depth': 1, 'xgbregressor__lambda': 0.0}
    Validation Set R^2: 0.06888965978216255
    Baseline R^2 (test set): -0.11204940797462459
    Test Set R^2: 0.061355854037360213
Random State: 28
    Best Model Parameters: {'xgbregressor__reg_alpha': 0.0, 'xgbregressor__max_depth': 1, 'xgbregressor__lambda': 0.0}
    Validation Set R^2: 0.06078991861574057
    Baseline R^2 (test set): -0.1161236477394243
    Test Set R^2: 0.07523657428097119
Random State: 56
    Best Model Parameters: {'xgbregressor__reg_alpha': 0.0, 'xgbregressor__max_depth': 1, 'xgbregressor__lambda': 0.0}
    Validation Set R^2: 0.07385113705183421
    Baseline R^2 (test set): -0.11647686627314813
    Test Set R^2: 0.06656322734745945
Random State: 84
    Best Model Parameters: {'xgbregressor__reg_alpha': 0.0, 'xgbregressor__max_depth': 1, 'xgbregressor__lambda': 0.0}
    Validation Set R^2: 0.07493343350994752
    Baseline R^2 

In [39]:
print('****Metrics with XGBoost:****')
mean = np.mean(xgb_test_scores_r2)
std = np.std(xgb_test_scores_r2)
models_r2s['XGBoost']['mean'] = mean
models_r2s['XGBoost']['std dev'] = std
print('Mean R^2:', mean)
print('Std. Deviation of R^2:', std)

****Metrics with XGBoost:****
Mean R^2: 0.06928781113377777
Std. Deviation of R^2: 0.004836393838912789


In [40]:
file = open('../results/xgboost_r2.save', 'wb')
best_model_index = np.argmax(xgb_test_scores_r2)
pickle.dump((xgb_best_models_r2[best_model_index], xgb_test_Xs_r2[best_model_index], xgb_test_ys_r2[best_model_index]), file)
file.close()

### Results

In [41]:
models_r2s

Unnamed: 0_level_0,l1,l2,elastic net,random forest,SVR,XGBoost
R^2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
mean,-3e-06,0.067913,-2e-06,-0.087884,0.013982,0.069288
std dev,2e-06,0.006687,2e-06,0.013756,0.009454,0.004836
