# The ML Pipeline for a Model to Predict Length of Hospital Delivery Stay

In [1]:
import pandas as pd
import numpy as np
import math
from matplotlib import pyplot as plt
import matplotlib
from math import ceil
import pickle

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import PredefinedSplit

from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import ParameterGrid

from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

# Gather Data

In [2]:
# specifying data types for the columns to maintain formatting from original data
data_types = {
    'hospital_service_area': object, 
    'hospital_county': object,
    'operating_certificate_number': object, 
    'permanent_facility_id': object,
    'facility_name': object, 
    'age_group': object, 
    'zip_code_3_digits': object, 
    'gender': object, 
    'race': object,
    'ethnicity': object, 
    'payment_typology_1': object, 
    'payment_typology_2': object,
    'payment_typology_3': object, 
    'length_of_stay': int
}

In [3]:
all_visits = pd.read_csv('../data/planned_deliveries.csv', dtype=data_types)
all_visits = all_visits.loc[:, all_visits.columns != 'Unnamed: 0']
y = all_visits['length_of_stay']
X = all_visits.loc[:, all_visits.columns != 'length_of_stay']

# Compute Baseline Score

In [4]:
mean_length_of_stay = int(np.around(np.mean(y), 0))
median_length_of_stay = int(np.around(np.median(y), 0))
print('Mean length of stay:', mean_length_of_stay)
print('Median length of stay:', median_length_of_stay)

Mean length of stay: 2
Median length of stay: 2


In [5]:
y_pred_mean = pd.Series([2]*len(y))

#### RMSE [days]

In [7]:
baseline_rmse = mean_squared_error(y, y_pred_mean, squared=False)
print('Baseline RMSE [days]:', baseline_rmse)

Baseline RMSE [days]: 0.9521077950265668


#### R^2 [dimensionless]  
**Pretty sure this is unnecessary to do though since by definition R^2 should = 0 for the expected (average) guess of y**

In [8]:
baseline_r2 = r2_score(y, y_pred_mean)
print('Baseline R^2 [dimensionless]:', baseline_r2)

Baseline R^2 [dimensionless]: -0.11501180761222085


# Split, Train, and Cross Validate - RMSE Evaluation Metric

### Set up functions for automated pipeline

In [9]:
def stratified_continuous_split(X:pd.DataFrame, y:pd.Series, train_size:float, val_size:float, test_size:float, random_state:int):
    '''
    Performs a stratified split of inputted data (with respect to y) into a training set, validation set, and test set to specified percentages 
    of the data using verstack's scsplit and performs basic error checking.

    Parameters:
    - X: a 2D pandas DataFrame, the feature matrix
    - y: a 1D pandas Series, the target variable matrix matching X
    - train_size: a float between 0 and 1, the percentage of X which should be training data
    - val_size: a float between 0 and 1, the percentage of X which should be reserved for validation
    - test_size: a float between 0 and 1, the percentage of X which should be reserved for final testing
    - random_state: an int, the random state to split with
    Note: The sum of train_size + val_size + test_size must be 1.0 (100% of X).

    Returns:
    - (X_train) a 2D pandas DataFrame, the feature matrix of training data
    - (y_train) a 1D pandas Series, the target variable matrix for training data
    - (X_val) a 2D pandas DataFrame, the feature matrix of validation data
    - (y_val) a 1D pandas Series, the target variable matrix for validation data
    - (X_test) a 2D pandas DataFrame, the feature matrix of testing data
    - (y_test) a 1D pandas Series, the target variable matrix for testing data

    Raises:
    - ValueError for invalid input
    '''
    from verstack.stratified_continuous_split import scsplit
    
    if ((train_size + val_size + test_size) != 1):
        raise ValueError('Your train_size + val_size + test_size must add up to 1 (100%)!')
    if (not isinstance(random_state, int)):
        raise ValueError('Your random_state must be an int!')

    X_train, X_other, y_train, y_other = scsplit(X, y, stratify=y, test_size=(1-train_size), random_state=random_state)
    
    X_len = X.shape[0]
    test_percent_of_other = (test_size * X_len)/(X_len - (train_size * X_len))
    X_other = X_other.reset_index(drop=True)
    y_other = y_other.reset_index(drop=True)
    
    X_val, X_test, y_val, y_test = scsplit(X_other, y_other, stratify=y_other, test_size=test_percent_of_other, random_state=random_state)

    # basic error checking to check that split returned train, val, and test of expected sizes
    train_count_low = (int)(train_size * X_len)
    train_count_high = ceil(train_size * X_len)
    val_count_low = (int)(val_size * X_len)
    val_count_high = ceil(val_size * X_len)
    test_count_low = (int)(test_size * X_len)
    test_count_high = ceil(test_size * X_len)
    
    Xtrain_fin = X_train.shape[0]
    ytrain_fin = y_train.shape[0]
    Xval_fin = X_val.shape[0]
    yval_fin = y_val.shape[0]
    Xtest_fin = X_test.shape[0]
    ytest_fin = y_test.shape[0]
    
    if not (((Xtrain_fin == train_count_low) or (Xtrain_fin == train_count_high)) and ((ytrain_fin == train_count_low) or (ytrain_fin == train_count_high))):
        raise ValueError(f'Training set size should be approx. {train_size * X_len}, instead is: {X_train.shape[0]}')
    if not (((Xval_fin == val_count_low) or (Xval_fin == val_count_high)) and ((yval_fin == val_count_low) or (yval_fin == val_count_high))):
        raise ValueError(f'Validation set size should be approx. {val_size * X_len}, instead is: {X_val.shape[0]}')
    if not (((Xtest_fin == test_count_low) or (Xtest_fin == test_count_high)) and ((ytest_fin == test_count_low) or (ytest_fin == test_count_high))):
        raise ValueError(f'Test set size should be approx. {test_size * X_len}, instead is: {X_test.shape[0]}')

    return X_train, y_train, X_val, y_val, X_test, y_test

In [22]:
def MLpipe_Stratified_Continous_RMSE(X, y, preprocessor, ML_algo, param_grid, xgb=False):
    '''
    This function splits the data to train, validation, and test (60/20/20).
    The RMSE is minimized in cross-validation.
    
    This function:
    1. Loops through 10 different random states
    2. Splits the data 60/20/20.
    3. Fits a model with the predefined Preprocessor, trains the model with each hyperparameter combination in param_grid
    4. Calculates the model's error on the test set on the model wuth the best hyperparameter combinations in param_grid
    5. Returns a list of 10 test scores and 10 best models
    '''
    
    # lists to be returned
    test_scores = []
    best_models = []
    test_Xs = []
    test_ys = []

    nr_states = 10
    for i in range(nr_states):
        rs = 28 * i
        print('Random State:', rs)

        # split
        X_train, y_train, X_val, y_val, X_test, y_test = stratified_continuous_split(X, y, train_size=0.6, val_size=0.2, test_size=0.2, random_state=rs)

        # preprocess
        X_train_prep = preprocessor.fit_transform(X_train)
        X_val_prep = preprocessor.transform(X_val)
        X_test_prep = preprocessor.transform(X_test)

        # final preprocess with Standard Scaler so that I can use the coefficients of linear models as global importance metrics
        final_scaler = StandardScaler()
        X_train_prep = final_scaler.fit_transform(X_train_prep)
        X_val_prep = final_scaler.transform(X_val_prep)
        X_test_prep = final_scaler.transform(X_test_prep)

        test_Xs.append(X_test_prep)
        test_ys.append(y_test)

        # train and perform cross-validation        
        models = []
        val_scores = []
        for p in range(len(ParameterGrid(param_grid))):
            params = ParameterGrid(param_grid)[p]
            # print(' ',params) # TEMPORARY

            if (xgb):
                clf = ML_algo.fit(X_train_prep, y_train, early_stopping_rounds=50, eval_set=[(X_val_prep, y_val)], verbose=False)
            else:
                clf = ML_algo.fit(X_train_prep, y_train)
            models.append(clf)
            y_val_pred = clf.predict(X_val_prep)
            val_scores.append(mean_squared_error(y_val, y_val_pred, squared=False))
            # print(' Validation RMSE:', val_scores[-1]) # TEMPORARY

        # save results
        print('    Best Model Parameters:', ParameterGrid(param_grid)[np.argmin(val_scores)])
        print('    Validation Set RMSE:', np.min(val_scores))
        best_model_this_rs = models[np.argmin(val_scores)]
        best_models.append(best_model_this_rs)
        y_test_pred = best_model_this_rs.predict(X_test_prep)
        test_score = mean_squared_error(y_test, y_test_pred, squared=False)
        test_scores.append(test_score)
        print('    Baseline RMSE (test set):', mean_squared_error(y_test, pd.Series([2]*len(y_test)), squared=False))
        print('    Test Set RMSE:', test_score)
        
    return test_scores, best_models, test_Xs, test_ys

### Run and Cross Validate Several Models - RMSE

In [11]:
random_state = 42

# categorizing the columns in my dataset by how they should be encoded
onehot_ftrs = ['hospital_service_area', 'hospital_county', 'operating_certificate_number', 'permanent_facility_id', \
               'facility_name', 'zip_code_3_digits', 'gender', 'race', 'ethnicity', 'payment_typology_1', \
               'payment_typology_2', 'payment_typology_3']
ordinal_ftrs = ['age_group']
ordinal_cats = [['0 to 17', '18 to 29', '30 to 49', '50 to 69', '70 or Older']]

# replace missing values in categorical columns with 'not reported'
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant',fill_value='not reported')),
    ('onehot', OneHotEncoder(sparse=False,handle_unknown='ignore'))])

# my data has no missing values in its ordinal column, so only encoding is necessary
ordinal_transformer = Pipeline(steps=[
    ('ordinal', OrdinalEncoder(categories = ordinal_cats))])

preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', categorical_transformer, onehot_ftrs),
        ('ordinal', ordinal_transformer, ordinal_ftrs)])

In [12]:
# keeping track of scores
models_rmses = pd.DataFrame(columns=['RMSE', 'l1', 'l2', 'elastic net', 'random forest', 'SVR', 'XGBoost'])
models_rmses['RMSE'] = pd.Series(['mean', 'std dev'])
models_rmses = models_rmses.set_index('RMSE')

#### (1) Linear Regression with l1 Regularization

In [35]:
lin_reg_l1 = Lasso(random_state=random_state)
l1_params = {
    'lasso__alpha': np.linspace(math.exp(-2), math.exp(2), 21)
}
l1_test_scores, l1_best_models, l1_test_Xs, l1_test_ys = MLpipe_Stratified_Continous_RMSE(X, y, preprocessor=preprocessor, ML_algo=lin_reg_l1, param_grid=l1_params)

Random State: 0
    Best Model Parameters: {'lasso__alpha': 0.1353352832366127}
    Validation Set RMSE: 0.8865510222531405
    Baseline RMSE (test set): 0.9537287408033268
    Test Set RMSE: 0.9034748141373455
Random State: 28
    Best Model Parameters: {'lasso__alpha': 0.1353352832366127}
    Validation Set RMSE: 0.8943722755526549
    Baseline RMSE (test set): 0.9463120596425579
    Test Set RMSE: 0.8957330448081982
Random State: 56
    Best Model Parameters: {'lasso__alpha': 0.1353352832366127}
    Validation Set RMSE: 0.8943724758747483
    Baseline RMSE (test set): 0.9442024217513739
    Test Set RMSE: 0.8935953196069969
Random State: 84
    Best Model Parameters: {'lasso__alpha': 0.1353352832366127}
    Validation Set RMSE: 0.8935953196069968
    Baseline RMSE (test set): 0.9447654524643497
    Test Set RMSE: 0.8943724758747483
Random State: 112
    Best Model Parameters: {'lasso__alpha': 0.1353352832366127}
    Validation Set RMSE: 0.9160331114616256
    Baseline RMSE (test set

In [36]:
print('****Metrics with l1 Linear Regression:****')
mean = np.mean(l1_test_scores)
std = np.std(l1_test_scores)
models_rmses['l1']['mean'] = mean
models_rmses['l1']['std dev'] = std
print('Mean RMSE:', mean)
print('Std. Deviation of RMSE:', std)

****Metrics with l1 Linear Regression:****
Mean RMSE: 0.8993595838921766
Std. Deviation of RMSE: 0.0055407996038346575


In [37]:
file = open('../results/lin_reg_l1.save', 'wb')
best_model_index = np.argmin(l1_test_scores)
pickle.dump((l1_best_models[best_model_index], l1_test_Xs[best_model_index], l1_test_ys[best_model_index]), file)
file.close()

#### (2) Linear Regression with l2 Regularization

In [38]:
lin_reg_l2 = Ridge(random_state=random_state)
l2_params = {
    'ridge__alpha': np.linspace(math.exp(-2), math.exp(2), 21)
}
l2_test_scores, l2_best_models, l2_test_Xs, l2_test_ys = MLpipe_Stratified_Continous_RMSE(X, y, preprocessor=preprocessor, ML_algo=lin_reg_l2, param_grid=l2_params)

Random State: 0
    Best Model Parameters: {'ridge__alpha': 0.1353352832366127}
    Validation Set RMSE: 0.8573809137552494
    Baseline RMSE (test set): 0.9537287408033268
    Test Set RMSE: 0.8751860016356476
Random State: 28
    Best Model Parameters: {'ridge__alpha': 0.1353352832366127}
    Validation Set RMSE: 0.8663862212169129
    Baseline RMSE (test set): 0.9463120596425579
    Test Set RMSE: 0.8598233472524134
Random State: 56
    Best Model Parameters: {'ridge__alpha': 0.1353352832366127}
    Validation Set RMSE: 0.8625293918481787
    Baseline RMSE (test set): 0.9442024217513739
    Test Set RMSE: 0.8671131350438647
Random State: 84
    Best Model Parameters: {'ridge__alpha': 0.1353352832366127}
    Validation Set RMSE: 0.8577693591578244
    Baseline RMSE (test set): 0.9447654524643497
    Test Set RMSE: 0.8607684974498052
Random State: 112
    Best Model Parameters: {'ridge__alpha': 0.1353352832366127}
    Validation Set RMSE: 0.8886716828086008
    Baseline RMSE (test set

In [39]:
print('****Metrics with l2 Linear Regression:****')
mean = np.mean(l2_test_scores)
std = np.std(l2_test_scores)
models_rmses['l2']['mean'] = mean
models_rmses['l2']['std dev'] = std
print('Mean RMSE:', mean)
print('Std. Deviation of RMSE:', std)

****Metrics with l2 Linear Regression:****
Mean RMSE: 0.8682782189523175
Std. Deviation of RMSE: 0.006357476611995466


In [40]:
file = open('../results/lin_reg_l2.save', 'wb')
best_model_index = np.argmin(l2_test_scores)
pickle.dump((l2_best_models[best_model_index], l2_test_Xs[best_model_index], l2_test_ys[best_model_index]), file)
file.close()

#### (3) Linear Regression with Elastic Net Regularization

In [43]:
lin_reg_elastic = ElasticNet(random_state=random_state)
elastic_params = {
    'elasticnet__alpha': np.linspace(math.exp(-2), math.exp(2), 21),
    'elasticnet__l1_ratio': np.linspace(0, 1, 21)
}
elastic_test_scores, elastic_best_models, elastic_test_Xs, elastic_test_ys = MLpipe_Stratified_Continous_RMSE(X, y, preprocessor=preprocessor, ML_algo=lin_reg_elastic, param_grid=elastic_params)

Random State: 0
    Best Model Parameters: {'elasticnet__l1_ratio': 0.0, 'elasticnet__alpha': 0.1353352832366127}
    Validation Set RMSE: 0.9071697063741746
    Baseline RMSE (test set): 0.9313015197057405
    Test Set RMSE: 0.8806008429832736
Random State: 28
    Best Model Parameters: {'elasticnet__l1_ratio': 0.0, 'elasticnet__alpha': 0.1353352832366127}
    Validation Set RMSE: 0.8943722755526549
    Baseline RMSE (test set): 0.9463120596425579
    Test Set RMSE: 0.8957330448081982
Random State: 56
    Best Model Parameters: {'elasticnet__l1_ratio': 0.0, 'elasticnet__alpha': 0.1353352832366127}
    Validation Set RMSE: 0.8943724758747483
    Baseline RMSE (test set): 0.9442024217513739
    Test Set RMSE: 0.8935953196069969
Random State: 84
    Best Model Parameters: {'elasticnet__l1_ratio': 0.0, 'elasticnet__alpha': 0.1353352832366127}
    Validation Set RMSE: 0.8935953196069968
    Baseline RMSE (test set): 0.9447654524643497
    Test Set RMSE: 0.8943724758747483
Random State: 112

In [44]:
print('****Metrics with Elastic Net Linear Regression:****')
mean = np.mean(elastic_test_scores)
std = np.std(elastic_test_scores)
models_rmses['elastic net']['mean'] = mean
models_rmses['elastic net']['std dev'] = std
print('Mean RMSE:', mean)
print('Std. Deviation of RMSE:', std)

****Metrics with Elastic Net Linear Regression:****
Mean RMSE: 0.8970721867767695
Std. Deviation of RMSE: 0.00767878888483994


In [45]:
file = open('../results/lin_reg_elastic_net.save', 'wb')
best_model_index = np.argmin(elastic_test_scores)
pickle.dump((elastic_best_models[best_model_index], elastic_test_Xs[best_model_index], elastic_test_ys[best_model_index]), file)
file.close()

#### (4) Random Forest Regressor

In [46]:
random_forest_reg = RandomForestRegressor(n_jobs=-1, n_estimators=100, random_state=random_state)
rf_params = {
    'randomforestregressor__max_features': [1, 3, 10, 30],
    'randomforestregressor__max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
}
rf_test_scores, rf_best_models, rf_test_Xs, rf_test_ys = MLpipe_Stratified_Continous_RMSE(X, y, preprocessor=preprocessor, ML_algo=random_forest_reg, param_grid=rf_params)

Random State: 0
    Best Model Parameters: {'randomforestregressor__max_features': 1, 'randomforestregressor__max_depth': 1}
    Validation Set RMSE: 0.9440919009659395
    Baseline RMSE (test set): 0.9537287408033268
    Test Set RMSE: 0.9428757523449156
Random State: 28
    Best Model Parameters: {'randomforestregressor__max_features': 10, 'randomforestregressor__max_depth': 3}
    Validation Set RMSE: 0.9275653694755235
    Baseline RMSE (test set): 0.9463120596425579
    Test Set RMSE: 0.9398844585632351
Random State: 56
    Best Model Parameters: {'randomforestregressor__max_features': 1, 'randomforestregressor__max_depth': 1}
    Validation Set RMSE: 0.9274641501241876
    Baseline RMSE (test set): 0.9442024217513739
    Test Set RMSE: 0.933730980542309
Random State: 84
    Best Model Parameters: {'randomforestregressor__max_features': 30, 'randomforestregressor__max_depth': 1}
    Validation Set RMSE: 0.9305690245471583
    Baseline RMSE (test set): 0.9447654524643497
    Test S

In [47]:
print('****Metrics with Random Forest Regression:****')
mean = np.mean(rf_test_scores)
std = np.std(rf_test_scores)
models_rmses['random forest']['mean'] = mean
models_rmses['random forest']['std dev'] = std
print('Mean RMSE:', mean)
print('Std. Deviation of RMSE:', std)

****Metrics with Random Forest Regression:****
Mean RMSE: 0.9379610533562293
Std. Deviation of RMSE: 0.00909099606957027


In [48]:
file = open('../results/random_forest_regressor.save', 'wb')
best_model_index = np.argmin(rf_test_scores)
pickle.dump((rf_best_models[best_model_index], rf_test_Xs[best_model_index], rf_test_ys[best_model_index]), file)
file.close()

#### (5) Support Vector Regression (SVR)

In [12]:
from sklearn.svm import SVR

In [13]:
svr = SVR()
svr_params = {
    'svr__gamma': [1e-3, 1e-1, 1e1, 1e3, 1e5],
    'svr__C': [1e-1, 1e0, 1e1]
}
svr_test_scores, svr_best_models, svr_test_Xs, svr_test_ys = MLpipe_Stratified_Continous_RMSE(X, y, preprocessor=preprocessor, ML_algo=svr, param_grid=svr_params)

Random State: 0
    Best Model Parameters: {'svr__gamma': 0.001, 'svr__C': 0.1}
    Validation Set RMSE: 0.8737161388388178
    Baseline RMSE (test set): 0.9566515208850047
    Test Set RMSE: 0.8990157701174678
Random State: 28
    Best Model Parameters: {'svr__gamma': 0.001, 'svr__C': 0.1}
    Validation Set RMSE: 0.8945910250978423
    Baseline RMSE (test set): 0.9463120596425579
    Test Set RMSE: 0.8849388528798543
Random State: 56
    Best Model Parameters: {'svr__gamma': 0.001, 'svr__C': 0.1}
    Validation Set RMSE: 0.8825403205768152
    Baseline RMSE (test set): 0.9442024217513739
    Test Set RMSE: 0.8889780059159369
Random State: 84
    Best Model Parameters: {'svr__gamma': 0.001, 'svr__C': 0.1}
    Validation Set RMSE: 0.8772711088587583
    Baseline RMSE (test set): 0.9447654524643497
    Test Set RMSE: 0.8845531390704325
Random State: 112
    Best Model Parameters: {'svr__gamma': 0.001, 'svr__C': 0.1}
    Validation Set RMSE: 0.9069651444907146
    Baseline RMSE (test set

In [14]:
print('****Metrics with Support Vector Regression:****')
mean = np.mean(svr_test_scores)
std = np.std(svr_test_scores)
models_rmses['SVR']['mean'] = mean
models_rmses['SVR']['std dev'] = std
print('Mean RMSE:', mean)
print('Std. Deviation of RMSE:', std)

****Metrics with Support Vector Regression:****
Mean RMSE: 0.8925664259157626
Std. Deviation of RMSE: 0.008012066615216527


In [15]:
file = open('../results/svr.save', 'wb')
best_model_index = np.argmin(svr_test_scores)
pickle.dump((svr_best_models[best_model_index], svr_test_Xs[best_model_index], svr_test_ys[best_model_index]), file)
file.close()

#### (6) XGBoost

In [14]:
import xgboost

In [24]:
xgb = xgboost.XGBRegressor(seed=0, n_estimators=10000, learning_rate=0.03, colsample_bytree=0.9, subsample=0.66)
xgb_params = {
    'xgbregressor__reg_alpha': [0e0, 1e-2, 1e-1, 1e0, 1e1, 1e2],
    'xgbregressor__lambda': [0e0, 1e-2, 1e-1, 1e0, 1e1, 1e2],
    'xgbregressor__max_depth': [1, 3, 10, 30, 100]
}
xgb_test_scores, xgb_best_models, xgb_test_Xs, xgb_test_ys = MLpipe_Stratified_Continous_RMSE(X, y, preprocessor=preprocessor, ML_algo=xgb, param_grid=xgb_params, xgb=True)

Random State: 0
    Best Model Parameters: {'xgbregressor__reg_alpha': 0.0, 'xgbregressor__max_depth': 1, 'xgbregressor__lambda': 0.0}
    Validation Set RMSE: 0.8892998064417896
    Baseline RMSE (test set): 0.9447654524643497
    Test Set RMSE: 0.8646508617113036
Random State: 28


KeyboardInterrupt: 

In [None]:
print('****Metrics with XGB Regressor:****')
mean = np.mean(xgb_test_scores)
std = np.std(xgb_test_scores)
models_rmses['XGBoost']['mean'] = mean
models_rmses['XGBoost']['std dev'] = std
print('Mean RMSE:', mean)
print('Std. Deviation of RMSE:', std)

In [None]:
file = open('../results/xgboost.save', 'wb')
best_model_index = np.argmin(xgb_test_scores)
pickle.dump((xgb_best_models[best_model_index], xgb_test_Xs[best_model_index], xgb_test_ys[best_model_index]), file)
file.close()

### RESULTS

In [None]:
models_rmses

# Split, Train, and Cross Validate - RMSE Evaluation Metric

### Set up functions for automated pipeline

In [None]:
def MLpipe_Stratified_Continous_r2(X, y, preprocessor, ML_algo, param_grid, xgb=False):
    '''
    This function splits the data to train, validation, and test (60/20/20).
    The R^2 is maximized in cross-validation.
    
    This function:
    1. Loops through 10 different random states
    2. Splits the data 60/20/20.
    3. Fits a model with the predefined Preprocessor, trains the model with each hyperparameter combination in param_grid
    4. Calculates the model's error on the test set on the model with the best hyperparameter combinations in param_grid
    5. Returns a list of 10 test scores and 10 best models
    '''
    
    # lists to be returned
    test_scores = []
    best_models = []
    test_Xs = []
    test_ys = []

    nr_states = 10
    for i in range(nr_states):
        rs = 28 * i
        print('Random State:', rs)

        # split
        X_train, y_train, X_val, y_val, X_test, y_test = stratified_continuous_split(X, y, train_size=0.6, val_size=0.2, test_size=0.2, random_state=rs)

        # preprocess
        X_train_prep = preprocessor.fit_transform(X_train)
        X_val_prep = preprocessor.transform(X_val)
        X_test_prep = preprocessor.transform(X_test)

        # final preprocess with Standard Scaler so that I can use the coefficients of linear models as global importance metrics
        final_scaler = StandardScaler()
        X_train_prep = final_scaler.fit_transform(X_train_prep)
        X_val_prep = final_scaler.transform(X_val_prep)
        X_test_prep = final_scaler.transform(X_test_prep)

        test_Xs.append(X_test_prep)
        test_ys.append(y_test)

        # train and perform cross-validation        
        models = []
        val_scores = []
        for p in range(len(ParameterGrid(param_grid))):
            params = ParameterGrid(param_grid)[p]
            # print(' ',params) # TEMPORARY

            if (xgb):
                clf = ML_algo.fit(X_train_prep, y_train, early_stopping_rounds=50, eval_set=[(X_val_prep, y_val)])
            else:
                clf = ML_algo.fit(X_train_prep, y_train)
            models.append(clf)
            y_val_pred = clf.predict(X_val_prep)
            val_scores.append(r2_score(y_val, y_val_pred))
            # print(' Validation R^2:', val_scores[-1]) # TEMPORARY

        # save results
        print('    Best Model Parameters:', ParameterGrid(param_grid)[np.argmax(val_scores)])
        print('    Validation Set R^2:', np.max(val_scores))
        best_model_this_rs = models[np.argmax(val_scores)]
        best_models.append(best_model_this_rs)
        y_test_pred = best_model_this_rs.predict(X_test_prep)
        test_score = r2_score(y_test, y_test_pred)
        test_scores.append(test_score)
        print('    Baseline R^2 (test set):', r2_score(y_test, pd.Series([2]*len(y_test))))
        print('    Test Set R^2:', test_score)
        
    return test_scores, best_models, test_Xs, test_ys

### Run and Cross Validate Several Models - R^2

In [None]:
# keeping track of scores
models_r2s = pd.DataFrame(columns=['R^2', 'l1', 'l2', 'elastic net', 'random forest', 'SVR', 'XGBoost'])
models_r2s['R^2'] = pd.Series(['mean', 'std dev'])
models_r2s = models_r2s.set_index('R^2')

#### (1) Linear Regression with l1 Regularization

In [None]:
lin_reg_l1 = Lasso(random_state=random_state)
l1_params = {
    'lasso__alpha': np.linspace(math.exp(-2), math.exp(2), 21)
}
l1_test_scores_r2, l1_best_models_r2, l1_test_Xs_r2, l1_test_ys_r2 = MLpipe_Stratified_Continous_r2(X, y, preprocessor=preprocessor, ML_algo=lin_reg_l1, param_grid=l1_params)