**Functions**

In [None]:
def _tune_and_score(estimator, param_grid, X_train, y_train, X_test, y_test,
                    scoring=None):
    """Grid-search ``estimator`` on the training split and score it on the test split.

    The search uses a shuffled 5-fold ``KFold`` (``random_state=0``) inside the
    training data.  ``GridSearchCV`` keeps its default ``refit=True``, so
    ``best_estimator_`` is already fitted on the full training split when the
    search returns; no additional ``fit`` call is needed.

    Parameters
    ----------
    estimator : scikit-learn regressor to tune.
    param_grid : dict mapping parameter names to the candidate values.
    X_train, y_train : training features / target for this outer fold.
    X_test, y_test : held-out features / target for this outer fold.
    scoring : passed straight to ``GridSearchCV``; ``None`` means the
        estimator's own ``score`` method is used for model selection.

    Returns
    -------
    (best_estimator, error) where ``error`` is the median absolute error of
    ``best_estimator`` on the held-out split.
    """
    search = GridSearchCV(estimator, param_grid, scoring=scoring,
                          cv=KFold(n_splits=5, shuffle=True, random_state=0))
    search.fit(X_train, y_train)
    best = search.best_estimator_
    return best, median_absolute_error(y_test, best.predict(X_test))


def run_regressors(X, y, scale_inputs=False):
    """
    Run the regressors used for this paper on X and y with 10-fold stratified CV.

    For every outer fold the following models are fitted on the training split
    and evaluated on the held-out split with the median absolute error:
    linear regression, ridge, lasso, elastic net, AdaBoost (with a linear
    base learner), random forest and gradient boosting.  Hyperparameters are
    chosen on the training split only (``*CV`` estimators or an inner 5-fold
    grid search).  Chosen hyperparameters and the per-model summary
    (median of the fold errors '+/-' their interquartile range) are printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.  Must contain the columns ``sex`` and ``diagnosis``,
        which are used (together with the binned target) to build the
        stratification labels.  All columns of ``X`` are used as features.
    y : pandas.Series
        Target values, positionally aligned with the rows of ``X``.
    scale_inputs : bool, default False
        If True, standardise the features with a ``StandardScaler`` fitted on
        each training split before fitting the models.  The default (False)
        fits every model on the raw features, which is what this function has
        always done: earlier versions computed scaled copies but never passed
        them to any model.
    """
    # number of outer cross-validation folds
    num_folds = 10

    # per-fold median absolute errors; -1 marks "not computed yet"
    lr_results = np.full(num_folds, -1.0)
    rr_results = np.full(num_folds, -1.0)
    lasso_results = np.full(num_folds, -1.0)
    EN_results = np.full(num_folds, -1.0)
    RF_results = np.full(num_folds, -1.0)
    GB_results = np.full(num_folds, -1.0)
    AB_results = np.full(num_folds, -1.0)

    # Build the stratification label: binned target x sex x diagnosis.
    # Targets outside (10, 150] fall in no bin and get the factorize code -1.
    y_cat = pd.cut(y, bins=[10, 60, 70, 80, 90, 100, 110, 150],
                   labels=range(7), ordered=True)
    y_cat_2 = pd.factorize(y_cat)[0]

    cat = pd.DataFrame({'VABS': y_cat_2, 'Sex': X.sex, 'Dx': X.diagnosis})
    cat['combination'] = cat[['Sex', 'Dx', 'VABS']].agg(tuple, axis=1)
    cat['dummy'] = cat['combination'].factorize()[0]

    # stratified cross validation over the combined label
    skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=20)

    # run the regression models for each fold
    for fold, (train, test) in enumerate(skf.split(X, cat['dummy'])):

        print('*** fold: ', fold, ' ***')

        # get the training and testing sets
        X_train = X.iloc[train, :]
        X_test = X.iloc[test, :]
        y_train = y.iloc[train]
        y_test = y.iloc[test]

        if scale_inputs:
            # Fit the scaler on the training split only so that no statistics
            # of the held-out split leak into the model.
            scaler = StandardScaler().fit(X_train)
            X_train = pd.DataFrame(scaler.transform(X_train),
                                   columns=X_train.columns, index=X_train.index)
            X_test = pd.DataFrame(scaler.transform(X_test),
                                  columns=X_test.columns, index=X_test.index)

        # Linear regression
        lr = LinearRegression().fit(X_train, y_train)
        lr_results[fold] = median_absolute_error(y_test, lr.predict(X_test))

        # Ridge: pick alpha with RidgeCV, then refit a plain Ridge with it
        clf = RidgeCV(alphas=[0.001, 0.01, 1, 10]).fit(X_train, y_train)
        rr = Ridge(alpha=clf.alpha_).fit(X_train, y_train)
        rr_results[fold] = median_absolute_error(y_test, rr.predict(X_test))
        print('ridge: ', clf.alpha_)

        # Lasso: pick alpha with LassoCV, then refit a plain Lasso with it
        clf = LassoCV(alphas=[0.001, 0.01, 1, 10]).fit(X_train, y_train)
        model_lasso = Lasso(alpha=clf.alpha_).fit(X_train, y_train)
        lasso_results[fold] = median_absolute_error(y_test, model_lasso.predict(X_test))
        print('lasso: ', clf.alpha_)

        # ElasticNet: pick alpha and l1_ratio with ElasticNetCV, then refit
        clf = ElasticNetCV(l1_ratio=[0.01, .1, .5, .95, .99, 1],
                           alphas=[0.01, 0.1, 0.5, 1]).fit(X_train, y_train)
        model_EN = ElasticNet(alpha=clf.alpha_, l1_ratio=clf.l1_ratio_).fit(X_train, y_train)
        EN_results[fold] = median_absolute_error(y_test, model_EN.predict(X_test))
        print('EN: ', clf.alpha_, clf.l1_ratio_)

        # Adaboost with a linear base learner
        # NOTE(review): no `scoring` is given here, so the grid search selects
        # with the estimator's default score, whereas the RF and GB searches
        # below select on 'neg_mean_absolute_error' -- confirm this is intended.
        param_grid = {
            'n_estimators': np.arange(start=5, stop=80, step=10),
            'learning_rate': np.arange(start=0.001, stop=0.01, step=0.001)
        }
        ab = AdaBoostRegressor(LinearRegression(), loss='square', random_state=0)
        best_ab, AB_results[fold] = _tune_and_score(
            ab, param_grid, X_train, y_train, X_test, y_test)
        print('AB: ', best_ab.n_estimators, best_ab.learning_rate)

        # Random forest
        # max_features candidates run from 1 to n_features - 1 (arange
        # excludes `stop`), so "use all features" is not among them.
        param_grid = {
            'max_features': np.arange(start=1, stop=X.shape[1], step=1),
            'n_estimators': np.arange(start=50, stop=100, step=10)
        }
        rf = RandomForestRegressor(criterion='absolute_error', random_state=0)
        best_rf, RF_results[fold] = _tune_and_score(
            rf, param_grid, X_train, y_train, X_test, y_test,
            scoring='neg_mean_absolute_error')
        print('RF:', best_rf.n_estimators, best_rf.max_features)

        # Gradient boosting
        param_grid = {
            'max_depth': np.arange(start=10, stop=50, step=10),
            'n_estimators': np.arange(start=60, stop=100, step=10)
        }
        gb = GradientBoostingRegressor(learning_rate=0.1, random_state=0,
                                       loss='absolute_error')
        _, GB_results[fold] = _tune_and_score(
            gb, param_grid, X_train, y_train, X_test, y_test,
            scoring='neg_mean_absolute_error')

    # print the results: median of the fold errors '+/-' their interquartile range
    # NOTE(review): GB_results is filled above but no summary line for it is
    # printed in this part of the function -- confirm it is reported elsewhere.
    print('*****')
    for label, scores in (('lr   : ', lr_results),
                          ('rr   : ', rr_results),
                          ('lasso: ', lasso_results),
                          ('EN   : ', EN_results),
                          ('RF   : ', RF_results),
                          ('AB   : ', AB_results)):
        print(label, np.median(scores), '+/-',
              np.percentile(scores, 75) - np.percentile(scores, 25))