## Gradient Boosting Regression

### Setup

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import scipy.stats as stats
import math

!pip install sklearn
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

You should consider upgrading via the '/homes/iws/bhimar/cse481ds-mental-health/venv/bin/python3 -m pip install --upgrade pip' command.[0m


In [2]:
# Read the training features from disk and preview the first rows
# (DataFrame.head() shows five rows by default).
X_train_full = pd.read_csv("X_train.csv")
X_train_full.head()

Unnamed: 0,High school graduation raw value,Unemployment raw value,Some college raw value,Ratio of population to mental health providers,Median household income raw value,Average Temperature,January Average Temperature,February Average Temperature,March Average Temperature,April Average Temperature,...,April Average Precipitation,May Average Precipitation,June Average Precipitation,July Average Precipitation,August Average Precipitation,September Average Precipitation,October Average Precipitation,November Average Precipitation,December Average Precipitation,RUCC
0,0.847,0.096,0.568,4885.0,60735.0,54.808333,26.6,36.3,44.2,55.8,...,11.01,5.75,6.53,2.99,2.16,5.3,3.1,5.95,4.9,1.0
1,0.78,0.066812,0.547429,954.495385,42945.0,74.058333,59.9,61.5,71.2,72.5,...,1.64,7.55,8.57,3.95,6.58,8.25,3.81,0.24,1.01,1.0
2,0.78,0.069824,0.453978,2573.647059,40994.0,60.825,38.5,42.9,55.2,59.1,...,2.69,3.51,2.22,3.8,5.73,0.96,0.54,1.99,3.87,6.0
3,0.866534,0.059,0.785187,4209.193548,81586.0,43.408333,16.3,17.3,24.8,39.0,...,5.51,5.31,5.99,3.51,1.34,1.62,3.07,0.53,1.26,1.0
4,0.806995,0.102,0.681151,4158.038961,43863.0,60.991667,46.3,44.2,46.8,61.4,...,4.63,3.1,10.77,8.4,5.02,1.72,1.12,2.85,4.85,2.0


In [3]:
# Read the training targets from disk and preview the first rows
# (DataFrame.head() shows five rows by default).
Y_train_full = pd.read_csv("Y_train.csv")
Y_train_full.head()

Unnamed: 0,Poor mental health days raw value,Crude Rate
0,4.1,16.6
1,4.2,11.0
2,3.9,24.9
3,2.3,8.7
4,4.0,16.8


### Gradient Boosting Regression for predicting Poor Mental Health Days

In [4]:
OUTER_FOLDS = 10
INNER_FOLDS = 10  # retained for compatibility; no inner hyperparameter search is run here
N_REPEATS = 3
RANDOM_STATE = 1  # one seed for both the fold splitter and the regressor

# Positional index of the target in Y_train_full: col 0 is Poor Mental Health Days
TARGET_COL = 0

# Per-fold metrics, one entry per (repeat, fold) pair
allMAEs = []
allRMSEs = []
allR2s = []

avgKFolds = RepeatedKFold(n_splits=OUTER_FOLDS, n_repeats=N_REPEATS, random_state=RANDOM_STATE)

# Repeated K-fold cross validation: fit a default-parameter model on each
# training split and score it on the held-out split, so the averaged metrics
# can be compared against the other models.
for train_index, test_index in avgKFolds.split(X_train_full):
  X_train, X_val = X_train_full.iloc[train_index], X_train_full.iloc[test_index]
  Y_train, Y_val = Y_train_full.iloc[train_index], Y_train_full.iloc[test_index]

  # Select the target column once per fold instead of re-slicing for every metric
  y_train_target = Y_train.iloc[:, TARGET_COL]
  y_val_target = Y_val.iloc[:, TARGET_COL]

  # Seed the regressor so the reported metrics are reproducible across runs
  model = GradientBoostingRegressor(random_state=RANDOM_STATE)
  model.fit(X_train, y_train_target)

  predictions = model.predict(X_val)
  mae = mean_absolute_error(y_val_target, predictions)
  print("mae = " + str(mae))
  rmse = math.sqrt(mean_squared_error(y_val_target, predictions))
  print("rmse = " + str(rmse))
  r2 = r2_score(y_val_target, predictions)
  print("r2 = " + str(r2))

  allMAEs.append(mae)
  allRMSEs.append(rmse)
  allR2s.append(r2)



  

mae = 0.37466346955918944
rmse = 0.5208091846689575
r2 = 0.36367608476847035
mae = 0.3815379338022162
rmse = 0.5016646780698445
r2 = 0.40225109161917394
mae = 0.3552581205541896
rmse = 0.4827035442544508
r2 = 0.5007737516419644
mae = 0.376167553546219
rmse = 0.4986540235263049
r2 = 0.4412588588548325
mae = 0.3688461602139794
rmse = 0.49299678369928673
r2 = 0.41133187693438966
mae = 0.3600342182700696
rmse = 0.4728614605974149
r2 = 0.46272011530159074
mae = 0.3810226527737829
rmse = 0.5101807554425762
r2 = 0.40631222568697667
mae = 0.37054006154418007
rmse = 0.5050018454295467
r2 = 0.4376421998713881
mae = 0.36196679268732784
rmse = 0.5109562886253105
r2 = 0.471473231818469
mae = 0.3659438727117062
rmse = 0.502147694496696
r2 = 0.39195281386914294
mae = 0.3465400223031497
rmse = 0.4666249194130657
r2 = 0.49034659854774887
mae = 0.37774050053047314
rmse = 0.5176906880158516
r2 = 0.37840829017423616
mae = 0.38333305130973405
rmse = 0.5288824640742219
r2 = 0.417065594294893
mae = 0.3716338

In [5]:
# Average each metric over the folds that were actually recorded. Dividing by
# len(...) instead of OUTER_FOLDS * N_REPEATS keeps the mean correct if the
# cross-validation loop was interrupted or the constants changed afterwards.
if not (allMAEs and allRMSEs and allR2s):
  # Without this guard an empty list would silently write 0.0 into the CSV.
  raise ValueError("No cross-validation metrics recorded; run the cross-validation cell first.")

averageMAE = sum(allMAEs) / len(allMAEs)
averageRMSE = sum(allRMSEs) / len(allRMSEs)
averageR2 = sum(allR2s) / len(allR2s)


# Make sure to change this model number for each of the different models. It
# just dictates which row in the csv to put the data.
model_number = 6

# Read the shared metrics table, overwrite (or add) this model's row, and
# write the table back to the same file.
metrics_path = "All_Model_Metrics.csv"
full_metrics_df = pd.read_csv(metrics_path)
full_metrics_df.loc[model_number] = ["Gradient Boosting Regression Poor Mental Health Days", averageMAE, averageRMSE, averageR2]
full_metrics_df.to_csv(metrics_path, index=False)

### Gradient Boosting Regression for predicting Suicides

In [6]:
OUTER_FOLDS = 10
INNER_FOLDS = 10  # retained for compatibility; no inner hyperparameter search is run here
N_REPEATS = 3
RANDOM_STATE = 1  # one seed for both the fold splitter and the regressor

# Positional index of the target in Y_train_full: col 1 is Suicides
TARGET_COL = 1

# Per-fold metrics, one entry per (repeat, fold) pair
allMAEs = []
allRMSEs = []
allR2s = []

avgKFolds = RepeatedKFold(n_splits=OUTER_FOLDS, n_repeats=N_REPEATS, random_state=RANDOM_STATE)

# Repeated K-fold cross validation: fit a default-parameter model on each
# training split and score it on the held-out split, so the averaged metrics
# can be compared against the other models.
for train_index, test_index in avgKFolds.split(X_train_full):
  X_train, X_val = X_train_full.iloc[train_index], X_train_full.iloc[test_index]
  Y_train, Y_val = Y_train_full.iloc[train_index], Y_train_full.iloc[test_index]

  # Select the target column once per fold instead of re-slicing for every metric
  y_train_target = Y_train.iloc[:, TARGET_COL]
  y_val_target = Y_val.iloc[:, TARGET_COL]

  # Seed the regressor so the reported metrics are reproducible across runs
  model = GradientBoostingRegressor(random_state=RANDOM_STATE)
  model.fit(X_train, y_train_target)

  predictions = model.predict(X_val)
  mae = mean_absolute_error(y_val_target, predictions)
  print("mae = " + str(mae))
  rmse = math.sqrt(mean_squared_error(y_val_target, predictions))
  print("rmse = " + str(rmse))
  r2 = r2_score(y_val_target, predictions)
  print("r2 = " + str(r2))

  allMAEs.append(mae)
  allRMSEs.append(rmse)
  allR2s.append(r2)

mae = 4.088446837120816
rmse = 5.797863692282332
r2 = 0.5283264368278664
mae = 3.6141965427392138
rmse = 4.73885627871843
r2 = 0.5335489790255015
mae = 3.835889803040061
rmse = 5.315162263945344
r2 = 0.5478396554592013
mae = 4.047252258914376
rmse = 5.449752110565259
r2 = 0.49678001021878015
mae = 3.6016125671990036
rmse = 4.531055919374743
r2 = 0.559138686882003
mae = 3.804221823874071
rmse = 5.202142362924167
r2 = 0.5445559305967149
mae = 3.769452390462039
rmse = 5.071785547094471
r2 = 0.4467659865243532
mae = 4.035929657533678
rmse = 5.638784449108584
r2 = 0.4669763229590569
mae = 3.7429687960237277
rmse = 5.295950841491278
r2 = 0.5514653130416978
mae = 3.7909192666977862
rmse = 5.149448169571302
r2 = 0.5342776313556789
mae = 3.869410888843532
rmse = 5.238256956597193
r2 = 0.5657211340398332
mae = 3.8506983938000627
rmse = 5.352076188917475
r2 = 0.3968102980660717
mae = 4.118411478298166
rmse = 5.6803408458143085
r2 = 0.4967571311378245
mae = 3.6681259048467747
rmse = 4.898803118775

In [7]:
# Average each metric over the folds that were actually recorded. Dividing by
# len(...) instead of OUTER_FOLDS * N_REPEATS keeps the mean correct if the
# cross-validation loop was interrupted or the constants changed afterwards.
if not (allMAEs and allRMSEs and allR2s):
  # Without this guard an empty list would silently write 0.0 into the CSV.
  raise ValueError("No cross-validation metrics recorded; run the cross-validation cell first.")

averageMAE = sum(allMAEs) / len(allMAEs)
averageRMSE = sum(allRMSEs) / len(allRMSEs)
averageR2 = sum(allR2s) / len(allR2s)


# Make sure to change this model number for each of the different models. It
# just dictates which row in the csv to put the data.
model_number = 7

# Read the shared metrics table, overwrite (or add) this model's row, and
# write the table back to the same file.
metrics_path = "All_Model_Metrics.csv"
full_metrics_df = pd.read_csv(metrics_path)
full_metrics_df.loc[model_number] = ["Gradient Boosting Regression Suicides", averageMAE, averageRMSE, averageR2]
full_metrics_df.to_csv(metrics_path, index=False)