## Random Forest Regression

### Setup

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import scipy.stats as stats
import math

!pip install sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score



In [None]:
# Read the training feature matrix from disk and preview its first five rows.
X_train_full = pd.read_csv(filepath_or_buffer="X_train.csv")
X_train_full.head(n=5)

Unnamed: 0,High school graduation raw value,Unemployment raw value,Some college raw value,Ratio of population to mental health providers,Median household income raw value,Average Temperature,January Average Temperature,February Average Temperature,March Average Temperature,April Average Temperature,May Average Temperature,June Average Temperature,July Average Temperature,August Average Temperature,September Average Temperature,October Average Temperature,November Average Temperautre,December Average Temperature,Average Precipitation,January Average Precipitation,February Average Precipitation,March Average Precipitation,April Average Precipitation,May Average Precipitation,June Average Precipitation,July Average Precipitation,August Average Precipitation,September Average Precipitation,October Average Precipitation,November Average Precipitation,December Average Precipitation,RUCC
0,0.847,0.096,0.568,4885.0,60735.0,54.808333,26.6,36.3,44.2,55.8,62.7,72.3,79.2,74.2,65.1,53.8,48.4,39.1,4.859167,1.18,5.42,4.02,11.01,5.75,6.53,2.99,2.16,5.3,3.1,5.95,4.9,1.0
1,0.78,0.066812,0.547429,954.495385,42945.0,74.058333,59.9,61.5,71.2,72.5,77.2,82.4,84.6,83.6,81.9,76.1,68.5,69.3,4.5725,7.38,2.43,3.46,1.64,7.55,8.57,3.95,6.58,8.25,3.81,0.24,1.01,1.0
2,0.78,0.069824,0.453978,2573.647059,40994.0,60.825,38.5,42.9,55.2,59.1,65.9,75.7,79.2,77.6,74.3,63.2,53.7,44.6,3.104167,5.07,5.22,1.65,2.69,3.51,2.22,3.8,5.73,0.96,0.54,1.99,3.87,6.0
3,0.866534,0.059,0.785187,4209.193548,81586.0,43.408333,16.3,17.3,24.8,39.0,56.3,67.0,72.4,71.4,65.3,48.0,31.4,11.7,2.673333,0.83,1.32,1.79,5.51,5.31,5.99,3.51,1.34,1.62,3.07,0.53,1.26,1.0
4,0.806995,0.102,0.681151,4158.038961,43863.0,60.991667,46.3,44.2,46.8,61.4,67.8,76.6,79.3,77.1,71.8,62.5,50.0,48.1,4.201667,1.91,4.31,1.74,4.63,3.1,10.77,8.4,5.02,1.72,1.12,2.85,4.85,2.0


In [None]:
# Read the training targets from disk and preview the first five rows.
# The modelling cells select the target by column position from this frame.
Y_train_full = pd.read_csv(filepath_or_buffer="Y_train.csv")
Y_train_full.head(n=5)

Unnamed: 0,Poor mental health days raw value,Crude Rate
0,4.1,16.6
1,4.2,11.0
2,3.9,24.9
3,2.3,8.7
4,4.0,16.8


### Random Forest regression for predicting Poor Mental Health Days

In [None]:
# Nested cross-validation settings: the outer loop measures held-out error,
# the inner loop (inside GridSearchCV) picks the hyperparameters.
OUTER_FOLDS, INNER_FOLDS, N_REPEATS = 10, 5, 3

# Hyperparameter grid searched by the inner loop (4 * 2 * 3 * 3 * 4 = 288
# candidates). Values were narrowed down from an earlier random search.
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}

# One entry per outer fold is appended to each of these lists.
allMAEs, allRMSEs, allR2s = [], [], []

avgKFolds = RepeatedKFold(n_splits=OUTER_FOLDS, n_repeats=N_REPEATS, random_state=1)

# The inner splitter and the base estimator do not depend on the outer fold:
# the splitter is seeded with a fixed integer and GridSearchCV clones the
# estimator before fitting, so both can be built once up front.
# (n_estimators here is overridden by the values in param_grid.)
cv = RepeatedKFold(n_splits=INNER_FOLDS, n_repeats=1, random_state=1)
model = RandomForestRegressor(n_estimators=1000, random_state=42)

# Outer cross-validation: average the metrics over many random folds so the
# model families can be compared fairly.
counter = 0
for counter, (train_index, test_index) in enumerate(avgKFolds.split(X_train_full), start=1):
    X_train, X_val = X_train_full.iloc[train_index], X_train_full.iloc[test_index]
    Y_train, Y_val = Y_train_full.iloc[train_index], Y_train_full.iloc[test_index]

    # Column 0 of the target frame is Poor Mental Health Days.
    y_fit = Y_train.iloc[:, 0]
    y_true = Y_val.iloc[:, 0]

    # Inner cross-validation: exhaustive search for the best hyperparameters,
    # then a refit on the whole outer-training split.
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=cv,
        n_jobs=-1,
        verbose=2,
    )
    grid_search.fit(X_train, y_fit)

    # Score the tuned model on the held-out outer fold.
    predictions = grid_search.predict(X_val)

    mae = mean_absolute_error(y_true, predictions)
    print("mae = " + str(mae))
    rmse = math.sqrt(mean_squared_error(y_true, predictions))
    print("rmse = " + str(rmse))
    r2 = r2_score(y_true, predictions)
    print("r2 = " + str(r2))

    allMAEs.append(mae)
    allRMSEs.append(rmse)
    allR2s.append(r2)

    # Progress indicator: number of outer folds finished so far.
    print(counter)

# Total number of outer folds that were evaluated.
print(counter)



  

Fitting 5 folds for each of 288 candidates, totalling 1440 fits
mae = 0.3725401294811723
rmse = 0.5148437867726245
r2 = 0.37816963062589204
1
Fitting 5 folds for each of 288 candidates, totalling 1440 fits
mae = 0.37557789990190155
rmse = 0.49295848204735054
r2 = 0.4228184612534954
2
Fitting 5 folds for each of 288 candidates, totalling 1440 fits
mae = 0.35291616471709125
rmse = 0.4817993132396631
r2 = 0.5026423645911171
3
Fitting 5 folds for each of 288 candidates, totalling 1440 fits
mae = 0.371715577263909
rmse = 0.5050801048232525
r2 = 0.42676523781461895
4
Fitting 5 folds for each of 288 candidates, totalling 1440 fits


In [None]:
# Average each metric over the outer folds that actually produced a result.
# Dividing by the number of collected values (instead of the nominal
# OUTER_FOLDS * N_REPEATS) keeps the averages correct when the
# cross-validation loop was interrupted or its constants were changed.
if not (allMAEs and allRMSEs and allR2s):
    raise ValueError("No cross-validation results to average; run the cross-validation cell first.")

expected_folds = OUTER_FOLDS * N_REPEATS
if len(allMAEs) != expected_folds:
    # Make a partial run visible rather than silently storing its average.
    print("Warning: averaging over " + str(len(allMAEs)) + " folds, expected " + str(expected_folds))

averageMAE = sum(allMAEs) / len(allMAEs)
averageRMSE = sum(allRMSEs) / len(allRMSEs)
averageR2 = sum(allR2s) / len(allR2s)


# Make sure to change this model number for each of the different models. It
# just dictates which row in the csv to put the data.
model_number = 8

# Read the shared metrics table, overwrite (or create) this model's row with
# its name and averaged metrics, and write the table back without the index.
full_metrics_df = pd.read_csv("All_Model_Metrics.csv")
full_metrics_df.loc[model_number] = ["Random Forest Regression Poor Mental Health Days", averageMAE, averageRMSE, averageR2]
full_metrics_df.to_csv('All_Model_Metrics.csv', index=False)

### Random Forest regression for predicting Suicides

In [None]:
# Nested cross-validation settings for the Suicides (Crude Rate) target.
# The inner loop uses the same 5-fold, single-repeat protocol as the Poor
# Mental Health Days section so both random forests are tuned the same way.
OUTER_FOLDS = 10
INNER_FOLDS = 5
N_REPEATS = 3

# One entry per outer fold is appended to each of these lists.
allMAEs = []
allRMSEs = []
allR2s = []

avgKFolds = RepeatedKFold(n_splits=OUTER_FOLDS, n_repeats=N_REPEATS, random_state=1)

# outer cross validation is to get the average metrics across random folds in
# order to select best model to use
for train_index, test_index in avgKFolds.split(X_train_full):
  X_train, X_val = X_train_full.iloc[train_index], X_train_full.iloc[test_index]
  Y_train, Y_val = Y_train_full.iloc[train_index], Y_train_full.iloc[test_index]

  # inner cross validation is for the best hyperparameter
  cv = RepeatedKFold(n_splits=INNER_FOLDS, n_repeats=1, random_state=1)
  # This section evaluates a random forest (its results are stored under a
  # "Random Forest Regression" label), so fit a RandomForestRegressor tuned by
  # grid search. n_estimators here is overridden by the grid.
  model = RandomForestRegressor(n_estimators = 1000, random_state = 42)

  # param_grid is the hyperparameter grid defined in the Poor Mental Health
  # Days cross-validation cell; that cell must have been run first.
  grid_search = GridSearchCV(estimator = model, param_grid = param_grid,
                          cv = cv, n_jobs = -1, verbose = 2)

  # col 1 is Suicides
  grid_search.fit(X_train, Y_train.iloc[:,1])
  print("best params = " + str(grid_search.best_params_))

  # Score the tuned model on the held-out outer fold.
  predictions = grid_search.predict(X_val)
  mae = mean_absolute_error(Y_val.iloc[:,1], predictions)
  print("mae = " + str(mae))
  rmse = math.sqrt(mean_squared_error(Y_val.iloc[:,1], predictions))
  print("rmse = " + str(rmse))
  r2 = r2_score(Y_val.iloc[:,1], predictions)
  print("r2 = " + str(r2))

  allMAEs.append(mae)
  allRMSEs.append(rmse)
  allR2s.append(r2)

alpha: 0.100000
mae = 4.450763466351441
rmse = 6.495439162319515
r2 = 0.4079988201558937
High school graduation raw value
4.714946984734789
Unemployment raw value
-20.784476023455053
Some college raw value
-11.673370580488026
Ratio of population to mental health providers
3.855412790697228e-05
Median household income raw value
-3.8783527079608133e-05
Average Temperature
-0.022249863914550725
January Average Temperature
0.05339579105115446
February Average Temperature
0.011719270802026128
March Average Temperature
0.13944324601807828
April Average Temperature
0.1656648550523951
May Average Temperature
-0.6825607437234825
June Average Temperature
0.4594548986589494
July Average Temperature
-0.19098852136382302
August Average Temperature
-0.23086851556399682
September Average Temperature
0.23786851188437141
October Average Temperature
-0.2724034516543528
November Average Temperautre
-0.005047808201115043
December Average Temperature
0.04732410683156239
Average Precipitation
0.002360874473

alpha: 0.100000
mae = 4.268790687193326
rmse = 5.612644522438138
r2 = 0.32247990048561226
High school graduation raw value
2.996084415884808
Unemployment raw value
-22.708940499099654
Some college raw value
-10.676344066521938
Ratio of population to mental health providers
3.3940430717668824e-05
Median household income raw value
-4.032820486179691e-05
Average Temperature
-0.020643798472399368
January Average Temperature
0.054621510458717076
February Average Temperature
0.01864566823805155
March Average Temperature
0.13149062203288664
April Average Temperature
0.16090595120919154
May Average Temperature
-0.6833066062270838
June Average Temperature
0.4328459379919004
July Average Temperature
-0.14068050305914015
August Average Temperature
-0.27947543058495894
September Average Temperature
0.2716340782750955
October Average Temperature
-0.2789341169388135
November Average Temperautre
0.04658443069667393
December Average Temperature
0.017942881839764488
Average Precipitation
-0.00254460383

alpha: 0.100000
mae = 4.320378860753404
rmse = 5.90981995696239
r2 = 0.4552749620160347
High school graduation raw value
3.348867487763883
Unemployment raw value
-22.140960328537027
Some college raw value
-10.853165004212057
Ratio of population to mental health providers
3.9471497209947214e-05
Median household income raw value
-4.074412582423438e-05
Average Temperature
-0.02031401765329862
January Average Temperature
0.05980530080696402
February Average Temperature
0.011833019843967733
March Average Temperature
0.137771781853628
April Average Temperature
0.11443990750145139
May Average Temperature
-0.6339716864936454
June Average Temperature
0.45469564253391503
July Average Temperature
-0.15615416051365544
August Average Temperature
-0.2642901762082386
September Average Temperature
0.22177306441677025
October Average Temperature
-0.2629315774403547
November Average Temperautre
0.04386548976187867
December Average Temperature
0.02939518141614642
Average Precipitation
-0.0016124748370059

alpha: 0.100000
mae = 4.323394528325162
rmse = 6.13749104786448
r2 = 0.4136627864645269
High school graduation raw value
4.10115542680242
Unemployment raw value
-23.661699901175492
Some college raw value
-10.401508767186078
Ratio of population to mental health providers
5.100817609459071e-05
Median household income raw value
-4.773564067527718e-05
Average Temperature
-0.02079076467337029
January Average Temperature
0.05151733288705288
February Average Temperature
0.008840227155896347
March Average Temperature
0.15372546413676644
April Average Temperature
0.11053831903906876
May Average Temperature
-0.6500401547881257
June Average Temperature
0.44699128936927474
July Average Temperature
-0.17079467415674976
August Average Temperature
-0.23413230739545673
September Average Temperature
0.224435379135331
October Average Temperature
-0.28501603874512
November Average Temperautre
0.06368321939270627
December Average Temperature
0.030762766311591423
Average Precipitation
0.0004391809569002565

alpha: 0.100000
mae = 4.471576113946571
rmse = 6.098392477050204
r2 = 0.4755842074325243
High school graduation raw value
4.498227811442094
Unemployment raw value
-20.839685972741183
Some college raw value
-11.140116075285531
Ratio of population to mental health providers
3.977608334318653e-05
Median household income raw value
-4.174301301943631e-05
Average Temperature
-0.020468858691682735
January Average Temperature
0.09432895934814158
February Average Temperature
-0.03723234629992497
March Average Temperature
0.17234240125677752
April Average Temperature
0.1053726670970027
May Average Temperature
-0.6531751340198343
June Average Temperature
0.49988304086939184
July Average Temperature
-0.17164132294135767
August Average Temperature
-0.25627587430415966
September Average Temperature
0.1773932122239903
October Average Temperature
-0.2519399471282686
November Average Temperautre
0.07517954898358371
December Average Temperature
0.00013848738693985498
Average Precipitation
0.005494455791

In [None]:
# Average each metric over the outer folds that actually produced a result.
# Dividing by the number of collected values (instead of the nominal
# OUTER_FOLDS * N_REPEATS) keeps the averages correct when the
# cross-validation loop was interrupted or its constants were changed.
if not (allMAEs and allRMSEs and allR2s):
    raise ValueError("No cross-validation results to average; run the cross-validation cell first.")

expected_folds = OUTER_FOLDS * N_REPEATS
if len(allMAEs) != expected_folds:
    # Make a partial run visible rather than silently storing its average.
    print("Warning: averaging over " + str(len(allMAEs)) + " folds, expected " + str(expected_folds))

averageMAE = sum(allMAEs) / len(allMAEs)
averageRMSE = sum(allRMSEs) / len(allRMSEs)
averageR2 = sum(allR2s) / len(allR2s)


# Make sure to change this model number for each of the different models. It
# just dictates which row in the csv to put the data.
model_number = 9

# Read the shared metrics table, overwrite (or create) this model's row with
# its name and averaged metrics, and write the table back without the index.
full_metrics_df = pd.read_csv("All_Model_Metrics.csv")
full_metrics_df.loc[model_number] = ["Random Forest Regression Suicides", averageMAE, averageRMSE, averageR2]
full_metrics_df.to_csv('All_Model_Metrics.csv', index=False)