In [1]:
import math
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns

!pip install sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score



In [2]:
# Training features: read from the working directory, then preview the first five rows.
X_train = pd.read_csv("X_train.csv")
X_train.head()

Unnamed: 0,High school graduation raw value,Unemployment raw value,Some college raw value,Ratio of population to mental health providers,Median household income raw value,Average Temperature,January Average Temperature,February Average Temperature,March Average Temperature,April Average Temperature,...,April Average Precipitation,May Average Precipitation,June Average Precipitation,July Average Precipitation,August Average Precipitation,September Average Precipitation,October Average Precipitation,November Average Precipitation,December Average Precipitation,RUCC
0,0.847,0.096,0.568,4885.0,60735.0,54.808333,26.6,36.3,44.2,55.8,...,11.01,5.75,6.53,2.99,2.16,5.3,3.1,5.95,4.9,1.0
1,0.78,0.066812,0.547429,954.495385,42945.0,74.058333,59.9,61.5,71.2,72.5,...,1.64,7.55,8.57,3.95,6.58,8.25,3.81,0.24,1.01,1.0
2,0.78,0.069824,0.453978,2573.647059,40994.0,60.825,38.5,42.9,55.2,59.1,...,2.69,3.51,2.22,3.8,5.73,0.96,0.54,1.99,3.87,6.0
3,0.866534,0.059,0.785187,4209.193548,81586.0,43.408333,16.3,17.3,24.8,39.0,...,5.51,5.31,5.99,3.51,1.34,1.62,3.07,0.53,1.26,1.0
4,0.806995,0.102,0.681151,4158.038961,43863.0,60.991667,46.3,44.2,46.8,61.4,...,4.63,3.1,10.77,8.4,5.02,1.72,1.12,2.85,4.85,2.0


In [3]:
# Training targets (one column per outcome): read, then preview the first five rows.
Y_train = pd.read_csv("Y_train.csv")
Y_train.head()

Unnamed: 0,Poor mental health days raw value,Crude Rate
0,4.1,16.6
1,4.2,11.0
2,3.9,24.9
3,2.3,8.7
4,4.0,16.8


In [4]:
# Held-out test features: read from the working directory, then preview the first five rows.
X_test = pd.read_csv("X_test.csv")
X_test.head()

Unnamed: 0,High school graduation raw value,Unemployment raw value,Some college raw value,Ratio of population to mental health providers,Median household income raw value,Average Temperature,January Average Temperature,February Average Temperature,March Average Temperature,April Average Temperature,...,April Average Precipitation,May Average Precipitation,June Average Precipitation,July Average Precipitation,August Average Precipitation,September Average Precipitation,October Average Precipitation,November Average Precipitation,December Average Precipitation,RUCC
0,0.736409,0.081,0.627313,4625.132653,48480.0,59.191667,46.0,42.8,45.3,59.2,...,6.22,6.33,8.7,12.72,7.33,3.39,1.66,4.82,7.86,2.0
1,0.854947,0.049,0.669904,1407.951923,64482.0,45.9,22.5,24.8,32.7,44.2,...,2.59,4.89,7.83,6.03,3.09,4.52,1.14,3.66,3.54,4.0
2,0.696891,0.09,0.48784,9505.125,43231.0,48.758333,30.3,36.6,42.3,45.9,...,2.16,1.9,2.07,0.21,0.47,3.09,0.87,1.7,2.0,4.0
3,0.83,0.08,0.609116,4711.34375,48733.0,50.233333,29.7,27.4,33.6,48.5,...,3.95,1.83,7.35,5.63,2.66,2.29,4.28,2.48,3.83,1.0
4,0.801523,0.071,0.619784,3613.0,41755.0,55.433333,35.8,37.4,40.3,54.5,...,4.59,3.68,6.63,2.91,3.95,0.76,4.63,2.85,5.56,3.0


In [5]:
# Held-out test targets (one column per outcome): read, then preview the first five rows.
Y_test = pd.read_csv("Y_test.csv")
Y_test.head()

Unnamed: 0,Poor mental health days raw value,Crude Rate
0,3.2,18.0
1,3.0,14.3
2,3.5,18.2
3,3.2,18.2
4,4.6,12.9


### Random Forest regression for predicting Poor Mental Health Days

In [6]:
# Grid-search a random forest for the poor-mental-health-days target, then
# score the best model on the held-out test set.
# (`time` is imported with the other modules at the top of the notebook.)

K_FOLDS = 10
N_REPEATS = 3

# Target column, selected by name so the cell does not depend on column order.
PMHD_TARGET = "Poor mental health days raw value"

# Create the parameter grid based on the results of random search.
# 1 * 4 * 2 * 3 * 3 * 4 = 288 candidates, each fitted K_FOLDS * N_REPEATS = 30
# times, i.e. 8640 forest fits in total -- expect a long run even with n_jobs=-1.
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}

# Repeated k-fold cross validation scores every candidate; the fixed seeds keep
# the fold assignment and the forests reproducible between runs.
cv = RepeatedKFold(n_splits=K_FOLDS, n_repeats=N_REPEATS, random_state=1)
# n_estimators is supplied by param_grid, so it is not set on the base estimator.
model = RandomForestRegressor(random_state=42)

# verbose=1 keeps the progress log short for a search of this size.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid,
                           cv=cv, n_jobs=-1, verbose=1)

tic = time.perf_counter()
grid_search.fit(X_train, Y_train[PMHD_TARGET])
toc = time.perf_counter()
print(f"Grid search completed in {toc - tic:0.4f} seconds")
print(f"best params = {grid_search.best_params_}")

# With the default refit=True, predict() uses the best candidate refitted on
# the whole training set.
predictions = grid_search.predict(X_test)
y_test_pmhd = Y_test[PMHD_TARGET]

mae = mean_absolute_error(y_test_pmhd, predictions)
print(f"mae = {mae}")
rmse = math.sqrt(mean_squared_error(y_test_pmhd, predictions))
print(f"rmse = {rmse}")
r2 = r2_score(y_test_pmhd, predictions)
print(f"r2 = {r2}")

Fitting 30 folds for each of 288 candidates, totalling 8640 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    4.6s


KeyboardInterrupt: 

### Gradient Boosting Regression for predicting Suicide Rate (Crude Rate)

In [8]:
# Fit a gradient boosting regressor with default hyperparameters on the
# suicide crude-rate target and score it on the held-out test set.

K_FOLDS = 10
N_REPEATS = 3

# Target column, selected by name so the cell does not depend on column order.
SUICIDE_TARGET = "Crude Rate"

# NOTE: this splitter is defined but not used below -- the model is fitted once
# on the full training set and no cross-validated hyperparameter search is run.
cv = RepeatedKFold(n_splits=K_FOLDS, n_repeats=N_REPEATS, random_state=1)

# Fixed seed: features are randomly permuted at each split, so without it tied
# splits can be resolved differently and the metrics can change between runs.
model = GradientBoostingRegressor(random_state=42)
model.fit(X_train, Y_train[SUICIDE_TARGET])

predictions = model.predict(X_test)
y_test_suicide = Y_test[SUICIDE_TARGET]

mae = mean_absolute_error(y_test_suicide, predictions)
print(f"mae = {mae}")
rmse = math.sqrt(mean_squared_error(y_test_suicide, predictions))
print(f"rmse = {rmse}")
r2 = r2_score(y_test_suicide, predictions)
print(f"r2 = {r2}")

mae = 3.823246472772097
rmse = 5.220407703039061
r2 = 0.44950120818531136
