## Ridge Regression

### Setup

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import scipy.stats as stats
import math

!pip install sklearn
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

You should consider upgrading via the '/homes/iws/bhimar/cse481ds-mental-health/venv/bin/python3 -m pip install --upgrade pip' command.[0m


In [2]:
X_train_full = pd.read_csv("X_train.csv")
X_train_full.head(5)

Unnamed: 0,High school graduation raw value,Unemployment raw value,Some college raw value,Ratio of population to mental health providers,Median household income raw value,Average Temperature,January Average Temperature,February Average Temperature,March Average Temperature,April Average Temperature,...,April Average Precipitation,May Average Precipitation,June Average Precipitation,July Average Precipitation,August Average Precipitation,September Average Precipitation,October Average Precipitation,November Average Precipitation,December Average Precipitation,RUCC
0,0.847,0.096,0.568,4885.0,60735.0,54.808333,26.6,36.3,44.2,55.8,...,11.01,5.75,6.53,2.99,2.16,5.3,3.1,5.95,4.9,1.0
1,0.78,0.066812,0.547429,954.495385,42945.0,74.058333,59.9,61.5,71.2,72.5,...,1.64,7.55,8.57,3.95,6.58,8.25,3.81,0.24,1.01,1.0
2,0.78,0.069824,0.453978,2573.647059,40994.0,60.825,38.5,42.9,55.2,59.1,...,2.69,3.51,2.22,3.8,5.73,0.96,0.54,1.99,3.87,6.0
3,0.866534,0.059,0.785187,4209.193548,81586.0,43.408333,16.3,17.3,24.8,39.0,...,5.51,5.31,5.99,3.51,1.34,1.62,3.07,0.53,1.26,1.0
4,0.806995,0.102,0.681151,4158.038961,43863.0,60.991667,46.3,44.2,46.8,61.4,...,4.63,3.1,10.77,8.4,5.02,1.72,1.12,2.85,4.85,2.0


In [3]:
Y_train_full = pd.read_csv("Y_train.csv")
Y_train_full.head(5)

Unnamed: 0,Poor mental health days raw value,Crude Rate
0,4.1,16.6
1,4.2,11.0
2,3.9,24.9
3,2.3,8.7
4,4.0,16.8


### Ridge regression for predicting Poor Mental Health Days

In [4]:
OUTER_FOLDS = 10
INNER_FOLDS = 10
N_REPEATS = 3

allMAEs = []
allRMSEs = []
allR2s = []

avgKFolds = RepeatedKFold(n_splits=OUTER_FOLDS, n_repeats=N_REPEATS, random_state=1)
counter = 0
# outer cross validation is to get the average metrics across random folds in
# order to select best model to use
for train_index, test_index in avgKFolds.split(X_train_full):
  X_train, X_val = X_train_full.iloc[train_index], X_train_full.iloc[test_index]
  Y_train, Y_val = Y_train_full.iloc[train_index], Y_train_full.iloc[test_index]
  
  # inner cross validation is for the best hyperparameter
  cv = RepeatedKFold(n_splits=INNER_FOLDS, n_repeats=3, random_state=1)
  model = RidgeCV(cv=cv)
    
  # col 0 is Poor Mental Health Days
  model.fit(X_train, Y_train.iloc[:,0]) 
  print('alpha: %f' % model.alpha_)

  predictions = model.predict(X_val)
  mae = mean_absolute_error(Y_val.iloc[:,0], predictions)
  print("mae = " + str(mae))
  rmse = math.sqrt(mean_squared_error(Y_val.iloc[:,0], predictions))
  print("rmse = " + str(rmse))
  r2 = r2_score(Y_val.iloc[:,0], predictions)
  print("r2 = " + str(r2))

  coeffs = model.coef_
  important_variables = [i for i, e in enumerate(coeffs) if e != 0]
  for i in important_variables:
      print(X_train.columns[i])
      print(coeffs[i])

  allMAEs.append(mae)
  allRMSEs.append(rmse)
  allR2s.append(r2)
  counter+=1
    
print(counter)



  

alpha: 0.100000
mae = 0.3917975120656431
rmse = 0.5393707546023503
r2 = 0.31751082699536637
High school graduation raw value
0.3035649634632213
Unemployment raw value
0.8997824547219453
Some college raw value
-1.75880147025348
Ratio of population to mental health providers
1.3199219108217311e-06
Median household income raw value
-1.6966496108871323e-05
Average Temperature
0.00014837538357584615
January Average Temperature
-0.014984061182870939
February Average Temperature
7.303921517616728e-05
March Average Temperature
0.014264601693184789
April Average Temperature
-0.01767227204183472
May Average Temperature
-0.017581896820599508
June Average Temperature
0.011784797948623832
July Average Temperature
-0.01140762973813638
August Average Temperature
-0.004480706330744462
September Average Temperature
0.012487293590886054
October Average Temperature
0.016370610769243343
November Average Temperautre
0.004079559367858097
December Average Temperature
0.008847168238170552
Average Precipitatio

alpha: 0.100000
mae = 0.3850066458063984
rmse = 0.5030545679908981
r2 = 0.3919168959204924
High school graduation raw value
0.3631716656100526
Unemployment raw value
0.8031886868558981
Some college raw value
-1.7240704041895485
Ratio of population to mental health providers
1.4140303124583916e-06
Median household income raw value
-1.7065907924811342e-05
Average Temperature
0.0001982059529935603
January Average Temperature
-0.013338697048556378
February Average Temperature
0.0002075623130766747
March Average Temperature
0.012593607262223088
April Average Temperature
-0.01724826922409103
May Average Temperature
-0.015003507804929163
June Average Temperature
0.010761885483855276
July Average Temperature
-0.010418565721914307
August Average Temperature
-0.00752099438502761
September Average Temperature
0.014258551653285741
October Average Temperature
0.01429815355459252
November Average Temperautre
0.0056952218569981125
December Average Temperature
0.008093523224684073
Average Precipitatio

alpha: 0.100000
mae = 0.38587850044021055
rmse = 0.5100435265756359
r2 = 0.3910893766648551
High school graduation raw value
0.32005537789548766
Unemployment raw value
1.1721449333364145
Some college raw value
-1.6977664493538989
Ratio of population to mental health providers
1.2799627711801675e-06
Median household income raw value
-1.7285083142341307e-05
Average Temperature
8.044974557670831e-05
January Average Temperature
-0.013660675731562063
February Average Temperature
-0.0009402420700205601
March Average Temperature
0.012242257708080583
April Average Temperature
-0.011469550232472994
May Average Temperature
-0.015650590573433723
June Average Temperature
0.011638244469197003
July Average Temperature
-0.0124171980168732
August Average Temperature
-0.0044767290321839626
September Average Temperature
0.009268254915170063
October Average Temperature
0.015232196218711651
November Average Temperautre
0.0042593577109493585
December Average Temperature
0.00694007190243997
Average Precipit

alpha: 0.100000
mae = 0.3754126352792847
rmse = 0.5037761352160092
r2 = 0.3789917982651022
High school graduation raw value
0.37390066652013126
Unemployment raw value
1.1854326144023861
Some college raw value
-1.7572368296281922
Ratio of population to mental health providers
1.6237184745592676e-06
Median household income raw value
-1.705508910987576e-05
Average Temperature
0.00011997128899401449
January Average Temperature
-0.017405432953628477
February Average Temperature
0.0029471433529078233
March Average Temperature
0.01267875358595469
April Average Temperature
-0.015799328288996613
May Average Temperature
-0.015514060996000181
June Average Temperature
0.013770483701001337
July Average Temperature
-0.011690170886731983
August Average Temperature
-0.00849359338546461
September Average Temperature
0.015584877717073842
October Average Temperature
0.0132692662892102
November Average Temperautre
0.0035833597094910375
December Average Temperature
0.008508357914438517
Average Precipitatio

alpha: 1.000000
mae = 0.3976869867561441
rmse = 0.518392111755443
r2 = 0.35218298232521517
High school graduation raw value
0.271304885234028
Unemployment raw value
0.6466209456332193
Some college raw value
-1.672674707396699
Ratio of population to mental health providers
1.1457043294987887e-06
Median household income raw value
-1.742586606625554e-05
Average Temperature
8.671531498000444e-05
January Average Temperature
-0.01634083804460826
February Average Temperature
0.002970836819780497
March Average Temperature
0.012247800483668234
April Average Temperature
-0.015235841255813645
May Average Temperature
-0.014315215971963571
June Average Temperature
0.011550285614650905
July Average Temperature
-0.011368954353553958
August Average Temperature
-0.007080006086844199
September Average Temperature
0.01315830498257901
October Average Temperature
0.01170433829331059
November Average Temperautre
0.004900907703072536
December Average Temperature
0.00884896561938481
Average Precipitation
0.00

alpha: 0.100000
mae = 0.39991993007
rmse = 0.5319638402471205
r2 = 0.3608113213578219
High school graduation raw value
0.35861886590369374
Unemployment raw value
0.9644242386022202
Some college raw value
-1.7573841985010623
Ratio of population to mental health providers
1.987012379334997e-06
Median household income raw value
-1.7124945274530137e-05
Average Temperature
8.680960089918626e-05
January Average Temperature
-0.014032996954567488
February Average Temperature
-0.0009120403432869933
March Average Temperature
0.015085103146636601
April Average Temperature
-0.015403134306485716
May Average Temperature
-0.015761993077965417
June Average Temperature
0.012596925483305307
July Average Temperature
-0.01541047137897975
August Average Temperature
-0.0010361343169745774
September Average Temperature
0.00909642838632278
October Average Temperature
0.014134645023311168
November Average Temperautre
0.00504744775276035
December Average Temperature
0.0076379358093420485
Average Precipitation
0

In [5]:
averageMAE = sum(allMAEs)/(OUTER_FOLDS * N_REPEATS)
averageRMSE = sum(allRMSEs)/(OUTER_FOLDS * N_REPEATS)
averageR2 = sum(allR2s)/(OUTER_FOLDS * N_REPEATS)


# Make sure to change this model number for each of the different models. It
# just dictates which row in the csv to put the data. 
model_number = 2

full_metrics_df = pd.read_csv("All_Model_Metrics.csv")
full_metrics_df.loc[model_number] = ["Ridge Regression Poor Mental Health Days", averageMAE, averageRMSE, averageR2]
full_metrics_df.to_csv('All_Model_Metrics.csv', index=False)

### Ridge regression for predicting Suicides

In [6]:
OUTER_FOLDS = 10
INNER_FOLDS = 10
N_REPEATS = 3

allMAEs = []
allRMSEs = []
allR2s = []

avgKFolds = RepeatedKFold(n_splits=OUTER_FOLDS, n_repeats=N_REPEATS, random_state=1)

# outer cross validation is to get the average metrics across random folds in
# order to select best model to use
for train_index, test_index in avgKFolds.split(X_train_full):
  X_train, X_val = X_train_full.iloc[train_index], X_train_full.iloc[test_index]
  Y_train, Y_val = Y_train_full.iloc[train_index], Y_train_full.iloc[test_index]

  # inner cross validation is for the best hyperparameter
  cv = RepeatedKFold(n_splits=INNER_FOLDS, n_repeats=3, random_state=1)
  model = RidgeCV(cv=cv)
  # col 1 is Suicides
  model.fit(X_train, Y_train.iloc[:,1])
  print('alpha: %f' % model.alpha_)

  predictions = model.predict(X_val)
  mae = mean_absolute_error(Y_val.iloc[:,1], predictions)
  print("mae = " + str(mae))
  rmse = math.sqrt(mean_squared_error(Y_val.iloc[:,1], predictions))
  print("rmse = " + str(rmse))
  r2 = r2_score(Y_val.iloc[:,1], predictions)
  print("r2 = " + str(r2))

  coeffs = model.coef_
  important_variables = [i for i, e in enumerate(coeffs) if e != 0]
  for i in important_variables:
      print(X_train.columns[i])
      print(coeffs[i])

  allMAEs.append(mae)
  allRMSEs.append(rmse)
  allR2s.append(r2)

alpha: 0.100000
mae = 4.450763466351441
rmse = 6.495439162319515
r2 = 0.4079988201558937
High school graduation raw value
4.714946984734789
Unemployment raw value
-20.784476023455053
Some college raw value
-11.673370580488026
Ratio of population to mental health providers
3.855412790697228e-05
Median household income raw value
-3.8783527079608133e-05
Average Temperature
-0.022249863914550725
January Average Temperature
0.05339579105115446
February Average Temperature
0.011719270802026128
March Average Temperature
0.13944324601807828
April Average Temperature
0.1656648550523951
May Average Temperature
-0.6825607437234825
June Average Temperature
0.4594548986589494
July Average Temperature
-0.19098852136382302
August Average Temperature
-0.23086851556399682
September Average Temperature
0.23786851188437141
October Average Temperature
-0.2724034516543528
November Average Temperautre
-0.005047808201115043
December Average Temperature
0.04732410683156239
Average Precipitation
0.002360874473

alpha: 0.100000
mae = 4.268790687193326
rmse = 5.612644522438138
r2 = 0.32247990048561226
High school graduation raw value
2.996084415884808
Unemployment raw value
-22.708940499099654
Some college raw value
-10.676344066521938
Ratio of population to mental health providers
3.3940430717668824e-05
Median household income raw value
-4.032820486179691e-05
Average Temperature
-0.020643798472399368
January Average Temperature
0.054621510458717076
February Average Temperature
0.01864566823805155
March Average Temperature
0.13149062203288664
April Average Temperature
0.16090595120919154
May Average Temperature
-0.6833066062270838
June Average Temperature
0.4328459379919004
July Average Temperature
-0.14068050305914015
August Average Temperature
-0.27947543058495894
September Average Temperature
0.2716340782750955
October Average Temperature
-0.2789341169388135
November Average Temperautre
0.04658443069667393
December Average Temperature
0.017942881839764488
Average Precipitation
-0.00254460383

alpha: 0.100000
mae = 4.320378860753404
rmse = 5.90981995696239
r2 = 0.4552749620160347
High school graduation raw value
3.348867487763883
Unemployment raw value
-22.140960328537027
Some college raw value
-10.853165004212057
Ratio of population to mental health providers
3.9471497209947214e-05
Median household income raw value
-4.074412582423438e-05
Average Temperature
-0.02031401765329862
January Average Temperature
0.05980530080696402
February Average Temperature
0.011833019843967733
March Average Temperature
0.137771781853628
April Average Temperature
0.11443990750145139
May Average Temperature
-0.6339716864936454
June Average Temperature
0.45469564253391503
July Average Temperature
-0.15615416051365544
August Average Temperature
-0.2642901762082386
September Average Temperature
0.22177306441677025
October Average Temperature
-0.2629315774403547
November Average Temperautre
0.04386548976187867
December Average Temperature
0.02939518141614642
Average Precipitation
-0.0016124748370059

alpha: 0.100000
mae = 4.323394528325162
rmse = 6.13749104786448
r2 = 0.4136627864645269
High school graduation raw value
4.10115542680242
Unemployment raw value
-23.661699901175492
Some college raw value
-10.401508767186078
Ratio of population to mental health providers
5.100817609459071e-05
Median household income raw value
-4.773564067527718e-05
Average Temperature
-0.02079076467337029
January Average Temperature
0.05151733288705288
February Average Temperature
0.008840227155896347
March Average Temperature
0.15372546413676644
April Average Temperature
0.11053831903906876
May Average Temperature
-0.6500401547881257
June Average Temperature
0.44699128936927474
July Average Temperature
-0.17079467415674976
August Average Temperature
-0.23413230739545673
September Average Temperature
0.224435379135331
October Average Temperature
-0.28501603874512
November Average Temperautre
0.06368321939270627
December Average Temperature
0.030762766311591423
Average Precipitation
0.0004391809569002565

alpha: 0.100000
mae = 4.471576113946571
rmse = 6.098392477050204
r2 = 0.4755842074325243
High school graduation raw value
4.498227811442094
Unemployment raw value
-20.839685972741183
Some college raw value
-11.140116075285531
Ratio of population to mental health providers
3.977608334318653e-05
Median household income raw value
-4.174301301943631e-05
Average Temperature
-0.020468858691682735
January Average Temperature
0.09432895934814158
February Average Temperature
-0.03723234629992497
March Average Temperature
0.17234240125677752
April Average Temperature
0.1053726670970027
May Average Temperature
-0.6531751340198343
June Average Temperature
0.49988304086939184
July Average Temperature
-0.17164132294135767
August Average Temperature
-0.25627587430415966
September Average Temperature
0.1773932122239903
October Average Temperature
-0.2519399471282686
November Average Temperautre
0.07517954898358371
December Average Temperature
0.00013848738693985498
Average Precipitation
0.005494455791

In [7]:
averageMAE = sum(allMAEs)/(OUTER_FOLDS * N_REPEATS)
averageRMSE = sum(allRMSEs)/(OUTER_FOLDS * N_REPEATS)
averageR2 = sum(allR2s)/(OUTER_FOLDS * N_REPEATS)


# Make sure to change this model number for each of the different models. It
# just dictates which row in the csv to put the data. 
model_number = 3

full_metrics_df = pd.read_csv("All_Model_Metrics.csv")
full_metrics_df.loc[model_number] = ["Ridge Regression Suicides", averageMAE, averageRMSE, averageR2]
full_metrics_df.to_csv('All_Model_Metrics.csv', index=False)