## Ridge Regression

### Setup

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import scipy.stats as stats
import math

!pip install sklearn
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score



In [2]:
# Read the training feature table from disk and preview its first five rows.
X_train_full = pd.read_csv(filepath_or_buffer="X_train.csv")
X_train_full.head(n=5)

Unnamed: 0,High school graduation raw value,Unemployment raw value,Some college raw value,Ratio of population to mental health providers,Median household income raw value,Average Temperature,January Average Temperature,February Average Temperature,March Average Temperature,April Average Temperature,May Average Temperature,June Average Temperature,July Average Temperature,August Average Temperature,September Average Temperature,October Average Temperature,November Average Temperautre,December Average Temperature,Average Precipitation,January Average Precipitation,February Average Precipitation,March Average Precipitation,April Average Precipitation,May Average Precipitation,June Average Precipitation,July Average Precipitation,August Average Precipitation,September Average Precipitation,October Average Precipitation,November Average Precipitation,December Average Precipitation,RUCC
0,0.847,0.096,0.568,4885.0,60735.0,54.808333,26.6,36.3,44.2,55.8,62.7,72.3,79.2,74.2,65.1,53.8,48.4,39.1,4.859167,1.18,5.42,4.02,11.01,5.75,6.53,2.99,2.16,5.3,3.1,5.95,4.9,1.0
1,0.78,0.066812,0.547429,954.495385,42945.0,74.058333,59.9,61.5,71.2,72.5,77.2,82.4,84.6,83.6,81.9,76.1,68.5,69.3,4.5725,7.38,2.43,3.46,1.64,7.55,8.57,3.95,6.58,8.25,3.81,0.24,1.01,1.0
2,0.78,0.069824,0.453978,2573.647059,40994.0,60.825,38.5,42.9,55.2,59.1,65.9,75.7,79.2,77.6,74.3,63.2,53.7,44.6,3.104167,5.07,5.22,1.65,2.69,3.51,2.22,3.8,5.73,0.96,0.54,1.99,3.87,6.0
3,0.866534,0.059,0.785187,4209.193548,81586.0,43.408333,16.3,17.3,24.8,39.0,56.3,67.0,72.4,71.4,65.3,48.0,31.4,11.7,2.673333,0.83,1.32,1.79,5.51,5.31,5.99,3.51,1.34,1.62,3.07,0.53,1.26,1.0
4,0.806995,0.102,0.681151,4158.038961,43863.0,60.991667,46.3,44.2,46.8,61.4,67.8,76.6,79.3,77.1,71.8,62.5,50.0,48.1,4.201667,1.91,4.31,1.74,4.63,3.1,10.77,8.4,5.02,1.72,1.12,2.85,4.85,2.0


In [3]:
# Read the training target table from disk and preview its first five rows.
Y_train_full = pd.read_csv(filepath_or_buffer="Y_train.csv")
Y_train_full.head(n=5)

Unnamed: 0,Poor mental health days raw value,Crude Rate
0,4.1,16.6
1,4.2,11.0
2,3.9,24.9
3,2.3,8.7
4,4.0,16.8


### Ridge regression for predicting Poor Mental Health Days

In [4]:
# Nested cross-validation settings: OUTER_FOLDS splits are used for scoring,
# INNER_FOLDS splits are used by RidgeCV to choose alpha.
OUTER_FOLDS = 10
INNER_FOLDS = 10

# Validation metrics; one value is appended per outer split.
allMAEs, allRMSEs, allR2s = [], [], []

avgKFolds = RepeatedKFold(n_splits=OUTER_FOLDS, n_repeats=3, random_state=1)

# Outer cross validation: collect metrics across random folds so this model
# can be compared against the others.
for fit_rows, holdout_rows in avgKFolds.split(X_train_full):
    X_train = X_train_full.iloc[fit_rows]
    X_val = X_train_full.iloc[holdout_rows]
    Y_train = Y_train_full.iloc[fit_rows]
    Y_val = Y_train_full.iloc[holdout_rows]

    # Positional column 0 of the target table is Poor Mental Health Days.
    y_fit = Y_train.iloc[:, 0]
    y_holdout = Y_val.iloc[:, 0]

    # Inner cross validation: RidgeCV picks the best alpha on the training rows only.
    inner_cv = RepeatedKFold(n_splits=INNER_FOLDS, n_repeats=3, random_state=1)
    model = RidgeCV(cv=inner_cv)
    model.fit(X_train, y_fit)
    print('alpha: %f' % model.alpha_)

    # Score the fitted model on the held-out rows of this outer split.
    predictions = model.predict(X_val)
    mae = mean_absolute_error(y_holdout, predictions)
    print("mae = " + str(mae))
    rmse = math.sqrt(mean_squared_error(y_holdout, predictions))
    print("rmse = " + str(rmse))
    r2 = r2_score(y_holdout, predictions)
    print("r2 = " + str(r2))

    # Report every feature whose fitted coefficient is non-zero.
    coeffs = model.coef_
    for feature_name, weight in zip(X_train.columns, coeffs):
        if weight != 0:
            print(feature_name)
            print(weight)

    allMAEs.append(mae)
    allRMSEs.append(rmse)
    allR2s.append(r2)



  

alpha: 0.100000
mae = 0.3917975120656432
rmse = 0.5393707546023503
r2 = 0.31751082699536637
High school graduation raw value
0.3035649634632212
Unemployment raw value
0.8997824547219458
Some college raw value
-1.7588014702534795
Ratio of population to mental health providers
1.3199219108217352e-06
Median household income raw value
-1.6966496108871327e-05
Average Temperature
0.00014837537581248626
January Average Temperature
-0.014984061182223925
February Average Temperature
7.303921582305844e-05
March Average Temperature
0.014264601693831756
April Average Temperature
-0.017672272041187893
May Average Temperature
-0.01758189681995252
June Average Temperature
0.011784797949270858
July Average Temperature
-0.011407629737489423
August Average Temperature
-0.004480706330097518
September Average Temperature
0.01248729359153286
October Average Temperature
0.016370610769890333
November Average Temperautre
0.004079559368505149
December Average Temperature
0.008847168238817463
Average Precipitat

In [5]:
# Average each metric over every outer split that was actually evaluated.
# RepeatedKFold with n_splits=OUTER_FOLDS and n_repeats=3 produces
# OUTER_FOLDS * 3 splits, so dividing by OUTER_FOLDS alone would report
# averages three times too large. Dividing by the list length is always right.
averageMAE = sum(allMAEs) / len(allMAEs)
averageRMSE = sum(allRMSEs) / len(allRMSEs)
averageR2 = sum(allR2s) / len(allR2s)


# Make sure to change this model number for each of the different models. It
# just dictates which row in the csv to put the data.
model_number = 2

# Overwrite (or add) this model's row in the shared metrics file and save it back.
full_metrics_df = pd.read_csv("All_Model_Metrics.csv")
full_metrics_df.loc[model_number] = ["Ridge Regression Poor Mental Health Days", averageMAE, averageRMSE, averageR2]
full_metrics_df.to_csv('All_Model_Metrics.csv', index=False)

### Ridge regression for predicting Suicides

In [6]:
# Nested cross-validation settings: OUTER_FOLDS splits are used for scoring,
# INNER_FOLDS splits are used by RidgeCV to choose alpha.
OUTER_FOLDS = 10
INNER_FOLDS = 10

# Validation metrics; one value is appended per outer split.
allMAEs, allRMSEs, allR2s = [], [], []

avgKFolds = RepeatedKFold(n_splits=OUTER_FOLDS, n_repeats=3, random_state=1)

# Outer cross validation: collect metrics across random folds so this model
# can be compared against the others.
for fit_rows, holdout_rows in avgKFolds.split(X_train_full):
    X_train = X_train_full.iloc[fit_rows]
    X_val = X_train_full.iloc[holdout_rows]
    Y_train = Y_train_full.iloc[fit_rows]
    Y_val = Y_train_full.iloc[holdout_rows]

    # Positional column 1 of the target table is Suicides.
    y_fit = Y_train.iloc[:, 1]
    y_holdout = Y_val.iloc[:, 1]

    # Inner cross validation: RidgeCV picks the best alpha on the training rows only.
    inner_cv = RepeatedKFold(n_splits=INNER_FOLDS, n_repeats=3, random_state=1)
    model = RidgeCV(cv=inner_cv)
    model.fit(X_train, y_fit)
    print('alpha: %f' % model.alpha_)

    # Score the fitted model on the held-out rows of this outer split.
    predictions = model.predict(X_val)
    mae = mean_absolute_error(y_holdout, predictions)
    print("mae = " + str(mae))
    rmse = math.sqrt(mean_squared_error(y_holdout, predictions))
    print("rmse = " + str(rmse))
    r2 = r2_score(y_holdout, predictions)
    print("r2 = " + str(r2))

    # Report every feature whose fitted coefficient is non-zero.
    coeffs = model.coef_
    for feature_name, weight in zip(X_train.columns, coeffs):
        if weight != 0:
            print(feature_name)
            print(weight)

    allMAEs.append(mae)
    allRMSEs.append(rmse)
    allR2s.append(r2)

alpha: 0.100000
mae = 4.450763466351441
rmse = 6.495439162319515
r2 = 0.4079988201558937
High school graduation raw value
4.714946984734779
Unemployment raw value
-20.784476023455078
Some college raw value
-11.673370580488017
Ratio of population to mental health providers
3.855412790697232e-05
Median household income raw value
-3.87835270796081e-05
Average Temperature
-0.022249864071417984
January Average Temperature
0.05339579106422671
February Average Temperature
0.011719270815098319
March Average Temperature
0.13944324603115008
April Average Temperature
0.1656648550654662
May Average Temperature
-0.6825607437104079
June Average Temperature
0.45945489867202144
July Average Temperature
-0.19098852135075153
August Average Temperature
-0.23086851555092344
September Average Temperature
0.237868511897441
October Average Temperature
-0.27240345164127844
November Average Temperautre
-0.005047808188042604
December Average Temperature
0.047324106844634735
Average Precipitation
0.0023608744762

In [7]:
# Average each metric over every outer split that was actually evaluated.
# RepeatedKFold with n_splits=OUTER_FOLDS and n_repeats=3 produces
# OUTER_FOLDS * 3 splits, so dividing by OUTER_FOLDS alone would report
# averages three times too large. Dividing by the list length is always right.
averageMAE = sum(allMAEs) / len(allMAEs)
averageRMSE = sum(allRMSEs) / len(allRMSEs)
averageR2 = sum(allR2s) / len(allR2s)


# Make sure to change this model number for each of the different models. It
# just dictates which row in the csv to put the data.
model_number = 3

# Overwrite (or add) this model's row in the shared metrics file and save it back.
full_metrics_df = pd.read_csv("All_Model_Metrics.csv")
full_metrics_df.loc[model_number] = ["Ridge Regression Suicides", averageMAE, averageRMSE, averageR2]
full_metrics_df.to_csv('All_Model_Metrics.csv', index=False)