## Lasso Regression

### Setup

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import scipy.stats as stats
import math

!pip install sklearn
from sklearn.linear_model import LassoCV
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

You should consider upgrading via the '/homes/iws/bhimar/cse481ds-mental-health/venv/bin/python3 -m pip install --upgrade pip' command.[0m


In [2]:
# Read the training features from disk and preview the first five rows
# (five is the default row count for head()).
X_train_full = pd.read_csv("X_train.csv")
X_train_full.head()

Unnamed: 0,High school graduation raw value,Unemployment raw value,Some college raw value,Ratio of population to mental health providers,Median household income raw value,Average Temperature,January Average Temperature,February Average Temperature,March Average Temperature,April Average Temperature,...,April Average Precipitation,May Average Precipitation,June Average Precipitation,July Average Precipitation,August Average Precipitation,September Average Precipitation,October Average Precipitation,November Average Precipitation,December Average Precipitation,RUCC
0,0.847,0.096,0.568,4885.0,60735.0,54.808333,26.6,36.3,44.2,55.8,...,11.01,5.75,6.53,2.99,2.16,5.3,3.1,5.95,4.9,1.0
1,0.78,0.066812,0.547429,954.495385,42945.0,74.058333,59.9,61.5,71.2,72.5,...,1.64,7.55,8.57,3.95,6.58,8.25,3.81,0.24,1.01,1.0
2,0.78,0.069824,0.453978,2573.647059,40994.0,60.825,38.5,42.9,55.2,59.1,...,2.69,3.51,2.22,3.8,5.73,0.96,0.54,1.99,3.87,6.0
3,0.866534,0.059,0.785187,4209.193548,81586.0,43.408333,16.3,17.3,24.8,39.0,...,5.51,5.31,5.99,3.51,1.34,1.62,3.07,0.53,1.26,1.0
4,0.806995,0.102,0.681151,4158.038961,43863.0,60.991667,46.3,44.2,46.8,61.4,...,4.63,3.1,10.77,8.4,5.02,1.72,1.12,2.85,4.85,2.0


In [3]:
# Read the training targets from disk and preview the first five rows
# (five is the default row count for head()).
Y_train_full = pd.read_csv("Y_train.csv")
Y_train_full.head()

Unnamed: 0,Poor mental health days raw value,Crude Rate
0,4.1,16.6
1,4.2,11.0
2,3.9,24.9
3,2.3,8.7
4,4.0,16.8


### Lasso regression for predicting Poor Mental Health Days

In [5]:
# Nested cross-validation for the Poor Mental Health Days target.
# The outer splitter produces held-out folds used to score the model; the inner
# splitter is handed to LassoCV so it can choose alpha on the training part only.
OUTER_FOLDS = 10
INNER_FOLDS = 10

# One entry is appended to each list per outer split.
allMAEs = []
allRMSEs = []
allR2s = []

avgKFolds = RepeatedKFold(n_splits=OUTER_FOLDS, n_repeats=3, random_state=1)

for train_index, test_index in avgKFolds.split(X_train_full):
    X_train = X_train_full.iloc[train_index]
    X_val = X_train_full.iloc[test_index]
    Y_train = Y_train_full.iloc[train_index]
    Y_val = Y_train_full.iloc[test_index]

    # Column 0 of the target frame is Poor Mental Health Days.
    y_fit = Y_train.iloc[:, 0]
    y_true = Y_val.iloc[:, 0]

    # LassoCV searches for alpha using the inner repeated k-fold splitter.
    # NOTE(review): the features are passed in as loaded, with no scaling step in
    # this cell -- confirm whether standardisation before the Lasso was intended.
    cv = RepeatedKFold(n_splits=INNER_FOLDS, n_repeats=3, random_state=1)
    model = LassoCV(cv=cv, n_jobs=-1)
    model.fit(X_train, y_fit)
    print('alpha: %f' % model.alpha_)

    # Score the fitted model on the held-out outer fold.
    predictions = model.predict(X_val)
    mae = mean_absolute_error(y_true, predictions)
    print("mae = " + str(mae))
    rmse = math.sqrt(mean_squared_error(y_true, predictions))
    print("rmse = " + str(rmse))
    r2 = r2_score(y_true, predictions)
    print("r2 = " + str(r2))

    # Report every feature whose coefficient was not shrunk to exactly zero.
    coeffs = model.coef_
    important_variables = np.flatnonzero(coeffs).tolist()
    for idx in important_variables:
        print(X_train.columns[idx])
        print(coeffs[idx])

    allMAEs.append(mae)
    allRMSEs.append(rmse)
    allR2s.append(r2)



  

alpha: 4.257934
mae = 0.42078332995226914
rmse = 0.5642740432376391
r2 = 0.2530335037790824
Ratio of population to mental health providers
2.4141227287201535e-06
Median household income raw value
-2.5285453234870454e-05
alpha: 4.229743
mae = 0.45673444209229347
rmse = 0.5760028388924171
r2 = 0.21197319190625086
Ratio of population to mental health providers
2.901759355299314e-06
Median household income raw value
-2.5737553204539938e-05
alpha: 4.170363
mae = 0.4337646014506057
rmse = 0.5778039757020885
r2 = 0.28468474353602713
Ratio of population to mental health providers
2.579373332422691e-06
Median household income raw value
-2.517477087898092e-05
alpha: 4.228331
mae = 0.44095734099906175
rmse = 0.5814336066323523
r2 = 0.2403522503611023
Ratio of population to mental health providers
3.2053376060493993e-06
Median household income raw value
-2.534961736223822e-05
alpha: 4.307727
mae = 0.4359798083956682
rmse = 0.5725259834938803
r2 = 0.20608729870211062
Ratio of population to mental h

In [None]:
# Average each metric over every outer split that was actually evaluated.
# The outer splitter is RepeatedKFold(n_splits=OUTER_FOLDS, n_repeats=3), which
# yields OUTER_FOLDS * 3 splits, so dividing the sums by OUTER_FOLDS alone would
# report values three times too large. Dividing by the list length stays correct
# whatever the repeat count is.
averageMAE = sum(allMAEs) / len(allMAEs)
averageRMSE = sum(allRMSEs) / len(allRMSEs)
averageR2 = sum(allR2s) / len(allR2s)


# Make sure to change this model number for each of the different models. It
# just dictates which row in the csv to put the data.
model_number = 0

# Overwrite (or create) the row for this model in the shared metrics file.
full_metrics_df = pd.read_csv("All_Model_Metrics.csv")
full_metrics_df.loc[model_number] = ["Lasso Regression Poor Mental Health Days", averageMAE, averageRMSE, averageR2]
full_metrics_df.to_csv('All_Model_Metrics.csv', index=False)

### Lasso regression for predicting Suicides

In [6]:
# Nested cross-validation for the Suicides target.
# The outer splitter produces held-out folds used to score the model; the inner
# splitter is handed to LassoCV so it can choose alpha on the training part only.
OUTER_FOLDS = 10
INNER_FOLDS = 10

# One entry is appended to each list per outer split.
allMAEs = []
allRMSEs = []
allR2s = []

avgKFolds = RepeatedKFold(n_splits=OUTER_FOLDS, n_repeats=3, random_state=1)

for train_index, test_index in avgKFolds.split(X_train_full):
    X_train = X_train_full.iloc[train_index]
    X_val = X_train_full.iloc[test_index]
    Y_train = Y_train_full.iloc[train_index]
    Y_val = Y_train_full.iloc[test_index]

    # Column 1 of the target frame is Suicides.
    y_fit = Y_train.iloc[:, 1]
    y_true = Y_val.iloc[:, 1]

    # LassoCV searches for alpha using the inner repeated k-fold splitter.
    # NOTE(review): the features are passed in as loaded, with no scaling step in
    # this cell -- confirm whether standardisation before the Lasso was intended.
    cv = RepeatedKFold(n_splits=INNER_FOLDS, n_repeats=3, random_state=1)
    model = LassoCV(cv=cv, n_jobs=-1)
    model.fit(X_train, y_fit)
    print('alpha: %f' % model.alpha_)

    # Score the fitted model on the held-out outer fold.
    predictions = model.predict(X_val)
    mae = mean_absolute_error(y_true, predictions)
    print("mae = " + str(mae))
    rmse = math.sqrt(mean_squared_error(y_true, predictions))
    print("rmse = " + str(rmse))
    r2 = r2_score(y_true, predictions)
    print("r2 = " + str(r2))

    # Report every feature whose coefficient was not shrunk to exactly zero.
    coeffs = model.coef_
    important_variables = np.flatnonzero(coeffs).tolist()
    for idx in important_variables:
        print(X_train.columns[idx])
        print(coeffs[idx])

    allMAEs.append(mae)
    allRMSEs.append(rmse)
    allR2s.append(r2)

alpha: 32.202110
mae = 5.344130225919546
rmse = 7.869498863448656
r2 = 0.13104019622859098
Ratio of population to mental health providers
2.3499368272040975e-05
Median household income raw value
-0.00019054155298122653
alpha: 33.488070
mae = 4.902388335893626
rmse = 6.772917683773868
r2 = 0.04718065132471183
Ratio of population to mental health providers
1.7939737695327396e-05
Median household income raw value
-0.00020440737302322706
alpha: 32.328168
mae = 5.3286743859718735
rmse = 7.39416447306771
r2 = 0.12494040838146803
Ratio of population to mental health providers
2.489316682960349e-05
Median household income raw value
-0.00019453314112782616
alpha: 32.582330
mae = 5.174445915917177
rmse = 7.195026301461517
r2 = 0.12285950863617712
Ratio of population to mental health providers
2.2342738712038202e-05
Median household income raw value
-0.00019564030773899546
alpha: 33.341173
mae = 5.010513456839152
rmse = 6.451539514690966
r2 = 0.10622157063935633
Ratio of population to mental heal

In [7]:
# Average each metric over every outer split that was actually evaluated.
# The outer splitter is RepeatedKFold(n_splits=OUTER_FOLDS, n_repeats=3), which
# yields OUTER_FOLDS * 3 splits, so dividing the sums by OUTER_FOLDS alone would
# report values three times too large. Dividing by the list length stays correct
# whatever the repeat count is.
averageMAE = sum(allMAEs) / len(allMAEs)
averageRMSE = sum(allRMSEs) / len(allRMSEs)
averageR2 = sum(allR2s) / len(allR2s)


# Make sure to change this model number for each of the different models. It
# just dictates which row in the csv to put the data.
model_number = 1

# Overwrite (or create) the row for this model in the shared metrics file.
full_metrics_df = pd.read_csv("All_Model_Metrics.csv")
full_metrics_df.loc[model_number] = ["Lasso Regression Suicides", averageMAE, averageRMSE, averageR2]
full_metrics_df.to_csv('All_Model_Metrics.csv', index=False)