## Lasso Regression

### Setup

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import scipy.stats as stats
import math

!pip install sklearn
from sklearn.linear_model import LassoCV
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import mean_absolute_error

You should consider upgrading via the '/homes/iws/bhimar/cse481ds-mental-health/venv/bin/python3 -m pip install --upgrade pip' command.[0m


In [10]:
# Read the feature table from disk, then preview its first five rows
# (head() defaults to n=5).
X_train_full = pd.read_csv("X_train.csv")
X_train_full.head()

Unnamed: 0,High school graduation raw value,Unemployment raw value,Some college raw value,Ratio of population to mental health providers,Median household income raw value,Average Temperature,January Average Temperature,February Average Temperature,March Average Temperature,April Average Temperature,...,April Average Precipitation,May Average Precipitation,June Average Precipitation,July Average Precipitation,August Average Precipitation,September Average Precipitation,October Average Precipitation,November Average Precipitation,December Average Precipitation,RUCC
0,0.847,0.096,0.568,4885.0,60735.0,54.808333,26.6,36.3,44.2,55.8,...,11.01,5.75,6.53,2.99,2.16,5.3,3.1,5.95,4.9,1.0
1,0.78,0.066812,0.547429,954.495385,42945.0,74.058333,59.9,61.5,71.2,72.5,...,1.64,7.55,8.57,3.95,6.58,8.25,3.81,0.24,1.01,1.0
2,0.78,0.069824,0.453978,2573.647059,40994.0,60.825,38.5,42.9,55.2,59.1,...,2.69,3.51,2.22,3.8,5.73,0.96,0.54,1.99,3.87,6.0
3,0.866534,0.059,0.785187,4209.193548,81586.0,43.408333,16.3,17.3,24.8,39.0,...,5.51,5.31,5.99,3.51,1.34,1.62,3.07,0.53,1.26,1.0
4,0.806995,0.102,0.681151,4158.038961,43863.0,60.991667,46.3,44.2,46.8,61.4,...,4.63,3.1,10.77,8.4,5.02,1.72,1.12,2.85,4.85,2.0


In [11]:
# Read the outcome table from disk, then preview its first five rows
# (head() defaults to n=5).
Y_train_full = pd.read_csv("Y_train.csv")
Y_train_full.head()

Unnamed: 0,Poor mental health days raw value,Crude Rate
0,4.1,16.6
1,4.2,11.0
2,3.9,24.9
3,2.3,8.7
4,4.0,16.8


In [12]:
# Columns that identify a record rather than describe it.
identifying_fields = ['State Abbreviation', 'Name', 'Release Year']

# One name per line in each text file; splitlines() drops the line endings.
with open("factors.txt") as factors_file:
    factors = factors_file.read().splitlines()

with open("outcomes.txt") as outcomes_file:
    outcomes = outcomes_file.read().splitlines()

### Lasso regression for predicting Poor Mental Health Days

In [17]:
# Sequential train/validation split: the leading TRAIN_FRACTION of rows is used
# for fitting and the remaining trailing rows are held out for validation.
#
# A single boundary index is shared by X and Y, so the two pieces always
# partition the data exactly (no overlap, no dropped rows) and features stay
# row-aligned with outcomes. Slicing from the end with a separately computed
# count is avoided on purpose: when that count is 0, `frame[-0:]` is
# `frame[0:]`, i.e. the *entire* frame would become the validation set.
TRAIN_FRACTION = 0.8

n_rows = X_train_full.shape[0]
if Y_train_full.shape[0] != n_rows:
    raise ValueError(
        "X_train_full and Y_train_full must have the same number of rows "
        "(got %d and %d)" % (n_rows, Y_train_full.shape[0])
    )

# First row position that belongs to the validation split.
split_at = math.ceil(n_rows * TRAIN_FRACTION)

X_train = X_train_full.iloc[:split_at]
X_val = X_train_full.iloc[split_at:]

Y_train = Y_train_full.iloc[:split_at]
Y_val = Y_train_full.iloc[split_at:]

In [19]:
# Fit a Lasso whose penalty strength (alpha) is chosen by cross-validation on
# the training split: 10 folds, repeated 3 times, with a fixed seed so the fold
# assignment is reproducible. n_jobs=-1 lets scikit-learn use all processors.
# The target is the first column of Y (selected by position).
# NOTE(review): the features are passed in on their raw scales (fractions next
# to values in the tens of thousands). The L1 penalty is scale-sensitive, so
# consider standardizing the features before fitting -- confirm intent.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
model = LassoCV(cv=cv, n_jobs=-1)
model.fit(X_train, Y_train.iloc[:, 0])
print('alpha: %f' % model.alpha_)

# Score on the held-out validation rows.
predictions = model.predict(X_val)
mae = mean_absolute_error(Y_val.iloc[:, 0], predictions)
print(mae)

# Report every feature the Lasso kept (non-zero coefficient). coef_ is ordered
# like the columns of the matrix passed to fit(), so names are taken from
# X_train.columns itself rather than from an external list that could drift
# out of sync with the fitted columns.
coeffs = model.coef_
feature_names = X_train.columns
important_variables = [i for i, e in enumerate(coeffs) if e != 0]
for i in important_variables:
    print(feature_names[i])
    print(coeffs[i])

alpha: 4.384180
0.4281999077915159
Ratio of population to mental health providers
2.1860650507477655e-06
Median household income raw value
-2.587462580145863e-05
