## Creating a basic regression model

### Declarations

In [3]:
# Importing utilities file with library imports and helper functions
%run "utils.ipynb"

In [4]:
# Loading input pickle files
df_combined=pd.read_pickle(input_path+"\\combined_dataset_nona.pkl")
df_combined_florida=pd.read_pickle(input_path+"\\combined_dataset_nona_florida.pkl")

### Logistic Regression

In [28]:
## Import libraries
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    r2_score,
    mean_squared_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
)
import statsmodels.api as sm

# from sklearn.metrics import root_mean_squared_error #Doesn't work. Check the following link.
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.root_mean_squared_error.html#sklearn.metrics.root_mean_squared_error

In [17]:
# Create X and y datasets
X_raw = df_combined[
    [
        "Estimate Households Total",
        "Percent Households lt 10k",
        "Estimate Households Median income (dollars)",
        "Percent Households that are Families",
        "Housing cost burden among households",
        "No broadband internet subscription among households",
        "No high school diploma among adults aged 25 years or older",
        "Persons living below 150% of the poverty level",
        "Persons of racial or ethnic minority status",
        "Single-parent households",
        "Unemployment among people 16 years and older in the labor force",
        #"Percent_Urban",
        "spend_2017",
        "spend_2018",
        "spend_2019",
        "spend_2020",
    ]
]
y = df_combined["General Health"]

In [19]:
# Using standard scaler on the X_raw dataset
scaler = StandardScaler()
X=scaler.fit_transform(X_raw)

In [74]:
#Random state not needed since linear regression is a deterministic algorithm with global optimum / minimum
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the linear regression model
linreg = LinearRegression()

# Fit the model on the training data
linreg.fit(X_train, y_train)

print ("\n\n", "*"*50, "\n\tSkLearn Linear Regression Summary below\n", "*"*50)

# Print the coefficients
print("Intercept:", linreg.intercept_)
print("Coefficients SkLearn Linear Regression:")
display(pd.DataFrame({"feature":X_raw.columns, "coef":linreg.coef_}))

print ("\n\n", "*"*50, "\n\tModel evaluation metrics below\n", "*"*50)

# Evaluate the model on the training data
linreg_score = linreg.score(X_train, y_train)
print("Linear Regression Score:", linreg_score)
print("r2_score:", r2_score(y_train, linreg.predict(X_train)))
print("rmse:", mean_squared_error(y_train, linreg.predict(X_train), squared=False))
print("MAPE:", mean_absolute_percentage_error(y_train, linreg.predict(X_train)))
print("MAE:", mean_absolute_error(y_train, linreg.predict(X_train)))


# Evaluate the model on the testing data
linreg_score = linreg.score(X_test, y_test)
print("Linear Regression Score:", linreg_score)
print("r2_score:", r2_score(y_test, linreg.predict(X_test)))
print("rmse:", mean_squared_error(y_test, linreg.predict(X_test), squared=False))
print("MAPE:", mean_absolute_percentage_error(y_test, linreg.predict(X_test)))
print("MAE:", mean_absolute_error(y_test, linreg.predict(X_test)))


# Adding a statmodel linear regression to see the p-values of the coefficients
print ("\n\n", "*"*50, "\n\tStatmodel Linear Regression Summary below\n", "*"*50)

# Add a constant column to X_train
X_train = sm.add_constant(X_train)

# Create the linear regression model
linreg = sm.OLS(y_train, X_train)

# Fit the model on the training data
linreg_results = linreg.fit()

# Print the coefficients with p-values and feature names
linreg_results.summary()



 ************************************************** 
	SkLearn Linear Regression Summary below
 **************************************************
Intercept: 16.141136067780092
Coefficients SkLearn Linear Regression:


Unnamed: 0,feature,coef
0,Estimate Households Total,0.214599
1,Percent Households lt 10k,0.175938
2,Estimate Households Median income (dollars),-0.737729
3,Percent Households that are Families,0.218539
4,Housing cost burden among households,-0.091591
5,No broadband internet subscription among house...,-0.01818
6,No high school diploma among adults aged 25 ye...,0.518831
7,Persons living below 150% of the poverty level,0.874822
8,Persons of racial or ethnic minority status,-0.010799
9,Single-parent households,0.188403




 ************************************************** 
	Model evaluation metrics below
 **************************************************
Linear Regression Score: 0.5708855431563491
r2_score: 0.5708855431563491
rmse: 1.756745627771729
MAPE: 0.08473680245771979
MAE: 1.3332858519840303
Linear Regression Score: 0.5615367913024822
r2_score: 0.5615367913024822
rmse: 1.753760836721536
MAPE: 0.08357341603278358
MAE: 1.3210211790313013


 ************************************************** 
	Statmodel Linear Regression Summary below
 **************************************************


0,1,2,3
Dep. Variable:,Mental Health,R-squared:,0.571
Model:,OLS,Adj. R-squared:,0.571
Method:,Least Squares,F-statistic:,1795.0
Date:,"Sun, 11 Feb 2024",Prob (F-statistic):,0.0
Time:,22:32:37,Log-Likelihood:,-40148.0
No. Observations:,20252,AIC:,80330.0
Df Residuals:,20236,BIC:,80450.0
Df Model:,15,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,16.1411,0.012,1306.965,0.000,16.117,16.165
x1,0.2146,0.016,13.524,0.000,0.183,0.246
x2,0.1759,0.016,10.992,0.000,0.145,0.207
x3,-0.7377,0.020,-36.433,0.000,-0.777,-0.698
x4,0.2185,0.015,14.652,0.000,0.189,0.248
x5,-0.0916,0.017,-5.423,0.000,-0.125,-0.058
x6,-0.0182,0.018,-1.030,0.303,-0.053,0.016
x7,0.5188,0.018,29.216,0.000,0.484,0.554
x8,0.8748,0.023,38.769,0.000,0.831,0.919

0,1,2,3
Omnibus:,800.103,Durbin-Watson:,2.002
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2205.486
Skew:,0.159,Prob(JB):,0.0
Kurtosis:,4.585,Cond. No.,8.26
