## Creating a basic regression model

In [1]:
### Author: Anuvrat Chaturvedi
### Date: 13-Feb-2024
### Purpose: Creates a basic linear regression model with General health as dependent variable.

### Declarations

In [2]:
# Run the shared utilities notebook so that its library imports and helper
# functions are loaded into this kernel's namespace.
# Later cells use names that are never defined in this notebook (e.g. `pd` and
# `input_path`), so they are presumably provided by utils.ipynb -- verify there.
%run "utils.ipynb"

In [3]:
# Loading input pickle files
# The file names are joined to `input_path` with pathlib instead of a hard-coded
# "\\" separator, so the same cell works on Windows and on POSIX systems.
# NOTE(review): unpickling can execute arbitrary code stored in the file; only
# load pickles that were produced by this project's own pipeline.
from pathlib import Path

input_dir = Path(input_path)
df_combined = pd.read_pickle(input_dir / "combined_dataset_nona.pkl")
df_combined_florida = pd.read_pickle(input_dir / "combined_dataset_nona_florida.pkl")

### Linear Regression

In [4]:
## Import libraries
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    r2_score,
    mean_squared_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
)
import statsmodels.api as sm

# from sklearn.metrics import root_mean_squared_error  # Fails to import in this environment: the function is only available from scikit-learn 1.4 onwards. See the following link.
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.root_mean_squared_error.html#sklearn.metrics.root_mean_squared_error

In [5]:
# Names of the predictor columns taken from df_combined, in model order
feature_columns = [
    "Estimate Households Total",
    "Percent Households lt 10k",
    "Estimate Households Median income (dollars)",
    "Percent Households that are Families",
    "Housing cost burden among households",
    "No broadband internet subscription among households",
    "No high school diploma among adults aged 25 years or older",
    "Persons living below 150% of the poverty level",
    "Persons of racial or ethnic minority status",
    "Single-parent households",
    "Unemployment among people 16 years and older in the labor force",
    # "Percent_Urban",  # currently left out of the feature set
    "spend_2017",
    "spend_2018",
    "spend_2019",
    "spend_2020",
]

# X_raw holds the unscaled predictors; y is the dependent variable
X_raw = df_combined[feature_columns]
y = df_combined["General Health"]

In [6]:
# Standardise the predictors: learn the scaling parameters from X_raw, then
# apply them to produce the scaled feature matrix X.
# NOTE(review): the scaler is fitted on every row of X_raw, i.e. before the
# train/test split performed in the next cell, so held-out rows contribute to
# the scaling statistics. Consider fitting on the training rows only.
scaler = StandardScaler()
scaler.fit(X_raw)
X = scaler.transform(X_raw)

In [23]:
# Split the data into training and testing sets.
# random_state fixes which rows train_test_split places in each set, so the
# split (and therefore every metric printed below) is identical on each run.
# The regression fit itself needs no seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


def _print_split_metrics(model, features, target):
    """Print evaluation metrics for ``model`` on one data split.

    Args:
        model: Fitted estimator exposing ``predict`` and ``score``.
        features: Feature matrix of the split to evaluate.
        target: True target values for the same rows as ``features``.

    Returns:
        The value of ``model.score(features, target)``.
    """
    predictions = model.predict(features)  # predict once, reuse for every metric
    score = model.score(features, target)
    print("Linear Regression Score:", score)
    print("R2_score:", r2_score(target, predictions))
    # RMSE is taken as the square root of the MSE. The ``squared=False``
    # shortcut is deprecated since scikit-learn 1.4 and removed in later
    # releases, so it is avoided here.
    print("RMSE:", mean_squared_error(target, predictions) ** 0.5)
    print("MAPE:", mean_absolute_percentage_error(target, predictions))
    print("MAE:", mean_absolute_error(target, predictions))
    return score


# Create the linear regression model and fit it on the training data
linreg = LinearRegression()
linreg.fit(X_train, y_train)

print ("\n\n", "*"*50, "\n\tSkLearn Linear Regression Summary below\n", "*"*50)

# Print the intercept and one coefficient per feature (features were scaled,
# so X_raw.columns supplies the names in matching order)
print("Intercept:", linreg.intercept_)
print("Coefficients SkLearn Linear Regression:")
display(pd.DataFrame({"feature":X_raw.columns, "coef":linreg.coef_}))

print ("\n\n", "*"*50, "\n\tModel evaluation metrics below\n", "*"*50)

# Evaluate the model on the training data
print("Training evaluation metrics below:")
linreg_score = _print_split_metrics(linreg, X_train, y_train)

# Evaluate the model on the testing data
# (linreg_score ends up holding the test-set score, as before)
print("Testing evaluation metrics below:")
linreg_score = _print_split_metrics(linreg, X_test, y_test)


# Adding a statmodel linear regression to see the p-values of the coefficients
print ("\n\n", "*"*50, "\n\tStatmodel Linear Regression Summary below\n", "*"*50)

# Wrap the training matrix in a DataFrame carrying the original column names so
# the summary lists real feature names instead of x1..xN, then add the constant
# (intercept) column. A separate variable is used so X_train keeps the same
# columns as X_test, and `linreg` keeps referring to the scikit-learn model.
X_train_const = sm.add_constant(
    pd.DataFrame(X_train, columns=X_raw.columns, index=y_train.index)
)

# Create the statsmodels OLS model and fit it on the training data
ols_model = sm.OLS(y_train, X_train_const)
linreg_results = ols_model.fit()

# Print the coefficients with p-values and feature names
print(linreg_results.summary())



 ************************************************** 
	SkLearn Linear Regression Summary below
 **************************************************
Intercept: 17.491004135310263
Coefficients SkLearn Linear Regression:


Unnamed: 0,feature,coef
0,Estimate Households Total,-0.344663
1,Percent Households lt 10k,0.270459
2,Estimate Households Median income (dollars),-1.592877
3,Percent Households that are Families,0.392735
4,Housing cost burden among households,-0.636716
5,No broadband internet subscription among house...,0.70032
6,No high school diploma among adults aged 25 ye...,1.33427
7,Persons living below 150% of the poverty level,1.400435
8,Persons of racial or ethnic minority status,1.069204
9,Single-parent households,0.079006




 ************************************************** 
	Model evaluation metrics below
 **************************************************
Training evaluation metrics below:
Linear Regression Score: 0.7679111097264082
R2_score: 0.7679111097264082
RMSE: 2.590346685584152
MAPE: 0.11389237419462002
MAE: 1.9222727818053935
Testing evaluation metrics below:
Linear Regression Score: 0.7642610551943769
R2_score: 0.7642610551943769
RMSE: 2.5861789991395767
MAPE: 0.11368420001915706
MAE: 1.9043673301300823


 ************************************************** 
	Statmodel Linear Regression Summary below
 **************************************************
                            OLS Regression Results                            
Dep. Variable:         General Health   R-squared:                       0.768
Model:                            OLS   Adj. R-squared:                  0.768
Method:                 Least Squares   F-statistic:                     4464.
Date:                Tue, 13 Fe

In [8]:
# Record the run environment for reproducibility: last-updated timestamp,
# Python/IPython versions, machine details, versions of the imported packages
# and the watermark version.
# NOTE(review): the watermark extension is not loaded in this notebook;
# presumably utils.ipynb runs `%load_ext watermark` -- confirm.
%watermark -u -i -d -m -v -iv -w

Last updated: 2024-02-13T00:21:25.963577-05:00

Python implementation: CPython
Python version       : 3.11.7
IPython version      : 8.20.0

Compiler    : MSC v.1916 64 bit (AMD64)
OS          : Windows
Release     : 10
Machine     : AMD64
Processor   : Intel64 Family 6 Model 158 Stepping 13, GenuineIntel
CPU cores   : 8
Architecture: 64bit

numpy      : 1.26.3
matplotlib : 3.8.0
statsmodels: 0.14.1
seaborn    : 0.13.2
plotly     : 5.9.0
pandas     : 2.1.4
re         : 2.2.1

Watermark: 2.4.3

