# Demographic Health Prediction (Linear Regression)

In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from pickle import dump

In [8]:
# Read the processed dataset from disk and preview ten randomly drawn rows.
# The preview is unseeded, so the rows shown change on every run.
df = pd.read_csv(
    filepath_or_buffer='../data/processed/demographic_health_processed.csv',
)
df.sample(n=10)

Unnamed: 0,unemployment_rate_2018,%_white-alone,%_black-alone,r_death_2018,r_natural_inc_2018,percent_of_adults_with_less_than_a_high_school_diploma_2014-18,percent_of_adults_with_a_high_school_diploma_only_2014-18,percent_of_adults_with_a_bachelor's_degree_or_higher_2014-18,pctpovall_2018,pctpov017_2018,pctpov517_2018,medhhinc_2018,med_hh_income_percent_of_state_total_2018,active_primary_care_physicians_per_100000_population_2018_(aamc),active_patient_care_primary_care_physicians_per_100000_population_2018_(aamc),diabetes_prevalence
2011,-1.019007,0.752486,-0.619962,0.236206,-0.197785,-0.001623,0.252271,-0.548417,-0.368536,-0.360007,-0.328462,-0.188848,-0.626252,-0.076298,-0.045207,12.6
2906,0.178778,0.624549,-0.422349,1.910865,-1.677052,0.850537,0.28008,-0.665031,0.495937,0.336004,0.298165,-0.768967,-1.559656,0.186315,0.155059,13.5
2851,-1.019007,0.379235,-0.327318,-0.843018,0.615811,-0.317238,-0.234382,0.691933,-1.330875,-1.269311,-1.205741,1.791839,0.873153,0.186315,0.155059,10.4
1884,0.04569,0.553125,-0.431607,-0.284799,-0.099167,-0.443484,0.043706,0.077059,-0.906794,-0.730464,-0.784191,0.600536,0.044012,1.984816,1.404341,12.3
378,-0.287028,-0.022259,0.237686,1.017714,-0.88811,0.803195,0.697212,-0.728639,0.675357,0.796269,0.867826,-1.024185,-0.98869,-0.092214,0.002475,14.5
3111,-1.085551,0.674969,-0.553125,-0.656945,0.024105,-1.295644,-0.554182,0.967567,-1.738646,-1.819384,-1.775402,1.656284,1.751944,0.528508,0.603275,8.8
1630,0.04569,0.686231,-0.645373,-3.113111,0.295304,-1.926874,0.864064,0.787345,0.169721,1.166726,0.856433,-0.845554,-0.755339,0.281811,0.755859,11.3
2994,1.310019,0.726506,-0.543633,1.017714,-1.331889,-0.727537,1.030917,-0.230379,-0.23805,-0.180391,-0.123384,-0.318219,1.012171,0.966196,0.88937,15.0
566,1.70928,0.554622,-0.598835,0.496709,-0.986727,-0.459265,0.071515,-0.495411,0.055545,0.23497,0.17284,-0.456651,-0.288637,-1.118791,-1.065613,14.0
456,-0.486658,-0.297821,0.510601,-0.13594,-0.616911,-0.664414,-1.49968,0.882756,-1.053591,-0.932532,-0.88673,1.576245,1.885996,-0.704977,-0.779518,12.4


In [41]:
# Machine Learning
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
df_train, df_test = train_test_split(
    df,
    random_state=2025,
    test_size=0.2,
)

# Show the shapes of both partitions as a sanity check.
(df_train.shape, df_test.shape)

((2512, 16), (628, 16))

In [42]:
# Separate the target column (diabetes_prevalence) from the predictors
# in the training partition ...
y_train = df_train['diabetes_prevalence']
X_train = df_train.drop('diabetes_prevalence', axis=1)

# ... and do the same for the held-out partition.
y_test = df_test['diabetes_prevalence']
X_test = df_test.drop('diabetes_prevalence', axis=1)

# Shape of the training design matrix.
X_train.shape

(2512, 15)

In [43]:
# Baseline model: ordinary least squares fitted on the training partition.
lr = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out partition.
y_hat_lr = lr.predict(X_test)

# Evaluation on the test set (arguments are: true values, predictions).
print(
    f'MSE: {mean_squared_error(y_test, y_hat_lr)}',
    f'MAE: {mean_absolute_error(y_test, y_hat_lr)}',
    f'RMSE: {np.sqrt(mean_squared_error(y_test, y_hat_lr))}',
    f'R2 score: {r2_score(y_test, y_hat_lr)}',
    sep='\n',
)

MSE: 1.0391002580644093
MAE: 0.7707929821652741
RMSE: 1.0193626724892417
R2 score: 0.854826060590048


In [49]:
# L1-regularised linear model (alpha=0.2) fitted on the training partition.
lasso = Lasso(alpha=0.2, max_iter=1000, random_state=2025).fit(X_train, y_train)

# Predictions for the held-out partition.
y_hat_lasso = lasso.predict(X_test)

# Evaluation on the test set (arguments are: true values, predictions).
print(
    f'MSE: {mean_squared_error(y_test, y_hat_lasso)}',
    f'MAE: {mean_absolute_error(y_test, y_hat_lasso)}',
    f'RMSE: {np.sqrt(mean_squared_error(y_test, y_hat_lasso))}',
    f'R2 score: {r2_score(y_test, y_hat_lasso)}',
    sep='\n',
)

MSE: 1.269097260041069
MAE: 0.8606670106366467
RMSE: 1.126542169668348
R2 score: 0.8226929044578122


In [45]:
# L2-regularised linear model (alpha=0.3) fitted on the training partition.
ridge = Ridge(alpha=0.3, max_iter=10000, random_state=2025).fit(X_train, y_train)

# Predictions for the held-out partition.
y_hat_ridge = ridge.predict(X_test)

# Evaluation on the test set (arguments are: true values, predictions).
print(
    f'MSE: {mean_squared_error(y_test, y_hat_ridge)}',
    f'MAE: {mean_absolute_error(y_test, y_hat_ridge)}',
    f'RMSE: {np.sqrt(mean_squared_error(y_test, y_hat_ridge))}',
    f'R2 score: {r2_score(y_test, y_hat_ridge)}',
    sep='\n',
)

MSE: 1.0390114533393395
MAE: 0.7707924845004012
RMSE: 1.0193191126135817
R2 score: 0.854838467604363


The baseline model performs well, explaining about 85% of the variance (R² ≈ 0.855) with a relatively low error (RMSE ≈ 1.02).

The Lasso model underperformed: it has higher errors than the baseline model (RMSE ≈ 1.13 vs. 1.02) and a lower R² score (≈ 0.82 vs. 0.85), but it is still a good model.

Finally, the ridge model performed almost identically to the baseline model. This suggests that collinearity was not an important issue, at least at this regularization strength (alpha = 0.3).

In [50]:
# Save models
# Serialise each fitted estimator with pickle. Every file is opened in a
# `with` block so the handle is flushed and closed as soon as the dump
# finishes (the previous bare `open(...)` calls never closed their files).
# NOTE(review): assumes the ../models directory already exists — confirm.
# Pickle files can execute arbitrary code when loaded; only load these
# artifacts from a trusted location.
with open("../models/linear_reg_base_model.sav", "wb") as model_file:
    dump(lr, model_file)

with open("../models/lasso_alpha-0.2.sav", "wb") as model_file:
    dump(lasso, model_file)

with open("../models/ridge_alpha-0.3.sav", "wb") as model_file:
    dump(ridge, model_file)