File for Machine Learning models

# PPOL 5203 Final Project

## Machine Learning Prediction

#### Using Risk Scores to predict health outcomes

In [None]:
os. chdir()

In [2]:
# import packages
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split    
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.tree import DecisionTreeClassifier

In [3]:
# import data again
NRI_county_health_data = pd.read_csv("NRI_county_health_data.csv")
#NRI_county_health_data.head()

In [4]:
# remove and replace hyphen in "age adjusted death rate" variable
NRI_county_health_data = NRI_county_health_data.rename(columns={'Age-Adjusted_Death_Rate':'Age_Adjusted_Death_Rate'})

# remove and replace hypthen in "percent non hispanic white" variable
NRI_county_health_data = NRI_county_health_data.rename(columns={'Percent_Non-Hispanic_White':'Percent_Non_Hispanic_White'})

### Age Adjusted Death Rate

In [5]:
# create binary column that indicates if states are above or below national average for Age Adjusted Death Rate

# AADR national average
AADR_mean = NRI_county_health_data['Age_Adjusted_Death_Rate'].mean()

# create column
NRI_county_health_data['Age_Adjusted_Death_Rate_mean_indicator'] = np.where(NRI_county_health_data['Age_Adjusted_Death_Rate'] > 
                                                                  AADR_mean, 1, 0)
NRI_county_health_data.head()

Unnamed: 0,FIPS,STATEABBRV,State,County,County_State,Community_Type,POPULATION,AREA,Population_Density,Percent_Rural,...,Life_Expectancy,Age_Adjusted_Death_Rate,Percent_Frequent_Physical_Distress,Percent_Frequent_Mental_Distress,Percent_Less_than_18_Years_of_Age,Percent_65_and_Over,Percent_Non_Hispanic_White,Percent_Female,Geolocation,Age_Adjusted_Death_Rate_mean_indicator
0,1001.0,AL,Alabama,Autauga,"Autauga, AL",Small Urban,58764.0,610.470508,96.260178,42.002162,...,76.58565,427.073326,10.7,16.2,23.40638,16.048735,72.454522,51.378289,POINT (-86.6464395 32.5322367),0
1,1003.0,AL,Alabama,Baldwin,"Baldwin, AL",Small Urban,231365.0,2047.738775,112.985603,42.279099,...,77.724729,381.702031,9.9,16.1,21.21825,21.469824,83.148763,51.347715,POINT (-87.7460666 30.6592183),0
2,1005.0,AL,Alabama,Barbour,"Barbour, AL",Rural,25160.0,913.646511,27.538003,67.789635,...,72.86721,589.566095,14.9,18.0,20.549591,19.976767,45.30524,46.703253,POINT (-85.4051035 31.8702531),1
3,1007.0,AL,Alabama,Bibb,"Bibb, AL",Rural,22239.0,632.486798,35.161208,68.352607,...,73.609363,567.508146,12.7,17.4,20.118343,16.737109,73.564088,46.015927,POINT (-87.1271475 33.0158929),1
4,1009.0,AL,Alabama,Blount,"Blount, AL",Small Urban,58992.0,657.196555,89.7631,89.951502,...,74.171463,532.836482,11.9,17.4,23.053471,18.72597,86.329839,50.192239,POINT (-86.56644 33.9773575),1


In [6]:
# features
columns = ["Community_Type", "RISK_SCORE", "Population_Density"]
X = NRI_county_health_data[columns]

# target
y = NRI_county_health_data["Age_Adjusted_Death_Rate_mean_indicator"]

In [7]:
X.isna().sum()

# no missing values

Community_Type        0
RISK_SCORE            0
Population_Density    0
dtype: int64

In [8]:
X.dtypes

# Community_Type is an object - doesn't work

Community_Type         object
RISK_SCORE            float64
Population_Density    float64
dtype: object

In [9]:
# convert Community_Type to dummy variables
# method: one-hot coding (get dummies)
# this converts categorical data into dummy variables (as many dummies as there are unique observations)
# should create 3 dummy variables for 3 community types

# Apply one-hot encoding for Community_Type nominal column
X = pd.get_dummies(X, columns=['Community_Type'], prefix='Community_Type', drop_first=False)
X

# the true/false values are dummy variables in this model

Unnamed: 0,RISK_SCORE,Population_Density,Community_Type_Rural,Community_Type_Small Urban,Community_Type_Urban
0,49.220490,96.260178,False,True,False
1,97.709195,112.985603,False,True,False
2,56.188355,27.538003,True,False,False
3,32.484887,35.161208,True,False,False
4,65.128858,89.763100,False,True,False
...,...,...,...,...,...
3138,6.617881,3.985817,True,False,False
3139,68.501432,5.457960,True,False,False
3140,21.699014,9.677509,True,False,False
3141,2.513522,3.382271,True,False,False


In [10]:
X.dtypes

# that works

RISK_SCORE                    float64
Population_Density            float64
Community_Type_Rural             bool
Community_Type_Small Urban       bool
Community_Type_Urban             bool
dtype: object

In [11]:
# split data into train and test

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.15, # is 15% okay? Or should we do 20%
                                                    random_state = 500) # I picked 500 because we have a larger datasset

In [12]:
# create model

# instantiate the modeling object
# use logistic regression because we are predicting a dummy variable
model = LogisticRegression()

# Fit the model
model.fit(X_train,y_train)

In [13]:
# predict 
y_pred = model.predict(X_test)

# calculate MSE in the test
AADR_mse = mean_squared_error(y_test, y_pred)
AADR_mse

0.4152542372881356

## Life Exptectancy

In [14]:
# create binary column that indicates if states are above or below national average for Life Expectancy

# Life Expectancy national average
Life_Expectancy_mean = NRI_county_health_data['Life_Expectancy'].mean()

# create column
NRI_county_health_data['Life_Expectancy_mean_indicator'] = np.where(NRI_county_health_data['Life_Expectancy'] > 
                                                                  Life_Expectancy_mean, 1, 0)
NRI_county_health_data.head()

Unnamed: 0,FIPS,STATEABBRV,State,County,County_State,Community_Type,POPULATION,AREA,Population_Density,Percent_Rural,...,Age_Adjusted_Death_Rate,Percent_Frequent_Physical_Distress,Percent_Frequent_Mental_Distress,Percent_Less_than_18_Years_of_Age,Percent_65_and_Over,Percent_Non_Hispanic_White,Percent_Female,Geolocation,Age_Adjusted_Death_Rate_mean_indicator,Life_Expectancy_mean_indicator
0,1001.0,AL,Alabama,Autauga,"Autauga, AL",Small Urban,58764.0,610.470508,96.260178,42.002162,...,427.073326,10.7,16.2,23.40638,16.048735,72.454522,51.378289,POINT (-86.6464395 32.5322367),0,0
1,1003.0,AL,Alabama,Baldwin,"Baldwin, AL",Small Urban,231365.0,2047.738775,112.985603,42.279099,...,381.702031,9.9,16.1,21.21825,21.469824,83.148763,51.347715,POINT (-87.7460666 30.6592183),0,1
2,1005.0,AL,Alabama,Barbour,"Barbour, AL",Rural,25160.0,913.646511,27.538003,67.789635,...,589.566095,14.9,18.0,20.549591,19.976767,45.30524,46.703253,POINT (-85.4051035 31.8702531),1,0
3,1007.0,AL,Alabama,Bibb,"Bibb, AL",Rural,22239.0,632.486798,35.161208,68.352607,...,567.508146,12.7,17.4,20.118343,16.737109,73.564088,46.015927,POINT (-87.1271475 33.0158929),1,0
4,1009.0,AL,Alabama,Blount,"Blount, AL",Small Urban,58992.0,657.196555,89.7631,89.951502,...,532.836482,11.9,17.4,23.053471,18.72597,86.329839,50.192239,POINT (-86.56644 33.9773575),1,0


In [15]:
# features
columns = ["Community_Type", "RISK_SCORE", "Population_Density"]
X = NRI_county_health_data[columns]

# target
y = NRI_county_health_data["Life_Expectancy_mean_indicator"]

# Apply one-hot encoding for Community_Type nominal column
X = pd.get_dummies(X, columns=['Community_Type'], prefix='Community_Type', drop_first=False)
X

Unnamed: 0,RISK_SCORE,Population_Density,Community_Type_Rural,Community_Type_Small Urban,Community_Type_Urban
0,49.220490,96.260178,False,True,False
1,97.709195,112.985603,False,True,False
2,56.188355,27.538003,True,False,False
3,32.484887,35.161208,True,False,False
4,65.128858,89.763100,False,True,False
...,...,...,...,...,...
3138,6.617881,3.985817,True,False,False
3139,68.501432,5.457960,True,False,False
3140,21.699014,9.677509,True,False,False
3141,2.513522,3.382271,True,False,False


In [16]:
# split data into train and test

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.15, # is 15% okay? Or should we do 20%
                                                    random_state = 500) # I picked 500 because we have a larger datasset

In [17]:
# create model

# instantiate the modeling object
model = LogisticRegression()

# Fit the model
model.fit(X_train,y_train)

In [18]:
# predict 
y_pred = model.predict(X_test)

# calculate MSE in the test
life_expectancy_mse = mean_squared_error(y_test, y_pred)
life_expectancy_mse

# same MSE as the age adjusted death rate and I don't know why....I've tried different things and it's the same

0.4152542372881356

## Number of Physically Unhealthy Days

In [19]:
# create binary column that indicates if states are above or below national average for Average Number of Physically
# Unhealthy Days

# Physically Unhealthy Days national average
Physically_Unhealthy_Days_mean = NRI_county_health_data['Average_Number_of_Physically_Unhealthy_Days'].mean()

# create column
NRI_county_health_data['Physically_Unhealthy_Days_mean_indicator'] = np.where(NRI_county_health_data['Average_Number_of_Physically_Unhealthy_Days'] > 
                                                                  Physically_Unhealthy_Days_mean, 1, 0)
NRI_county_health_data.head()

Unnamed: 0,FIPS,STATEABBRV,State,County,County_State,Community_Type,POPULATION,AREA,Population_Density,Percent_Rural,...,Percent_Frequent_Physical_Distress,Percent_Frequent_Mental_Distress,Percent_Less_than_18_Years_of_Age,Percent_65_and_Over,Percent_Non_Hispanic_White,Percent_Female,Geolocation,Age_Adjusted_Death_Rate_mean_indicator,Life_Expectancy_mean_indicator,Physically_Unhealthy_Days_mean_indicator
0,1001.0,AL,Alabama,Autauga,"Autauga, AL",Small Urban,58764.0,610.470508,96.260178,42.002162,...,10.7,16.2,23.40638,16.048735,72.454522,51.378289,POINT (-86.6464395 32.5322367),0,0,0
1,1003.0,AL,Alabama,Baldwin,"Baldwin, AL",Small Urban,231365.0,2047.738775,112.985603,42.279099,...,9.9,16.1,21.21825,21.469824,83.148763,51.347715,POINT (-87.7460666 30.6592183),0,1,0
2,1005.0,AL,Alabama,Barbour,"Barbour, AL",Rural,25160.0,913.646511,27.538003,67.789635,...,14.9,18.0,20.549591,19.976767,45.30524,46.703253,POINT (-85.4051035 31.8702531),1,0,1
3,1007.0,AL,Alabama,Bibb,"Bibb, AL",Rural,22239.0,632.486798,35.161208,68.352607,...,12.7,17.4,20.118343,16.737109,73.564088,46.015927,POINT (-87.1271475 33.0158929),1,0,1
4,1009.0,AL,Alabama,Blount,"Blount, AL",Small Urban,58992.0,657.196555,89.7631,89.951502,...,11.9,17.4,23.053471,18.72597,86.329839,50.192239,POINT (-86.56644 33.9773575),1,0,1


In [20]:
# features
columns = ["Community_Type", "RISK_SCORE", "Population_Density"]
X = NRI_county_health_data[columns]

# target
y = NRI_county_health_data["Physically_Unhealthy_Days_mean_indicator"]

In [21]:
# Apply one-hot encoding for Community_Type nominal column
X = pd.get_dummies(X, columns=['Community_Type'], prefix='Community_Type', drop_first=False)

In [22]:
# split data into train and test

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.15, # is 15% okay? Or should we do 20%
                                                    random_state = 500) # I picked 500 because we have a larger datasset

In [23]:
# create model

# instantiate the modeling object
model_ols = LogisticRegression()

# Fit the model
model_ols.fit(X_train,y_train)

In [24]:
# predict 
y_pred = model.predict(X_test)

# calculate MSE in the test
physically_unhealthy_mse = mean_squared_error(y_test, y_pred)
physically_unhealthy_mse

0.5953389830508474

## Number of Mentally Unhealthy Days

In [25]:
# create binary column that indicates if states are above or below national average for Average Number of Mentally
# Unhealthy Days

# national average
Mentally_Unhealthy_Days_mean = NRI_county_health_data['Average_Number_of_Mentally_Unhealthy_Days'].mean()

# create column
NRI_county_health_data['Mentally_Unhealthy_Days_mean_indicator'] = np.where(NRI_county_health_data['Average_Number_of_Mentally_Unhealthy_Days'] > 
                                                                  Mentally_Unhealthy_Days_mean, 1, 0)

In [26]:
# features
columns = ["Community_Type", "RISK_SCORE", "Population_Density"]
X = NRI_county_health_data[columns]

# target
y = NRI_county_health_data["Mentally_Unhealthy_Days_mean_indicator"]

In [27]:
# Apply one-hot encoding for Community_Type nominal column
X = pd.get_dummies(X, columns=['Community_Type'], prefix='Community_Type', drop_first=False)

In [28]:
# split data into train and test

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.15, # is 15% okay? Or should we do 20%
                                                    random_state = 500) # I picked 500 because we have a larger datasset

In [29]:
# create model

# instantiate the modeling object
model = LogisticRegression()

# Fit the model
mentally_unhealthy_model = model.fit(X_train,y_train)

In [30]:
# predict 
y_pred = model.predict(X_test)

# calculate MSE in the test
mentally_unhealthy_mse = mean_squared_error(y_test, y_pred)
mentally_unhealthy_mse

0.4639830508474576

# Cross Validation Techniques

Going to use k-folds because I think it makes the most sense for this project and analysis

## Age Adjusted Death Rate

In [31]:
from sklearn.model_selection import train_test_split # Train-test split
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

### K-Fold

In [32]:
# features
columns = ["Community_Type", "RISK_SCORE", "Population_Density"]
X = NRI_county_health_data[columns]

# target
y = NRI_county_health_data["Age_Adjusted_Death_Rate_mean_indicator"]

In [33]:
# Apply one-hot encoding for Community_Type nominal column
X = pd.get_dummies(X, columns=['Community_Type'], prefix='Community_Type', drop_first=False)

In [34]:
# scale the variables - may improve accuracy
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [35]:
# instantiate the modeling object
model = LogisticRegression()

In [36]:
# number of folds
k_folds = 10  

# split the data into folds
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

In [37]:
# accuracy scores of each fold
scores = cross_val_score(model, X_scaled, y, cv=kf, scoring='accuracy')


In [38]:
print(f'Mean Accuracy: {np.mean(scores):.4f}')
print(f'Standard Deviation: {np.std(scores):.4f}')

Mean Accuracy: 0.6023
Standard Deviation: 0.0192


## Life Expectancy

### K-Fold

In [39]:
# features
columns = ["Community_Type", "RISK_SCORE", "Population_Density"]
X = NRI_county_health_data[columns]

# target
y = NRI_county_health_data["Life_Expectancy_mean_indicator"]

# Apply one-hot encoding for Community_Type nominal column
X = pd.get_dummies(X, columns=['Community_Type'], prefix='Community_Type', drop_first=False)

In [40]:
# scale the variables - may improve accuracy
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [41]:
# instantiate the modeling object
model = LogisticRegression()

In [42]:
# number of folds
k_folds = 10  

# split the data into folds
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

In [43]:
# accuracy scores of each fold
scores = cross_val_score(model, X_scaled, y, cv=kf, scoring='accuracy')

In [44]:
print(f'Mean Accuracy: {np.mean(scores):.4f}')
print(f'Standard Deviation: {np.std(scores):.4f}')

Mean Accuracy: 0.5756
Standard Deviation: 0.0211


## Physically Unhealthy Days

### K-Folds

In [45]:
# features
columns = ["Community_Type", "RISK_SCORE", "Population_Density"]
X = NRI_county_health_data[columns]

# target
y = NRI_county_health_data["Physically_Unhealthy_Days_mean_indicator"]

# Apply one-hot encoding for Community_Type nominal column
X = pd.get_dummies(X, columns=['Community_Type'], prefix='Community_Type', drop_first=False)

In [46]:
# scale the variables - may improve accuracy
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [47]:
# instantiate the modeling object
model = LogisticRegression()

In [48]:
# number of folds
k_folds = 10  

# split the data into folds
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

In [49]:
# accuracy scores of each fold
scores = cross_val_score(model, X_scaled, y, cv=kf, scoring='accuracy')

In [50]:
print(f'Mean Accuracy: {np.mean(scores):.4f}')
print(f'Standard Deviation: {np.std(scores):.4f}')

Mean Accuracy: 0.5937
Standard Deviation: 0.0214


## Mentally Unhealthy Days

### K Folds

In [51]:
# features
columns = ["Community_Type", "RISK_SCORE", "Population_Density"]
X = NRI_county_health_data[columns]

# target
y = NRI_county_health_data["Mentally_Unhealthy_Days_mean_indicator"]


# Apply one-hot encoding for Community_Type nominal column
X = pd.get_dummies(X, columns=['Community_Type'], prefix='Community_Type', drop_first=False)

In [52]:
# scale the variables - may improve accuracy
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [53]:
# instantiate the modeling object
model = LogisticRegression()

In [54]:
# number of folds
k_folds = 10  

# split the data into folds
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

In [55]:
# accuracy scores of each fold
scores = cross_val_score(model, X_scaled, y, cv=kf, scoring='accuracy')

In [56]:
print(f'Mean Accuracy: {np.mean(scores):.4f}')
print(f'Standard Deviation: {np.std(scores):.4f}')

Mean Accuracy: 0.5403
Standard Deviation: 0.0344


### Analysis:

Using k-fold cross validation improved the accuracy of the models by about 10%. Still not a great predictor, but it is a lot better and we should actually be able to use the models now.

# ML Prediction Models 2.0 - Decision Trees and Risk Score Squared

In [57]:
# create risk_score_squared column

NRI_county_health_data['risk_score_squared'] = NRI_county_health_data['RISK_SCORE']**2
NRI_county_health_data.head()

Unnamed: 0,FIPS,STATEABBRV,State,County,County_State,Community_Type,POPULATION,AREA,Population_Density,Percent_Rural,...,Percent_Less_than_18_Years_of_Age,Percent_65_and_Over,Percent_Non_Hispanic_White,Percent_Female,Geolocation,Age_Adjusted_Death_Rate_mean_indicator,Life_Expectancy_mean_indicator,Physically_Unhealthy_Days_mean_indicator,Mentally_Unhealthy_Days_mean_indicator,risk_score_squared
0,1001.0,AL,Alabama,Autauga,"Autauga, AL",Small Urban,58764.0,610.470508,96.260178,42.002162,...,23.40638,16.048735,72.454522,51.378289,POINT (-86.6464395 32.5322367),0,0,0,0,2422.656634
1,1003.0,AL,Alabama,Baldwin,"Baldwin, AL",Small Urban,231365.0,2047.738775,112.985603,42.279099,...,21.21825,21.469824,83.148763,51.347715,POINT (-87.7460666 30.6592183),0,1,0,0,9547.086795
2,1005.0,AL,Alabama,Barbour,"Barbour, AL",Rural,25160.0,913.646511,27.538003,67.789635,...,20.549591,19.976767,45.30524,46.703253,POINT (-85.4051035 31.8702531),1,0,1,1,3157.131246
3,1007.0,AL,Alabama,Bibb,"Bibb, AL",Rural,22239.0,632.486798,35.161208,68.352607,...,20.118343,16.737109,73.564088,46.015927,POINT (-87.1271475 33.0158929),1,0,1,1,1055.267887
4,1009.0,AL,Alabama,Blount,"Blount, AL",Small Urban,58992.0,657.196555,89.7631,89.951502,...,23.053471,18.72597,86.329839,50.192239,POINT (-86.56644 33.9773575),1,0,1,1,4241.768116


### Age Adjusted Death Rate

#### Decision Tree

In [58]:
# features
columns = ["Community_Type", "RISK_SCORE", "Population_Density", "risk_score_squared"]
X = NRI_county_health_data[columns]

# target
y = NRI_county_health_data["Age_Adjusted_Death_Rate_mean_indicator"]

# Apply one-hot encoding for Community_Type nominal column
X = pd.get_dummies(X, columns=['Community_Type'], prefix='Community_Type', drop_first=False)
X

Unnamed: 0,RISK_SCORE,Population_Density,risk_score_squared,Community_Type_Rural,Community_Type_Small Urban,Community_Type_Urban
0,49.220490,96.260178,2422.656634,False,True,False
1,97.709195,112.985603,9547.086795,False,True,False
2,56.188355,27.538003,3157.131246,True,False,False
3,32.484887,35.161208,1055.267887,True,False,False
4,65.128858,89.763100,4241.768116,False,True,False
...,...,...,...,...,...,...
3138,6.617881,3.985817,43.796349,True,False,False
3139,68.501432,5.457960,4692.446152,True,False,False
3140,21.699014,9.677509,470.847195,True,False,False
3141,2.513522,3.382271,6.317793,True,False,False


In [59]:
# split the data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# create splits
kf = KFold(n_splits=5)

# split data
k_splits = kf.split(X_train)


In [60]:
# create binary decision tree model and train it on the training set
# Create a decision tree model with limitations to avoid overfitting
model = DecisionTreeClassifier(
    max_depth=3,           # Limit the depth of the tree
    min_samples_split=2,   # Require a minimum number of samples to split a node
    min_samples_leaf=2,    # Require a minimum number of samples in a leaf node
    max_features=None,     # Use all features for splitting
    random_state=42
)
model.fit(X_train, y_train)

In [61]:
y_pred = model.predict(X_test)

# Calculate the mean squared error
mse = mean_squared_error(y_test, y_pred)
mse

0.3624801271860095

In [62]:
# make predictions for the entire dataset
y_pred_all = model.predict(X)

# add predicted values column
NRI_county_health_data['Natl_AADR_Predicted'] = y_pred_all

## Life Expectancy

### Decision Tree

In [63]:
# features
columns = ["Community_Type", "RISK_SCORE", "Population_Density", "risk_score_squared"]
X = NRI_county_health_data[columns]

# target
y = NRI_county_health_data["Life_Expectancy_mean_indicator"]

# Apply one-hot encoding for Community_Type nominal column
X = pd.get_dummies(X, columns=['Community_Type'], prefix='Community_Type', drop_first=False)

In [64]:
# split the data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# create splits
kf = KFold(n_splits=5)

# split data
k_splits = kf.split(X_train)


In [65]:
# create binary decision tree model and train it on the training set
# Create a decision tree model with limitations to avoid overfitting
model = DecisionTreeClassifier(
    max_depth=3,           # Limit the depth of the tree
    min_samples_split=2,   # Require a minimum number of samples to split a node
    min_samples_leaf=2,    # Require a minimum number of samples in a leaf node
    max_features=None,     # Use all features for splitting
    random_state=42
)
model.fit(X_train, y_train)

In [66]:
y_pred = model.predict(X_test)

# Calculate the mean squared error
mse = mean_squared_error(y_test, y_pred)
mse

0.3799682034976153

In [67]:
# make predictions for the entire dataset
y_pred_all = model.predict(X)

# add predicted values column
NRI_county_health_data['Natl_Life_Expectancy_Predicted'] = y_pred_all

In [68]:
#NRI_county_health_data.head()

## Physically Unhealthy Days

### Decision Trees

In [69]:
# features
columns = ["Community_Type", "RISK_SCORE", "Population_Density", "risk_score_squared"]
X = NRI_county_health_data[columns]

# target
y = NRI_county_health_data["Physically_Unhealthy_Days_mean_indicator"]

# Apply one-hot encoding for Community_Type nominal column
X = pd.get_dummies(X, columns=['Community_Type'], prefix='Community_Type', drop_first=False)

In [70]:
# split the data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# create splits
kf = KFold(n_splits=5)

# split data
k_splits = kf.split(X_train)


In [71]:
# create binary decision tree model and train it on the training set
# Create a decision tree model with limitations to avoid overfitting
model = DecisionTreeClassifier(
    max_depth=3,           # Limit the depth of the tree
    min_samples_split=2,   # Require a minimum number of samples to split a node
    min_samples_leaf=2,    # Require a minimum number of samples in a leaf node
    max_features=None,     # Use all features for splitting
    random_state=42
)
model.fit(X_train, y_train)

In [72]:
y_pred = model.predict(X_test)

# Calculate the mean squared error
mse = mean_squared_error(y_test, y_pred)
mse

0.39268680445151033

In [73]:
# create binary decision tree model and train it on the training set
#model = DecisionTreeClassifier(random_state=42)
#model.fit(X_train, y_train)

# make predictions for the entire dataset
y_pred_all = model.predict(X)

# add predicted values column
NRI_county_health_data['Natl_Physically_Unhealthy_Predicted'] = y_pred_all

## Mentally Unhealthy Days

### Decision Trees

In [74]:
# features
columns = ["Community_Type", "RISK_SCORE", "Population_Density", "risk_score_squared"]
X = NRI_county_health_data[columns]

# target
y = NRI_county_health_data["Mentally_Unhealthy_Days_mean_indicator"]

# Apply one-hot encoding for Community_Type nominal column
X = pd.get_dummies(X, columns=['Community_Type'], prefix='Community_Type', drop_first=False)

In [75]:
# split the data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# create splits
kf = KFold(n_splits=5)

# split data
k_splits = kf.split(X_train)

In [76]:
# create binary decision tree model and train it on the training set
# Create a decision tree model with limitations to avoid overfitting
model = DecisionTreeClassifier(
    max_depth=3,           # Limit the depth of the tree
    min_samples_split=2,   # Require a minimum number of samples to split a node
    min_samples_leaf=2,    # Require a minimum number of samples in a leaf node
    max_features=None,     # Use all features for splitting
    random_state=42
)
model.fit(X_train, y_train)

In [77]:
y_pred = model.predict(X_test)

# Calculate the mean squared error
mse = mean_squared_error(y_test, y_pred)
mse

0.3958664546899841

In [78]:
# make predictions for the entire dataset
y_pred_all = model.predict(X)

# add predicted values column
NRI_county_health_data['Natl_Mentally_Unhealthy_Predicted'] = y_pred_all

In [79]:
NRI_county_health_data.head(10)

Unnamed: 0,FIPS,STATEABBRV,State,County,County_State,Community_Type,POPULATION,AREA,Population_Density,Percent_Rural,...,Geolocation,Age_Adjusted_Death_Rate_mean_indicator,Life_Expectancy_mean_indicator,Physically_Unhealthy_Days_mean_indicator,Mentally_Unhealthy_Days_mean_indicator,risk_score_squared,Natl_AADR_Predicted,Natl_Life_Expectancy_Predicted,Natl_Physically_Unhealthy_Predicted,Natl_Mentally_Unhealthy_Predicted
0,1001.0,AL,Alabama,Autauga,"Autauga, AL",Small Urban,58764.0,610.470508,96.260178,42.002162,...,POINT (-86.6464395 32.5322367),0,0,0,0,2422.656634,0,1,0,1
1,1003.0,AL,Alabama,Baldwin,"Baldwin, AL",Small Urban,231365.0,2047.738775,112.985603,42.279099,...,POINT (-87.7460666 30.6592183),0,1,0,0,9547.086795,0,1,0,1
2,1005.0,AL,Alabama,Barbour,"Barbour, AL",Rural,25160.0,913.646511,27.538003,67.789635,...,POINT (-85.4051035 31.8702531),1,0,1,1,3157.131246,1,0,1,1
3,1007.0,AL,Alabama,Bibb,"Bibb, AL",Rural,22239.0,632.486798,35.161208,68.352607,...,POINT (-87.1271475 33.0158929),1,0,1,1,1055.267887,1,0,1,1
4,1009.0,AL,Alabama,Blount,"Blount, AL",Small Urban,58992.0,657.196555,89.7631,89.951502,...,POINT (-86.56644 33.9773575),1,0,1,1,4241.768116,0,1,0,1
5,1011.0,AL,Alabama,Bullock,"Bullock, AL",Rural,10326.0,631.454512,16.352722,51.374382,...,POINT (-85.7172613 32.1017589),1,0,1,1,1053.201773,1,0,1,1
6,1013.0,AL,Alabama,Butler,"Butler, AL",Rural,19015.0,785.734461,24.200288,71.232157,...,POINT (-86.6819689 31.751667),1,0,1,1,2922.119657,1,0,1,1
7,1015.0,AL,Alabama,Calhoun,"Calhoun, AL",Small Urban,116250.0,618.455143,187.968362,33.696826,...,POINT (-85.8279089 33.7705162),1,0,1,1,7119.643985,0,1,0,1
8,1017.0,AL,Alabama,Chambers,"Chambers, AL",Rural,34738.0,609.202347,57.022105,49.148034,...,POINT (-85.3940321 32.9155039),1,0,1,1,1793.359464,1,0,1,1
9,1019.0,AL,Alabama,Cherokee,"Cherokee, AL",Rural,24933.0,606.033346,41.1413,85.736273,...,POINT (-85.6542417 34.0695153),1,0,1,1,1769.188665,1,0,1,1


In [80]:
# save updated dataset
NRI_county_health_data.to_csv("NRI_county_health_data_predictions.csv", sep=',', index=False, encoding='utf-8')

## Southeast Predictions

In [81]:
NRI_county_health_data_predictions = pd.read_csv("NRI_county_health_data_predictions.csv")
#NRI_county_health_data_predictions.head()

In [82]:
# define southeast states
southeast_states = ["FL", "GA", "TN", "NC", "SC", "AL", "MS", "KY", "WV", "VA"]

# filter dataframe to only include southeast states
NRI_county_health_data_southeast = NRI_county_health_data_predictions[NRI_county_health_data_predictions['STATEABBRV'].isin(southeast_states)]
NRI_county_health_data_southeast


Unnamed: 0,FIPS,STATEABBRV,State,County,County_State,Community_Type,POPULATION,AREA,Population_Density,Percent_Rural,...,Geolocation,Age_Adjusted_Death_Rate_mean_indicator,Life_Expectancy_mean_indicator,Physically_Unhealthy_Days_mean_indicator,Mentally_Unhealthy_Days_mean_indicator,risk_score_squared,Natl_AADR_Predicted,Natl_Life_Expectancy_Predicted,Natl_Physically_Unhealthy_Predicted,Natl_Mentally_Unhealthy_Predicted
0,1001.0,AL,Alabama,Autauga,"Autauga, AL",Small Urban,58764.0,610.470508,96.260178,42.002162,...,POINT (-86.6464395 32.5322367),0,0,0,0,2422.656634,0,1,0,1
1,1003.0,AL,Alabama,Baldwin,"Baldwin, AL",Small Urban,231365.0,2047.738775,112.985603,42.279099,...,POINT (-87.7460666 30.6592183),0,1,0,0,9547.086795,0,1,0,1
2,1005.0,AL,Alabama,Barbour,"Barbour, AL",Rural,25160.0,913.646511,27.538003,67.789635,...,POINT (-85.4051035 31.8702531),1,0,1,1,3157.131246,1,0,1,1
3,1007.0,AL,Alabama,Bibb,"Bibb, AL",Rural,22239.0,632.486798,35.161208,68.352607,...,POINT (-87.1271475 33.0158929),1,0,1,1,1055.267887,1,0,1,1
4,1009.0,AL,Alabama,Blount,"Blount, AL",Small Urban,58992.0,657.196555,89.763100,89.951502,...,POINT (-86.56644 33.9773575),1,0,1,1,4241.768116,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3043,54101.0,WV,West Virginia,Webster,"Webster, WV",Rural,8359.0,561.844152,14.877791,100.000000,...,POINT (-80.4490515 38.483459),1,0,1,1,81.648445,1,0,1,1
3044,54103.0,WV,West Virginia,Wetzel,"Wetzel, WV",Rural,14420.0,364.977209,39.509316,53.856359,...,POINT (-80.6353994 39.59818),1,0,1,1,606.447429,1,0,1,1
3045,54105.0,WV,West Virginia,Wirt,"Wirt, WV",Rural,5184.0,237.135907,21.860882,100.000000,...,POINT (-81.3829749 39.0200337),1,0,1,1,4.409599,1,0,1,1
3046,54107.0,WV,West Virginia,Wood,"Wood, WV",Small Urban,84238.0,380.570013,221.346919,26.774461,...,POINT (-81.516234 39.2116023),1,0,1,1,760.938281,0,1,1,1


#### Age Adjusted Death Rate

In [83]:
# features
columns = ["Community_Type", "RISK_SCORE", "Population_Density", "risk_score_squared"]
X = NRI_county_health_data_southeast[columns]

# target
y = NRI_county_health_data_southeast["Age_Adjusted_Death_Rate_mean_indicator"]

# Apply one-hot encoding for Community_Type nominal column
X = pd.get_dummies(X, columns=['Community_Type'], prefix='Community_Type', drop_first=True)
X

Unnamed: 0,RISK_SCORE,Population_Density,risk_score_squared,Community_Type_Small Urban,Community_Type_Urban
0,49.220490,96.260178,2422.656634,True,False
1,97.709195,112.985603,9547.086795,True,False
2,56.188355,27.538003,3157.131246,False,False
3,32.484887,35.161208,1055.267887,False,False
4,65.128858,89.763100,4241.768116,True,False
...,...,...,...,...,...
3043,9.035953,14.877791,81.648445,False,False
3044,24.626153,39.509316,606.447429,False,False
3045,2.099905,21.860882,4.409599,False,False
3046,27.585110,221.346919,760.938281,True,False


In [84]:
# train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# create splits
kf = KFold(n_splits=5)

# split data
k_splits = kf.split(X_train)

# create binary decision tree model and train it on the training set
# Create a decision tree model with limitations to avoid overfitting
model = DecisionTreeClassifier(
    max_depth=3,           # Limit the depth of the tree
    min_samples_split=2,   # Require a minimum number of samples to split a node
    min_samples_leaf=2,    # Require a minimum number of samples in a leaf node
    max_features=None,     # Use all features for splitting
    random_state=42
)
model.fit(X_train, y_train)

In [85]:
y_pred = model.predict(X_test)

# Calculate the mean squared error
mse = mean_squared_error(y_test, y_pred)
mse

0.25405405405405407

In [86]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.745945945945946

In [87]:
# make predictions for the entire dataset
y_pred_all = model.predict(X)

# add predicted values column
NRI_county_health_data_southeast['SE_AADR_Predicted'] = y_pred_all

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NRI_county_health_data_southeast['SE_AADR_Predicted'] = y_pred_all


#### Life Expectancy

In [88]:
# features
columns = ["Community_Type", "RISK_SCORE", "Population_Density", "risk_score_squared"]
X = NRI_county_health_data_southeast[columns]

# target
y = NRI_county_health_data_southeast["Life_Expectancy_mean_indicator"]

# Apply one-hot encoding for Community_Type nominal column
X = pd.get_dummies(X, columns=['Community_Type'], prefix='Community_Type', drop_first=True)

# train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# create splits
kf = KFold(n_splits=5)

# split data
k_splits = kf.split(X_train)

# create binary decision tree model and train it on the training set
# Create a decision tree model with limitations to avoid overfitting
model = DecisionTreeClassifier(
    max_depth=3,           # Limit the depth of the tree
    min_samples_split=2,   # Require a minimum number of samples to split a node
    min_samples_leaf=2,    # Require a minimum number of samples in a leaf node
    max_features=None,     # Use all features for splitting
    random_state=42
)
model.fit(X_train, y_train)

In [89]:
y_pred = model.predict(X_test)

# Calculate the mean squared error
mse = mean_squared_error(y_test, y_pred)
mse

0.20540540540540542

In [90]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.7945945945945946

In [91]:
# make predictions for the entire dataset
y_pred_all = model.predict(X)

# add predicted values column
NRI_county_health_data_southeast['SE_Life_Expectancy_Predicted'] = y_pred_all

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NRI_county_health_data_southeast['SE_Life_Expectancy_Predicted'] = y_pred_all


#### Physically Unhealthy Days

In [92]:
# features
columns = ["Community_Type", "RISK_SCORE", "Population_Density", "risk_score_squared"]
X = NRI_county_health_data_southeast[columns]

# target
y = NRI_county_health_data_southeast["Physically_Unhealthy_Days_mean_indicator"]

# Apply one-hot encoding for Community_Type nominal column
X = pd.get_dummies(X, columns=['Community_Type'], prefix='Community_Type', drop_first=True)

# train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# create splits
kf = KFold(n_splits=5)

# split data
k_splits = kf.split(X_train)

# create binary decision tree model and train it on the training set
# Create a decision tree model with limitations to avoid overfitting
model = DecisionTreeClassifier(
    max_depth=3,           # Limit the depth of the tree
    min_samples_split=2,   # Require a minimum number of samples to split a node
    min_samples_leaf=2,    # Require a minimum number of samples in a leaf node
    max_features=None,     # Use all features for splitting
    random_state=42
)
model.fit(X_train, y_train)

In [93]:
y_pred = model.predict(X_test)

# Calculate the mean squared error
mse = mean_squared_error(y_test, y_pred)
mse

0.2702702702702703

In [94]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.7297297297297297

In [95]:
# make predictions for the entire dataset
y_pred_all = model.predict(X)

# add predicted values column
NRI_county_health_data_southeast['SE_Physically_Unhealthy_Predicted'] = y_pred_all

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NRI_county_health_data_southeast['SE_Physically_Unhealthy_Predicted'] = y_pred_all


#### Mentally Unhealthy Days

In [96]:
# features
columns = ["Community_Type", "RISK_SCORE", "Population_Density", "risk_score_squared"]
X = NRI_county_health_data_southeast[columns]

# target
y = NRI_county_health_data_southeast["Mentally_Unhealthy_Days_mean_indicator"]

# Apply one-hot encoding for Community_Type nominal column
X = pd.get_dummies(X, columns=['Community_Type'], prefix='Community_Type', drop_first=True)

# train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# create splits
kf = KFold(n_splits=5)

# split data
k_splits = kf.split(X_train)

# create binary decision tree model and train it on the training set
# Create a decision tree model with limitations to avoid overfitting
model = DecisionTreeClassifier(
    max_depth=3,           # Limit the depth of the tree
    min_samples_split=2,   # Require a minimum number of samples to split a node
    min_samples_leaf=2,    # Require a minimum number of samples in a leaf node
    max_features=None,     # Use all features for splitting
    random_state=42
)
model.fit(X_train, y_train)

In [97]:
y_pred = model.predict(X_test)

# Calculate the mean squared error
mse = mean_squared_error(y_test, y_pred)
mse

0.32972972972972975

In [98]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.6702702702702703

In [99]:
# make predictions for the entire dataset
y_pred_all = model.predict(X)

# add predicted values column
NRI_county_health_data_southeast['SE_Mentally_Unhealthy_Predicted'] = y_pred_all

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NRI_county_health_data_southeast['SE_Mentally_Unhealthy_Predicted'] = y_pred_all


In [100]:
importance_values = model.feature_importances_

feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': importance_values})

# variable feature importance in model
print(feature_importance_df)

                      Feature  Importance
0                  RISK_SCORE    0.431935
1          Population_Density    0.215860
2          risk_score_squared    0.325873
3  Community_Type_Small Urban    0.000000
4        Community_Type_Urban    0.026332


In [101]:
NRI_county_health_data_southeast.head()

Unnamed: 0,FIPS,STATEABBRV,State,County,County_State,Community_Type,POPULATION,AREA,Population_Density,Percent_Rural,...,Mentally_Unhealthy_Days_mean_indicator,risk_score_squared,Natl_AADR_Predicted,Natl_Life_Expectancy_Predicted,Natl_Physically_Unhealthy_Predicted,Natl_Mentally_Unhealthy_Predicted,SE_AADR_Predicted,SE_Life_Expectancy_Predicted,SE_Physically_Unhealthy_Predicted,SE_Mentally_Unhealthy_Predicted
0,1001.0,AL,Alabama,Autauga,"Autauga, AL",Small Urban,58764.0,610.470508,96.260178,42.002162,...,0,2422.656634,0,1,0,1,1,0,0,1
1,1003.0,AL,Alabama,Baldwin,"Baldwin, AL",Small Urban,231365.0,2047.738775,112.985603,42.279099,...,0,9547.086795,0,1,0,1,1,0,1,1
2,1005.0,AL,Alabama,Barbour,"Barbour, AL",Rural,25160.0,913.646511,27.538003,67.789635,...,1,3157.131246,1,0,1,1,1,0,1,1
3,1007.0,AL,Alabama,Bibb,"Bibb, AL",Rural,22239.0,632.486798,35.161208,68.352607,...,1,1055.267887,1,0,1,1,1,0,1,1
4,1009.0,AL,Alabama,Blount,"Blount, AL",Small Urban,58992.0,657.196555,89.7631,89.951502,...,1,4241.768116,0,1,0,1,1,0,1,1


In [102]:
# save updated dataset
#NRI_county_health_data_southeast.to_csv("NRI_county_health_data_predictions_southeast.csv", sep=',', index=False, encoding='utf-8')




In [106]:
# Life Expectancy Accuracy

# Check if values in 'Column1' are equal to values in 'Column2' for each row
NRI_county_health_data_southeast['AADR_Match'] = NRI_county_health_data_southeast['Age_Adjusted_Death_Rate_mean_indicator'] == NRI_county_health_data_southeast['SE_AADR_Predicted']

# Calculate the percentage of matching rows
percentage_matching = (NRI_county_health_data_southeast['AADR_Match'].sum() / len(NRI_county_health_data_southeast)) * 100
percentage_matching


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NRI_county_health_data_southeast['AADR_Match'] = NRI_county_health_data_southeast['Age_Adjusted_Death_Rate_mean_indicator'] == NRI_county_health_data_southeast['SE_AADR_Predicted']


77.92207792207793