In [1]:
import numpy as np
import pandas as pd
import csv

import sqlite3

from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine

# Setting engine for SQLite connection
engine = create_engine("sqlite:///birthdata.sqlite", echo=False)


In [42]:
# Checking classes were made successfully: https://stackoverflow.com/questions/42946174/sqlalchemy-automap-not-generating-base-classes-table-name
engine = create_engine("sqlite:///birthdata.sqlite", echo=False)

# Declare a Base using `automap_base()`
Base = automap_base()

# Use the Base class to reflect the database tables
Base.prepare(engine, reflect=True)

# Print all of the classes mapped to the Base
print(Base.classes.keys())

# Create a session
session = Session(engine)


['clinic2010_county', 'clinic2010_state', 'clinic2015_county', 'clinic2015_state', 'county', 'national', 'outcomes']


In [43]:
# Assign the classes to variables
County2015 = Base.classes.clinic2015_county
State2015 = Base.classes.clinic2015_state
County2010 = Base.classes.clinic2010_county
State2010 = Base.classes.clinic2010_state
Outcomes = Base.classes.outcomes
County_births = Base.classes.county
State_births = Base.classes.national

In [44]:
selected = County2015

# Display the row's columns and data in dictionary format
first_row = session.query(selected).first()
first_row.__dict__

{'_sa_instance_state': <sqlalchemy.orm.state.InstanceState at 0x7fe53eba3af0>,
 'fed_center': 1,
 'other_clinic_client': 0,
 'pp_tt': 0,
 'dept_clinic': 1,
 'total_client_tt': 870.0,
 'other_clinic_tt': 0,
 'hospital': 0,
 'fed_client_tt': 0,
 'index': 0,
 'total_client': 1040.0,
 'pp': 0,
 'dept_clinic_client_tt': 870,
 'fed_client': 170,
 'state': 'Alabama',
 'other_clinic': 0,
 'hospital_client_tt': 0,
 'fips': 1001,
 'dept_clinic_client': 870,
 'total_titleten': 1,
 'pp_client_tt': 0,
 'county': 'Autauga ',
 'hospital_client': 0,
 'fed_center_tt': 0,
 'other_clinic_client_tt': 0,
 'total_clinics': 2,
 'pp_client': 0,
 'dept_clinic_tt': 1,
 'hospital_tt': 0}

In [45]:
# querying sqlite

join_query = session.query(County2015.fips, County2015.total_clinics, County2015.total_titleten, County2015.pp,\
                           County2015.dept_clinic, County2015.hospital, County2015.total_client_tt,\
                           County2015.pp_client, County2015.dept_clinic_tt, County2015.pp_tt,\
                           County2015.total_client, County2015.hospital_client, County_births.birth_rate,\
                           County_births.year, County_births.state, County_births.county)\
                    .join(County_births, County_births.combined_fips_code == County2015.fips)\
                    .filter(County_births.year=="2016")


county_df = pd.DataFrame(join_query, columns=["FIPS", "total_clinics", "total_title10", "total_pp", "health_dept_clinics", 
                                              "hospitals","title_10_clients","pp_clients", "dept_clinic_title10","pp_tt",
                                              "total_clients","hospital_client","birth_rate", "year", "state", "county"])

# Drop NaN rows, which will mess with the ML
county_df = county_df.dropna()

In [46]:
session.close()

In [47]:
county_df.head()

Unnamed: 0,FIPS,total_clinics,total_title10,total_pp,health_dept_clinics,hospitals,title_10_clients,pp_clients,dept_clinic_title10,pp_tt,total_clients,hospital_client,birth_rate,year,state,county
0,1001,2,1,0,1,0,870.0,0,1,0,1040.0,0,23.1,2016,Alabama,Autauga
1,1003,6,2,0,2,0,990.0,0,2,0,2010.0,0,25.6,2016,Alabama,Baldwin
2,1005,3,2,0,2,0,900.0,0,2,0,940.0,0,36.6,2016,Alabama,Barbour
3,1007,5,1,0,1,0,510.0,0,1,0,710.0,0,36.5,2016,Alabama,Bibb
4,1009,2,1,0,1,0,1200.0,0,1,0,1290.0,0,30.6,2016,Alabama,Blount


In [48]:
county_df.shape

(3100, 16)

In [49]:
county_df["year"].unique()

array([2016])

In [2]:
# adding population and SVI data

county_populations = pd.read_csv("county_populations.csv", encoding='latin-1')
county_populations

Unnamed: 0,state,county,2010_population,2015_population
0,Iowa,Adair,7682,7145
1,Kentucky,Adair,18656,19162
2,Missouri,Adair,25607,25353
3,Oklahoma,Adair,22683,22259
4,Colorado,Adams,441603,490443
...,...,...,...,...
3136,California,Yuba,72155,74045
3137,Alaska,Yukon-Koyukuk,5588,5465
3138,Texas,Zapata,14018,14493
3139,Texas,Zavala,11677,12310


In [51]:
county_df = county_df.merge(county_populations, how='left', on=["state","county"])

In [52]:
county_SVI = pd.read_csv("2016_SVI_extract.csv")
county_SVI=county_SVI.drop(columns=["state","county"])

In [53]:
county_df=county_df.merge(county_SVI, how='left', on="FIPS")

In [55]:
# transforming data to a per capita basis

county_df["clinics_per_capita"] = county_df["total_clinics"]/county_df["2015_population"]
county_df["title10_clinics_per_capita"] = county_df["total_title10"]/county_df["2015_population"]
county_df["pp_per_capita"] = county_df["total_pp"]/county_df["2015_population"]
county_df["health_dept_per_capita"] = county_df['health_dept_clinics']/county_df["2015_population"]
county_df["hospitals_per_capita"] = county_df['hospitals']/county_df["2015_population"]
county_df["title_10_clients_per_capita"] = county_df["title_10_clients"]/county_df["2015_population"]
county_df["pp_clients_per_capita"] = county_df['pp_clients']/county_df["2015_population"]
county_df["dept_clinic_title10_per_capita"] = county_df['dept_clinic_title10']/county_df["2015_population"]

In [56]:
county_df = county_df.dropna()
county_df.shape

(3071, 29)

In [57]:
county_df.columns

Index(['FIPS', 'total_clinics', 'total_title10', 'total_pp',
       'health_dept_clinics', 'hospitals', 'title_10_clients', 'pp_clients',
       'dept_clinic_title10', 'pp_tt', 'total_clients', 'hospital_client',
       'birth_rate', 'year', 'state', 'county', '2010_population',
       '2015_population', 'SVI_sum_of_indicators', 'SVI_ranking',
       'percent_uninsured', 'clinics_per_capita', 'title10_clinics_per_capita',
       'pp_per_capita', 'health_dept_per_capita', 'hospitals_per_capita',
       'title_10_clients_per_capita', 'pp_clients_per_capita',
       'dept_clinic_title10_per_capita'],
      dtype='object')

In [58]:
# prepping the model

X = county_df[['clinics_per_capita',
       'title10_clinics_per_capita', 'pp_per_capita', 'health_dept_per_capita',
       'hospitals_per_capita', 'title_10_clients_per_capita',
       'pp_clients_per_capita', 'dept_clinic_title10_per_capita','percent_uninsured','SVI_sum_of_indicators']]
y = county_df["birth_rate"]

feature_names = ['clinics_per_capita',
       'title10_clinics_per_capita', 'pp_per_capita', 'health_dept_per_capita',
       'hospitals_per_capita', 'title_10_clients_per_capita',
       'pp_clients_per_capita', 'dept_clinic_title10_per_capita','percent_uninsured','SVI_sum_of_indicators']

print("Shape: ", X.shape, y.shape)

Shape:  (3071, 10) (3071,)


In [59]:
# train test split

from sklearn.model_selection import train_test_split # may not apply to all models


X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [60]:
# create base model

from sklearn.ensemble import RandomForestRegressor

base_model = RandomForestRegressor(max_depth=7, n_estimators=100, random_state=1)

In [61]:
base_model.fit(X_train, y_train)

RandomForestRegressor(max_depth=7, random_state=1)

In [62]:
from sklearn.metrics import mean_squared_error, r2_score

# Use our model to make predictions
base_model_predicted = base_model.predict(X_test)

# Score the predictions with mse and r2
mse = mean_squared_error(y_test, base_model_predicted)
r2 = r2_score(y_test, base_model_predicted)

print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2 ): {r2}")

Mean Squared Error (MSE): 81.95427809246755
R-squared (R2 ): 0.5053740044409091


In [63]:
sorted(zip(base_model.feature_importances_, feature_names), reverse=True)

[(0.7023997185548906, 'SVI_sum_of_indicators'),
 (0.09968606609982379, 'percent_uninsured'),
 (0.04802607734822058, 'pp_clients_per_capita'),
 (0.033695275236732776, 'pp_per_capita'),
 (0.03125840246242696, 'clinics_per_capita'),
 (0.021647816730657146, 'health_dept_per_capita'),
 (0.021457949270325172, 'title10_clinics_per_capita'),
 (0.0209969622034526, 'title_10_clients_per_capita'),
 (0.017878148764163024, 'dept_clinic_title10_per_capita'),
 (0.002953583329307376, 'hospitals_per_capita')]

In [64]:
# gridsearch with random forest: https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74a77dd74

# start with a randomized search to narrow down the field

from sklearn.model_selection import RandomizedSearchCV

params = {'bootstrap': [True, False],
         'max_depth': [3,5,10,20,50, None],
         'max_features': ['auto', 'sqrt'],
         'min_samples_leaf': [1, 2, 4],
         'min_samples_split': [2, 5, 10],
         'n_estimators': [100,200, 400, 600, 800, 1000, 1250, 1500, 2000]}

rf_random = RandomizedSearchCV(estimator = RandomForestRegressor(), param_distributions = params, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

In [65]:
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   33.0s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  4.4min finished


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [3, 5, 10, 20, 50, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [100, 200, 400, 600,
                                                         800, 1000, 1250, 1500,
                                                         2000]},
                   random_state=42, verbose=2)

In [66]:
rf_random.best_params_

{'n_estimators': 2000,
 'min_samples_split': 10,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 10,
 'bootstrap': True}

In [67]:
random_tuned_model = rf_random.best_estimator_

In [68]:
random_tuned_predictions = random_tuned_model.predict(X_test)

# Score the predictions with mse and r2
mse = mean_squared_error(y_test, random_tuned_predictions)
r2 = r2_score(y_test, random_tuned_predictions)

print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2 ): {r2}")


Mean Squared Error (MSE): 78.19723176946061
R-squared (R2 ): 0.528049242648513


In [74]:
# grid search based on best parameters from random search

from sklearn.model_selection import GridSearchCV

params = {'bootstrap': [True],
         'max_depth': [8,10,15],
         'max_features': ['sqrt'],
         'min_samples_leaf': [3, 4, 6],
         'min_samples_split': [8,10,20],
         'n_estimators': [1500,2000,3000]}

rf_grid = GridSearchCV(estimator = RandomForestRegressor(), param_grid = params, cv = 3, n_jobs = -1, verbose = 2)

In [75]:
rf_grid.fit(X_train, y_train)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   30.8s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 243 out of 243 | elapsed:  4.3min finished


GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [8, 10, 15],
                         'max_features': ['sqrt'],
                         'min_samples_leaf': [3, 4, 6],
                         'min_samples_split': [8, 10, 20],
                         'n_estimators': [1500, 2000, 3000]},
             verbose=2)

In [78]:
rf_grid.best_params_

{'bootstrap': True,
 'max_depth': 8,
 'max_features': 'sqrt',
 'min_samples_leaf': 3,
 'min_samples_split': 10,
 'n_estimators': 1500}

In [79]:
grid_tuned_model = rf_grid.best_estimator_

In [80]:
grid_tuned_predictions = grid_tuned_model.predict(X_test)

# Score the predictions with mse and r2
mse = mean_squared_error(y_test, grid_tuned_predictions)
r2 = r2_score(y_test, grid_tuned_predictions)

print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2 ): {r2}")

Mean Squared Error (MSE): 78.61278744209343
R-squared (R2 ): 0.5255412022744118


In [81]:
# at this point we're not getting much benefit from continued tuning

final_model=grid_tuned_model

In [82]:
# feature importance

sorted(zip(final_model.feature_importances_, feature_names), reverse=True)

[(0.4561716889778693, 'SVI_sum_of_indicators'),
 (0.2500410024861704, 'percent_uninsured'),
 (0.06558019560976627, 'clinics_per_capita'),
 (0.06520519725046772, 'health_dept_per_capita'),
 (0.036796023389452176, 'title10_clinics_per_capita'),
 (0.03587723852244998, 'dept_clinic_title10_per_capita'),
 (0.03196288983171379, 'title_10_clients_per_capita'),
 (0.02827097278364375, 'pp_clients_per_capita'),
 (0.02594190191801612, 'pp_per_capita'),
 (0.004152889230450313, 'hospitals_per_capita')]

In [None]:
# final model is ready to pickle!