In [150]:
import pandas as pd
import random
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor


In [162]:
df = pd.read_csv("Data/county_data_clean.csv").drop(columns="Unnamed: 0")

In [106]:
df.shape

(3194, 156)

In [10]:
# Look to see which variables are not collinear
indep_df = df.drop(columns=['state_fips_code', 'county_fips_code', '5_digit_fips_code', 'state_abbreviation', 'name',
                           'uninsured', 'uninsured_adults', 'uninsured_children'])

corr = indep_df.corr()

corr = corr[(corr < 0.7)&(corr > -0.7)]

indep_features = corr.columns.to_list()

indep_features

['premature_death',
 'premature_death_black',
 'premature_death_hispanic',
 'premature_death_white',
 'poor_or_fair_health',
 'poor_physical_health_days',
 'poor_mental_health_days',
 'low_birthweight',
 'low_birthweight_black',
 'low_birthweight_hispanic',
 'low_birthweight_white',
 'adult_smoking',
 'adult_obesity',
 'food_environment_index',
 'physical_inactivity',
 'access_to_exercise_opportunities',
 'excessive_drinking',
 'alcohol_impaired_driving_deaths',
 'sexually_transmitted_infections',
 'teen_births',
 'teen_births_black',
 'teen_births_hispanic',
 'teen_births_white',
 'primary_care_physicians',
 'ratio_of_population_to_primary_care_physicians',
 'dentists',
 'ratio_of_population_to_dentists',
 'mental_health_providers',
 'ratio_of_population_to_mental_health_providers',
 'preventable_hospital_stays',
 'preventable_hospital_stays_black',
 'preventable_hospital_stays_hispanic',
 'preventable_hospital_stays_white',
 'mammography_screening',
 'mammography_screening_black',
 '

In [12]:
df = df[indep_features]

Unnamed: 0,premature_death,premature_death_black,premature_death_hispanic,premature_death_white,poor_or_fair_health,poor_physical_health_days,poor_mental_health_days,low_birthweight,low_birthweight_black,low_birthweight_hispanic,...,male_population_18_44,male_population_45_64,male_population_65,total_male_population,female_population_0_17,female_population_18_44,female_population_45_64,female_population_65,total_female_population,population_growth
0,6900.630354,,,,,,,0.080893,,,...,,,,,,,,,,
1,9917.232898,,,,0.214024,4.400458,4.577367,0.101455,,,...,,,,,,,,,,
2,8824.057123,10471.252986,,8706.658832,0.184111,4.200578,4.306739,0.084757,0.126203,,...,,,,,,,,,,
3,7224.632160,10042.472874,3086.605695,7277.780727,0.180605,4.098748,4.249649,0.083387,0.146861,0.048739,...,,,,,,,,,,
4,9586.165037,11332.562909,,7309.636719,0.257734,5.067438,4.634994,0.109526,0.144968,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3189,7497.439952,,6413.215910,7834.050381,0.153772,3.536556,3.543546,0.094205,,0.090909,...,,,,,,,,,,
3190,3786.128226,,,,0.121817,3.166316,3.072253,0.074739,,0.079295,...,,,,,,,,,,
3191,7790.302043,,,,0.158858,3.677538,3.699922,0.096475,,0.111111,...,,,,,,,,,,
3192,5504.650970,,,,0.161261,3.601687,3.479694,0.073209,,,...,,,,,,,,,,


In [130]:
# Split up the data set into training and testing
random_train_indices = []

training_length = round((len(df.index.to_list()))* 0.8)

already_in = []
for number in range(training_length):
    random_index = random.randint(0, len(df.index.to_list())-1)
    while random_index in already_in:
        random_index = random.randint(0, len(df.index.to_list())-1)
    random_train_indices.append(random_index)
    already_in.append(random_index)

In [136]:
train_data = df.iloc[random_train_indices].drop(columns=['level_0', 'index'])
train_data.head()



Unnamed: 0,state_fips_code,county_fips_code,5_digit_fips_code,state_abbreviation,name,premature_death,premature_death_black,premature_death_hispanic,premature_death_white,poor_or_fair_health,...,male_population_18_44,male_population_45_64,male_population_65,total_male_population,female_population_0_17,female_population_18_44,female_population_45_64,female_population_65,total_female_population,population_growth
907,20,3,20003,KS,Anderson County,8362.597883,,,,0.151582,...,,,,,,,,,,
180,5,129,5129,AR,Searcy County,10636.495489,,,,0.206718,...,,,,,,,,,,
2673,48,211,48211,TX,Hemphill County,7651.480058,,,,0.164822,...,,,,,,,,,,
1177,22,89,22089,LA,St. Charles Parish,7306.671157,10013.797075,,6246.550385,0.171539,...,,,,,,,,,,
811,19,11,19011,IA,Benton County,6862.75922,,,,0.115603,...,,,,,,,,,,


In [137]:
test_data = df.drop(index = random_train_indices, columns=['level_0', 'index'])
test_data.head()


Unnamed: 0,state_fips_code,county_fips_code,5_digit_fips_code,state_abbreviation,name,premature_death,premature_death_black,premature_death_hispanic,premature_death_white,poor_or_fair_health,...,male_population_18_44,male_population_45_64,male_population_65,total_male_population,female_population_0_17,female_population_18_44,female_population_45_64,female_population_65,total_female_population,population_growth
2,1,1,1001,AL,Autauga County,8824.057123,10471.252986,,8706.658832,0.184111,...,,,,,,,,,,
5,1,7,1007,AL,Bibb County,11783.543675,14812.53928,,11327.563749,0.199969,...,,,,,,,,,,
6,1,9,1009,AL,Blount County,10908.101822,,5619.645186,11336.046321,0.210953,...,,,,,,,,,,
10,1,17,1017,AL,Chambers County,11273.17098,10774.817457,,11794.680978,0.252198,...,,,,,,,,,,
12,1,21,1021,AL,Chilton County,10831.917358,12000.440145,,11110.070619,0.214653,...,,,,,,,,,,


In [138]:
train_data.to_csv("Data/Train/national_training_data.csv")
test_data.to_csv("Data/Test/national_testing_data.csv")

In [164]:
labels = np.array(df['uninsured'])
df.shape

(3194, 156)

In [177]:
df.shape

(3194, 156)

In [205]:
features_df = df.drop(['uninsured','uninsured_children', 'uninsured_adults','state_fips_code', 'county_fips_code',
                   '5_digit_fips_code', 'state_abbreviation', 'name'], axis = 1)

for column in features_df.columns:
    na_sum= (features_df[column].isna().sum())
    column_size = (features_df[column].size)
    #print(na_sum/column_size)
    if (features_df[column].isna().sum()/features_df[column].size) > 0.25:
        features_df = features_df.drop(columns=column)
features_df.shape

(3194, 73)

In [207]:
state_groupby = df.groupby('state_abbreviation').count()
state_groupby

Unnamed: 0_level_0,state_fips_code,county_fips_code,5_digit_fips_code,name,premature_death,premature_death_black,premature_death_hispanic,premature_death_white,poor_or_fair_health,poor_physical_health_days,...,male_population_18_44,male_population_45_64,male_population_65,total_male_population,female_population_0_17,female_population_18_44,female_population_45_64,female_population_65,total_female_population,population_growth
state_abbreviation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AK,30,30,30,30,26,2,3,3,30,30,...,0,0,0,0,0,0,0,0,0,0
AL,68,68,68,68,68,62,13,63,68,68,...,0,0,0,0,0,0,0,0,0,0
AR,76,76,76,76,76,45,7,46,76,76,...,0,0,0,0,0,0,0,0,0,0
AZ,16,16,16,16,16,8,15,15,16,16,...,0,0,0,0,0,0,0,0,0,0
CA,59,59,59,59,59,37,50,50,59,59,...,0,0,0,0,0,0,0,0,0,0
CO,65,65,65,65,61,11,31,31,65,65,...,0,0,0,0,0,0,0,0,0,0
CT,9,9,9,9,9,8,8,8,9,9,...,0,0,0,0,0,0,0,0,0,0
DC,2,2,2,2,2,2,2,2,2,2,...,0,0,0,0,0,0,0,0,0,0
DE,4,4,4,4,4,3,3,3,4,4,...,0,0,0,0,0,0,0,0,0,0
FL,68,68,68,68,68,62,46,62,68,68,...,0,0,0,0,0,0,0,0,0,0


In [212]:
state_groupby = df.groupby('state_abbreviation').count()

for column in state_groupby.columns:
    state_groupby[column] = state_groupby[column].apply(lambda x: (x/state_groupby['state_fips_code'])<= 0.25)

state_groupby.to_dict()

{'state_fips_code': {'AK': False,
  'AL': False,
  'AR': False,
  'AZ': False,
  'CA': False,
  'CO': False,
  'CT': False,
  'DC': True,
  'DE': True,
  'FL': False,
  'GA': False,
  'HI': True,
  'IA': False,
  'ID': False,
  'IL': False,
  'IN': False,
  'KS': False,
  'KY': False,
  'LA': False,
  'MA': False,
  'MD': False,
  'ME': False,
  'MI': False,
  'MN': False,
  'MO': False,
  'MS': False,
  'MT': False,
  'NC': False,
  'ND': False,
  'NE': False,
  'NH': False,
  'NJ': False,
  'NM': False,
  'NV': False,
  'NY': False,
  'OH': False,
  'OK': False,
  'OR': False,
  'PA': False,
  'RI': True,
  'SC': False,
  'SD': False,
  'TN': False,
  'TX': False,
  'US': True,
  'UT': False,
  'VA': False,
  'VT': False,
  'WA': False,
  'WI': False,
  'WV': False,
  'WY': False},
 'county_fips_code': {'AK': False,
  'AL': False,
  'AR': False,
  'AZ': False,
  'CA': False,
  'CO': False,
  'CT': False,
  'DC': False,
  'DE': False,
  'FL': False,
  'GA': False,
  'HI': False,
  'IA

In [217]:
state_groupby = df.groupby('state_abbreviation').count()

for state in state_groupby.index:
    #print("entered")
    for column in state_groupby.columns:
        na_sum = state_groupby.loc[state][column]
        county_total = state_groupby.loc[state]['state_fips_code']
        print((na_sum/county_total <= 0.25))
        state_groupby = state_groupby.replace(state, (na_sum/county_total <= 0.25))

state_groupby


False
False
False
False
False
True
True
True
False
False
False
False
True
True
True
False
False
False
False
False
False
False
False
False
True
True
True
False
False
False
False
False
False
False
False
True
True
True
False
True
True
True
False
True
True
True
False
False
False
False
True
False
False
False
False
False
False
False
True
False
False
False
False
False
False
True
False
False
False
False
True
True
True
False
True
True
True
False
True
True
True
True
True
True
True
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
Fa

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
Fal

False
False
False
False
False
False
False
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
True
True
True
True
True
True
True
True
True
True
True
Tr

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
False
False
False
False
False
True
True
True
False
False
False
False
True
True
True
False
False
False
False
False
False
False
False
False
True
True
True
False
False
False
False
False
False
False
False
True
True
True
False
True
True
True
False
True
True
True
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
True
True
False
False
True
True
True
False
True
True
True
True
True
True
True
True
True
True
True
False
False
False
True
False
False
True
False
False
False
False
False
False
True
False
True
False
False
False
True
False
True
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
False
False
False
False
False
True
T

False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
True
True
True
False
True
True
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
Fal

False
False
True
True
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
False
False
False
False
False
True
True
True
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
True
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
True
False
False
False
True
True
True
False
True
True
True
False
True
True
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
Fals

Unnamed: 0_level_0,state_fips_code,county_fips_code,5_digit_fips_code,name,premature_death,premature_death_black,premature_death_hispanic,premature_death_white,poor_or_fair_health,poor_physical_health_days,...,male_population_18_44,male_population_45_64,male_population_65,total_male_population,female_population_0_17,female_population_18_44,female_population_45_64,female_population_65,total_female_population,population_growth
state_abbreviation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AK,30,30,30,30,26,2,3,3,30,30,...,0,0,0,0,0,0,0,0,0,0
AL,68,68,68,68,68,62,13,63,68,68,...,0,0,0,0,0,0,0,0,0,0
AR,76,76,76,76,76,45,7,46,76,76,...,0,0,0,0,0,0,0,0,0,0
AZ,16,16,16,16,16,8,15,15,16,16,...,0,0,0,0,0,0,0,0,0,0
CA,59,59,59,59,59,37,50,50,59,59,...,0,0,0,0,0,0,0,0,0,0
CO,65,65,65,65,61,11,31,31,65,65,...,0,0,0,0,0,0,0,0,0,0
CT,9,9,9,9,9,8,8,8,9,9,...,0,0,0,0,0,0,0,0,0,0
DC,2,2,2,2,2,2,2,2,2,2,...,0,0,0,0,0,0,0,0,0,0
DE,4,4,4,4,4,3,3,3,4,4,...,0,0,0,0,0,0,0,0,0,0
FL,68,68,68,68,68,62,46,62,68,68,...,0,0,0,0,0,0,0,0,0,0


In [159]:
labels = np.array(df['uninsured'])
df = df.drop(['uninsured','uninsured_children', 'uninsured_adults','state_fips_code', 'county_fips_code',
                   '5_digit_fips_code', 'state_abbreviation', 'name'], axis = 1)
df = df[['teen_births', '%_non_hispanic_white']]
features_list = df.columns.to_list()
features = np.array(df)

features

array([[24.71032641,  0.60728105],
       [33.11347994,  0.65579855],
       [26.52420801,  0.74473912],
       ...,
       [31.79867343,  0.87474994],
       [27.64976959,  0.82428075],
       [32.46239113,  0.90948463]])

In [160]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.20, random_state = 42)



In [161]:
# Random forest regressor
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
rf.fit(train_features, train_labels)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').