Source: https://towardsdatascience.com/random-forest-in-python-24d0893d51c0

In [199]:
import pandas as pd
import random
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from collections import defaultdict
import missingno as msno

In [191]:
# Load the cleaned county table, discard the "Unnamed: 0" column and keep only
# rows whose state_abbreviation is not 'US'.
# NOTE(review): "Unnamed: 0" is presumably a saved index and the 'US' rows a
# nationwide aggregate -- confirm against the CSV.
df = (
    pd.read_csv("Data/county_data_clean.csv")
    .drop(columns="Unnamed: 0")
    .loc[lambda frame: frame["state_abbreviation"] != 'US']
)

In [192]:
# Display the (rows, columns) dimensions of the loaded table.
df.shape

(3193, 156)

In [193]:
# Build the modelling table: remove the two age-specific uninsured columns and
# the county identifier columns.
features_df = df.drop(columns=['uninsured_adults', 'uninsured_children', 'county_fips_code',
                               '5_digit_fips_code', 'name'])
# Replacing missing value of uninsured with mean of uninsured
features_df['uninsured'] = features_df['uninsured'].fillna(features_df['uninsured'].mean())

# Drop every column in which more than 25% of the rows are missing.
# The fraction is computed once for all columns instead of rebuilding the
# frame inside a per-column loop; `~(... > 0.25)` keeps exactly the columns
# the loop would have kept.
na_fraction = features_df.isna().mean()
features_df = features_df.loc[:, ~(na_fraction > 0.25)]
features_df.shape

(3193, 76)

In [194]:
# Per-state count of non-null values for every column of features_df
# (DataFrameGroupBy.count ignores missing entries).
state_groupby = features_df.groupby('state_abbreviation').count()

In [195]:
# For every feature, count the states in which it is (almost) entirely missing,
# then collect the features that are sparse in too many states.

# A feature is "not useful" for a state when at most this share of the state's
# rows has a value for it.
MAX_STATE_COVERAGE = 0.25
# A feature is excluded once it is "not useful" in more than this many states.
MAX_SPARSE_STATES = 13

# Non-null count of every column, per state.
state_groupby = features_df.groupby('state_abbreviation').count()
# Rows per state. Using the real group size (rather than the non-null count of
# one particular column) keeps the ratio correct even if that column has gaps.
state_sizes = features_df.groupby('state_abbreviation').size()
# Share of rows with a value, per state (rows) and feature (columns).
coverage = state_groupby.div(state_sizes, axis=0)

null_count_dict = defaultdict(int)
for column in coverage.columns:
    # Register every feature explicitly so that it shows up with a 0 count.
    null_count_dict[column] = 0
    for state, share in coverage[column].items():
        if share <= MAX_STATE_COVERAGE:
            print("{0} is not useful for {1}: {2}".format(column, state, str(share)))
            null_count_dict[column] += 1

exclude_features = [
    feature
    for feature, null_count in null_count_dict.items()
    if null_count > MAX_SPARSE_STATES
]

null_count_dict

air_pollution__particulate_matter is not useful for AK: 0.0
air_pollution__particulate_matter is not useful for HI: 0.0
drinking_water_violations is not useful for HI: 0.0
hiv_prevalence is not useful for ND: 0.24074074074074073
hiv_prevalence is not useful for NE: 0.13829787234042554
hiv_prevalence is not useful for SD: 0.014925373134328358
children_eligible_for_free_or_reduced_price_lunch is not useful for DC: 0.0
children_eligible_for_free_or_reduced_price_lunch is not useful for DE: 0.0
children_eligible_for_free_or_reduced_price_lunch is not useful for MA: 0.0
children_eligible_for_free_or_reduced_price_lunch is not useful for TN: 0.0


defaultdict(int,
            {'state_fips_code': 0,
             'premature_death': 0,
             'poor_or_fair_health': 0,
             'poor_physical_health_days': 0,
             'poor_mental_health_days': 0,
             'low_birthweight': 0,
             'adult_smoking': 0,
             'adult_obesity': 0,
             'food_environment_index': 0,
             'physical_inactivity': 0,
             'access_to_exercise_opportunities': 0,
             'excessive_drinking': 0,
             'alcohol_impaired_driving_deaths': 0,
             'sexually_transmitted_infections': 0,
             'teen_births': 0,
             'uninsured': 0,
             'primary_care_physicians': 0,
             'ratio_of_population_to_primary_care_physicians': 0,
             'dentists': 0,
             'ratio_of_population_to_dentists': 0,
             'mental_health_providers': 0,
             'ratio_of_population_to_mental_health_providers': 0,
             'preventable_hospital_stays': 0,
         

In [196]:
# Remove the features flagged in exclude_features. errors="ignore" makes the
# cell safe to re-run: columns already dropped by a previous execution are
# skipped instead of raising KeyError.
features_df = features_df.drop(columns=exclude_features, errors="ignore")

In [197]:
# Display the column labels currently held by features_df.
features_df.columns

Index(['state_fips_code', 'state_abbreviation', 'premature_death',
       'poor_or_fair_health', 'poor_physical_health_days',
       'poor_mental_health_days', 'low_birthweight', 'adult_smoking',
       'adult_obesity', 'food_environment_index', 'physical_inactivity',
       'access_to_exercise_opportunities', 'excessive_drinking',
       'alcohol_impaired_driving_deaths', 'sexually_transmitted_infections',
       'teen_births', 'uninsured', 'primary_care_physicians',
       'ratio_of_population_to_primary_care_physicians', 'dentists',
       'ratio_of_population_to_dentists', 'mental_health_providers',
       'ratio_of_population_to_mental_health_providers',
       'preventable_hospital_stays', 'mammography_screening',
       'flu_vaccinations', 'high_school_graduation', 'some_college',
       'unemployment', 'children_in_poverty', 'children_in_poverty_hispanic',
       'children_in_poverty_white', 'income_inequality',
       'children_in_single_parent_households', 'social_association

In [206]:
# Predictor matrix: everything except the target ('uninsured') and the two
# state identifier columns.
features = features_df.drop(columns=['uninsured', 'state_fips_code', 'state_abbreviation'])
# Remember the column names, because the imputer below returns a bare array.
features_list = list(features.columns)

# Fill the remaining gaps with SimpleImputer's default settings.
imp = SimpleImputer()
features = imp.fit_transform(features)

# Target vector.
labels = np.array(features_df['uninsured'])


In [None]:
# Computing Variance Inflation Factors
#
# Iteratively prune predictors: on each pass the VIF of every remaining column
# is computed, and one offending column is dropped -- the first column with an
# infinite VIF if there is one, otherwise the first column with VIF >= 5.
# The loop stops when no column is at or above the threshold (or when no
# column is left). `sub_df` ends up holding the surviving predictors.
#
# NOTE(review): variance_inflation_factor is applied to the raw predictor
# matrix without an added constant column -- confirm that this is intended.
sub_df = pd.DataFrame(data=features, columns=features_list)
vif_data_copy = sub_df.copy()


while not vif_data_copy.empty:
    print("entered")
    interesting_vars = sub_df.columns.to_list()
    X = sub_df[interesting_vars]

    # VIF dataframe
    print("entered again")
    vif_data = pd.DataFrame()
    vif_data["feature"] = X.columns

    # calculating VIF for each feature
    # (the design matrix is extracted once per pass, not once per column)
    design_matrix = X.values
    vif_data["VIF"] = [variance_inflation_factor(design_matrix, i)
                              for i in range(design_matrix.shape[1])]

    print(vif_data)

    is_infinite = vif_data["VIF"] == float("inf")
    is_too_high = vif_data["VIF"] >= 5

    # The original `elif ~ <bool>` was always truthy (~True == -2, ~False == -1),
    # so the stop branch was unreachable and the loop ended with an IndexError.
    # Explicit `.any()` checks restore the intended three-way decision.
    if is_infinite.any():
        feature_list = vif_data.loc[is_infinite, "feature"].to_list()
    elif is_too_high.any():
        feature_list = vif_data.loc[is_too_high, "feature"].to_list()
    else:
        feature_list = []

    if feature_list:
        sub_df = sub_df.drop(columns=feature_list[0])
        vif_data_copy = vif_data[vif_data.feature != feature_list[0]]
    else:
        # Nothing left to prune: an empty frame terminates the while loop.
        vif_data_copy = pd.DataFrame()

sub_df

entered
entered again


In [186]:
# Hold out 20% of the rows for testing; the seed fixes the shuffle.
# NOTE(review): this splits `features` (every imputed predictor), not the
# VIF-pruned `sub_df` -- confirm which one the model is meant to use.
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size=0.20,
    random_state=42,
)



In [187]:
# Fit a 100-tree random forest regressor on the training split.
# random_state is fixed so repeated runs build the same forest.
rf = RandomForestRegressor(random_state=42, n_estimators=100)
rf.fit(train_features, train_labels)

RandomForestRegressor(random_state=42)

In [188]:
# Predict the held-out rows with the fitted forest.
predictions = rf.predict(test_features)
# Element-wise absolute deviation between prediction and true label.
errors = np.abs(predictions - test_labels)
# Report the mean absolute error (MAE), rounded to two decimals.
print('Mean Absolute Error:', round(errors.mean(), 2), 'percent.')

Mean Absolute Error: 0.02 percent.


In [189]:
# Per-row absolute percentage error; its mean is the MAPE.
# NOTE: a label equal to 0 makes the ratio inf/nan and propagates into `accuracy`.
mape = (errors / test_labels) * 100
# "Accuracy" is reported as 100 minus the MAPE.
accuracy = 100 - mape.mean()
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 80.95 %.
