# Tree Models for Project 3

## Setup

### Import Libraries

In [1]:
import pandas as pd
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

### Set File Locations

In [2]:
# note that some of the raw data files are very large
# these very large files are located in a gitignored directory.

# cleaned, merged data
merged_data_csv = "../00_Data/cleaned_data/cleaned_merged_data.csv"

## Import Data

In [3]:
# Import census data
data_df = pd.read_csv(merged_data_csv)

data_df.info(verbose = True, null_counts = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220423 entries, 0 to 220422
Data columns (total 240 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   fips_block_group                  220423 non-null  int64  
 1   state                             220423 non-null  float64
 2   state_name                        220423 non-null  object 
 3   county                            220423 non-null  float64
 4   county_name                       220423 non-null  object 
 5   tract                             220423 non-null  float64
 6   block_group                       220423 non-null  float64
 7   flag                              220423 non-null  float64
 8   land_area                         220423 non-null  float64
 9   aian_land                         220423 non-null  float64
 10  urbanized_area_pop_cen_2010       220423 non-null  float64
 11  urban_cluster_pop_cen_2010        220423 non-null  

## Prep Data

In [4]:
# Prepare the target
target = data_df["has_superfund"]
target_names = ["negative", "positive"]

In [5]:
# Prepare the features
# Drop all the columns that came in from the site data. This prevents 'trailing indicators' from getting into the model.
# Also drop any column that shouldn't mathematically matter, such as FIPS, tract, etc.
drop_list = ['fips_block_group',
            'state',
            'state_name',
            'county',
            'county_name',
            'tract',
            'block_group',
            'has_superfund',
            'fips_full',
            'address',
            'city',
            'date_added',
            'federal_facility_ind',
            'federal_register_url',
            'geocode_source',
            'latitude',
            'longitude',
            'site_epa_id',
            'site_name',
            'site_narrative_url',
            'site_progress_url',
            'site_score',
            'site_text'
            ]

data_df.drop(data_df[drop_list],axis=1,inplace=True)
feature_names = data_df.columns

data_df.info(verbose = True, null_counts = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220423 entries, 0 to 220422
Data columns (total 217 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   flag                              220423 non-null  float64
 1   land_area                         220423 non-null  float64
 2   aian_land                         220423 non-null  float64
 3   urbanized_area_pop_cen_2010       220423 non-null  float64
 4   urban_cluster_pop_cen_2010        220423 non-null  float64
 5   rural_pop_cen_2010                220423 non-null  float64
 6   tot_population_cen_2010           220423 non-null  float64
 7   tot_population_acs_09_13          220423 non-null  float64
 8   males_cen_2010                    220423 non-null  float64
 9   males_acs_09_13                   220423 non-null  float64
 10  females_cen_2010                  220423 non-null  float64
 11  females_acs_09_13                 220423 non-null  

### train/test split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(data_df, target, random_state=42)

## Tree Classifier

In [7]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.9854462309004464

In [8]:
# use a confusion matrix to inspect the score
predictions = clf.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99     54790
         1.0       0.05      0.09      0.07       316

    accuracy                           0.99     55106
   macro avg       0.52      0.54      0.53     55106
weighted avg       0.99      0.99      0.99     55106



In [9]:
sorted(zip(clf.feature_importances_, feature_names), reverse=True)

[(0.01730011643910864, 'pct_pop_5_17_acs_09_13'),
 (0.014780387945569311, 'pct_diff_hu_1yr_ago_acs_09_13'),
 (0.01434745141293661, 'pct_no_ph_srvc_acs_09_13'),
 (0.013266084689594969, 'land_area'),
 (0.012566182408610932, 'pct_pop_65plus_cen_2010'),
 (0.012507821160895684, 'pct_female_no_hb_cen_2010'),
 (0.012348775875057992, 'med_hhd_inc_tr_acs_09_13'),
 (0.012261645661285294, 'pct_nh_asian_alone_cen_2010'),
 (0.010977832324519763, 'pct_one_health_ins_acs_09_13'),
 (0.0106386878078925, 'pct_pop_45_64_cen_2010'),
 (0.01059178710463402, 'pct_female_no_hb_acs_09_13'),
 (0.010292796529081159, 'pct_nh_white_alone_cen_2010'),
 (0.010116916838196266, 'pct_rplcmnt_frms_cen_2010'),
 (0.009767538047184804, 'low_response_score'),
 (0.00969907317518568, 'pct_nh_sor_alone_acs_09_13'),
 (0.009617460898732189, 'pct_pop_under_5_cen_2010'),
 (0.009544996829444312, 'mail_return_rate_cen_2010'),
 (0.009517654938467564, 'pct_not_hs_grad_acs_09_13'),
 (0.009329445428179494, 'pct_tot_occp_units_acs_09_13')

## Random Forest Classifier

In [10]:
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.9947192683192393

In [11]:
predictions = rf.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00     54790
         1.0       1.00      0.08      0.15       316

    accuracy                           0.99     55106
   macro avg       1.00      0.54      0.57     55106
weighted avg       0.99      0.99      0.99     55106



In [12]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.013201947488132556, 'land_area'),
 (0.008060675798802935, 'pct_males_acs_09_13'),
 (0.0076056341778673285, 'pct_females_acs_09_13'),
 (0.007442755330691164, 'pct_nh_white_alone_cen_2010'),
 (0.007425165621844929, 'pct_diff_hu_1yr_ago_acs_09_13'),
 (0.007370253399958897, 'med_hhd_inc_tr_acs_09_13'),
 (0.007206841289274866, 'pct_males_cen_2010'),
 (0.007204796114849682, 'pct_females_cen_2010'),
 (0.007100511645621101, 'med_house_value_tr_acs_09_13'),
 (0.0069998671771685185, 'pct_nh_white_alone_acs_09_13'),
 (0.006724539220191881, 'pct_pop_5_17_acs_09_13'),
 (0.006556229848420064, 'pct_one_health_ins_acs_09_13'),
 (0.006481781333288649, 'pct_pop_45_64_acs_09_13'),
 (0.006343361569556447, 'pct_hispanic_cen_2010'),
 (0.00624153637014863, 'pct_census_mail_returns_cen_2010'),
 (0.006213999731074946, 'pct_vacant_units_cen_2010'),
 (0.006205598818080157, 'pct_pop_25_44_acs_09_13'),
 (0.00608630591223454, 'pct_female_no_hb_cen_2010'),
 (0.006074883115822818, 'pct_nh_asian_alone_cen_2010'),


## Balanced Tree Classifier

In [13]:
bclf = tree.DecisionTreeClassifier(class_weight='balanced')
bclf = bclf.fit(X_train, y_train)
bclf.score(X_test, y_test)

0.9869524189743404

In [14]:
predictions = bclf.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99     54790
         1.0       0.06      0.09      0.07       316

    accuracy                           0.99     55106
   macro avg       0.53      0.54      0.53     55106
weighted avg       0.99      0.99      0.99     55106



In [15]:
sorted(zip(bclf.feature_importances_, feature_names), reverse=True)

[(0.11724573589318835, 'land_area'),
 (0.028544004661585427, 'low_response_score'),
 (0.019248994779615804, 'mlt_u2_9_strc_acs_09_13'),
 (0.01508960946629992, 'pct_female_no_hb_cen_2010'),
 (0.014918776566554109, 'pct_nh_white_alone_acs_09_13'),
 (0.014468146232456293, 'urbanized_area_pop_cen_2010'),
 (0.01303093904716628, 'pct_hhd_moved_in_acs_09_13'),
 (0.012612476662964568, 'pct_pop_5_17_cen_2010'),
 (0.012328101479705037, 'pct_pop_18_24_acs_09_13'),
 (0.01226997081443489, 'mrdcple_fmly_hhd_acs_09_13'),
 (0.012230638821655366, 'pct_hhd_ppl_und_18_acs_09_13'),
 (0.011891882997985529, 'pct_rel_under_6_acs_09_13'),
 (0.011774894846258823, 'pct_pop_45_64_acs_09_13'),
 (0.011377863096105027, 'pct_not_hs_grad_acs_09_13'),
 (0.011055803256880896, 'pop_under_5_acs_09_13'),
 (0.010900553844144059, 'pct_college_acs_09_13'),
 (0.010675149580599485, 'pct_sngl_prns_hhd_acs_09_13'),
 (0.010623464602472777, 'pct_rel_under_6_cen_2010'),
 (0.010608717553493918, 'occp_u_no_ph_srvc_acs_09_13'),
 (0.01

## Balanced Random Forest Classifier

In [16]:
brf = RandomForestClassifier(n_estimators=200, class_weight='balanced')
brf = brf.fit(X_train, y_train)
brf.score(X_test, y_test)

0.9947192683192393

In [17]:
predictions = brf.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00     54790
         1.0       1.00      0.08      0.15       316

    accuracy                           0.99     55106
   macro avg       1.00      0.54      0.57     55106
weighted avg       0.99      0.99      0.99     55106



In [18]:
sorted(zip(brf.feature_importances_, feature_names), reverse=True)

[(0.05293925315812843, 'land_area'),
 (0.00948217588156491, 'pct_females_cen_2010'),
 (0.00916400131578327, 'pct_males_cen_2010'),
 (0.008470465250585571, 'pct_females_acs_09_13'),
 (0.008349168244652908, 'pct_males_acs_09_13'),
 (0.007903415867742474, 'pct_single_unit_acs_09_13'),
 (0.007841550934583755, 'pct_college_acs_09_13'),
 (0.007749834147285684, 'pct_nh_white_alone_cen_2010'),
 (0.007596587438455515, 'med_house_value_tr_acs_09_13'),
 (0.007481995623847646, 'pct_mailback_count_cen_2010'),
 (0.007271604828897352, 'rural_pop_cen_2010'),
 (0.007235937799914012, 'pct_mlt_u2_9_strc_acs_09_13'),
 (0.007215063336576756, 'pct_one_health_ins_acs_09_13'),
 (0.007202190417041919, 'pct_hispanic_cen_2010'),
 (0.006975961766929038, 'mail_return_rate_cen_2010'),
 (0.006942695737377074, 'pct_nh_white_alone_acs_09_13'),
 (0.006810960862906967, 'pct_not_hs_grad_acs_09_13'),
 (0.006810637661126554, 'pct_pop_65plus_cen_2010'),
 (0.0067653817873968985, 'pct_pop_45_64_acs_09_13'),
 (0.00672805557099