In [56]:
import pandas as pd
import numpy as np 
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import make_scorer, r2_score, mean_squared_error, median_absolute_error
from sklearn.pipeline import Pipeline

In [2]:
# Read the dataset and show its column index so the available fields are visible.
csv_path = "data/abbrev.csv"
abbrev = pd.read_csv(csv_path)
abbrev.columns

Index(['Unnamed: 0', 'household_size', 'empl_agriculture', 'empl_professional',
       'empl_social', 'empl_services', 'empl_manufacturing', 'empl_retail',
       'prc_fam_poverty', 'avg_income', 'prc_public_transp', 'population',
       'pop_65_plus', 'health_ins', 'county', 'state', 'area', 'prc_obese',
       'domestic_passengers', 'intl_passengers', 'deaths', 'cases_march1',
       'cases_march15', 'cases_april1', 'ten_plus', 'order', 'density',
       'death_prc'],
      dtype='object')

In [3]:
# 'Unnamed: 0' is not one of the modelling fields; remove it from the frame.
abbrev = abbrev.drop(columns=['Unnamed: 0'])

In [4]:
# The two early case-count columns are kept apart so they can be excluded below.
case_columns = ['cases_march1', 'cases_march15']

# Full predictor list: every non-case field first, then the case counts.
# cases_april1 is left out on purpose: the original note says it is pretty much
# perfectly correlated (presumably with the target -- TODO confirm).
predictors = [
    'household_size',
    'empl_agriculture',
    'empl_professional',
    'empl_social',
    'empl_services',
    'empl_manufacturing',
    'empl_retail',
    'prc_fam_poverty',
    'avg_income',
    'prc_public_transp',
    'population',
    'pop_65_plus',
    'health_ins',
    'area',
    'domestic_passengers',
    'intl_passengers',
    'prc_obese',
    'ten_plus',
    'order',
    'density',
] + case_columns

# Same list, in the same order, without the case-count columns.
predictors_noCaseData = [col for col in predictors if col not in case_columns]
predictors_noCaseData

['household_size',
 'empl_agriculture',
 'empl_professional',
 'empl_social',
 'empl_services',
 'empl_manufacturing',
 'empl_retail',
 'prc_fam_poverty',
 'avg_income',
 'prc_public_transp',
 'population',
 'pop_65_plus',
 'health_ins',
 'area',
 'domestic_passengers',
 'intl_passengers',
 'prc_obese',
 'ten_plus',
 'order',
 'density']

In [5]:
# Add a "log_<name>" column holding log(1 + x) for every column listed in `transform`.
#
# np.log1p is defined at x == 0 (it returns 0), so zeros do not need to be replaced
# with a small epsilon first. The raw columns are therefore left exactly as loaded,
# which matters because they are also used untransformed through `predictors`.
transform = ["population", "density", "intl_passengers", "domestic_passengers", "area",
             "cases_march1", "cases_march15"]
for field in transform:
    abbrev["log_" + field] = np.log1p(abbrev[field])

In [6]:
# Build the log-scale predictor list: columns that are not in `transform` keep their
# original order, followed by the "log_" version of each transformed column.
untouched = [name for name in predictors if name not in transform]
logged = ["log_" + name for name in transform]
log_predictors = untouched + logged
print(log_predictors)

['household_size', 'empl_agriculture', 'empl_professional', 'empl_social', 'empl_services', 'empl_manufacturing', 'empl_retail', 'prc_fam_poverty', 'avg_income', 'prc_public_transp', 'pop_65_plus', 'health_ins', 'prc_obese', 'ten_plus', 'order', 'log_population', 'log_density', 'log_intl_passengers', 'log_domestic_passengers', 'log_area', 'log_cases_march1', 'log_cases_march15']


In [7]:
# Log-scale predictor list with the two logged case-count columns filtered out.
case_logs = ('log_cases_march1', 'log_cases_march15')
log_predictors_noCaseData = [name for name in log_predictors if name not in case_logs]
print(log_predictors_noCaseData)

['household_size', 'empl_agriculture', 'empl_professional', 'empl_social', 'empl_services', 'empl_manufacturing', 'empl_retail', 'prc_fam_poverty', 'avg_income', 'prc_public_transp', 'pop_65_plus', 'health_ins', 'prc_obese', 'ten_plus', 'order', 'log_population', 'log_density', 'log_intl_passengers', 'log_domestic_passengers', 'log_area']


In [8]:
import re
def get_col_name(feature_set, name):
    """Translate one generated term such as ``"x3"`` into a real column name.

    Parameters
    ----------
    feature_set : sequence of str
        Column names, indexed by the number that follows the ``x``.
    name : str
        A single term name.

    Returns
    -------
    str
        ``feature_set[n]`` when ``name`` starts with ``x<n>``; the placeholder
        ``"X"`` for anything else (for example the term ``"1"``). Only the
        leading ``x<n>`` is inspected; any text after the digits is ignored.

    Raises
    ------
    IndexError
        If ``n`` is not a valid index into ``feature_set``.
    """
    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
    matched = re.match(r"x(\d+)", name)
    if matched is None:
        return "X"
    return feature_set[int(matched.group(1))]


def transform_name(feature_set, name):
    """Translate a generated feature name into a readable column label.

    ``name`` is split on single spaces and each piece is resolved with
    :func:`get_col_name`; the pieces are joined with ``":"``. A name without
    spaces is therefore returned as one column name, ``"x0 x1"`` becomes
    ``"<col0>:<col1>"``, and names with more than two terms keep every term
    instead of only the first two.

    Parameters
    ----------
    feature_set : sequence of str
        Column names, indexed by the number that follows the ``x``.
    name : str
        Generated feature name, with terms separated by single spaces.

    Returns
    -------
    str
        The ``":"``-joined column names.
    """
    # split(" ") rather than split(): an empty name still yields one (empty) term,
    # which resolves to the "X" placeholder.
    terms = name.split(" ")
    return ":".join(get_col_name(feature_set, term) for term in terms)

In [9]:
poly = PolynomialFeatures(interaction_only=True)
scaler = StandardScaler()
# https://stats.stackexchange.com/questions/29781/when-conducting-multiple-regression-when-should-you-center-your-predictor-varia
# https://stats.stackexchange.com/questions/25690/multiple-linear-regression-for-hypothesis-testing#25707


def _scaled_interactions(frame, feature_set):
    """Standardize ``frame[feature_set]`` and expand it with ``poly``'s interaction terms.

    The shared ``scaler`` and ``poly`` objects are refit on every call, so after
    the last call they hold the fit for the last feature set.

    Parameters
    ----------
    frame : pandas.DataFrame
        Source data containing every column in ``feature_set``.
    feature_set : list of str
        Columns to scale and expand; also used to turn the generated term
        names back into readable labels via ``transform_name``.

    Returns
    -------
    pandas.DataFrame
        One column per generated term, labelled by ``transform_name``.
    """
    scaled = scaler.fit_transform(frame[feature_set])
    expanded = poly.fit_transform(scaled)
    # Newer scikit-learn releases expose get_feature_names_out() and no longer have
    # get_feature_names(); fall back to the old name only when the new one is absent.
    list_names = getattr(poly, "get_feature_names_out", None) or poly.get_feature_names
    return pd.DataFrame(expanded,
                        columns=[transform_name(feature_set, x) for x in list_names()])


inter = _scaled_interactions(abbrev, predictors)
inter_noCases = _scaled_interactions(abbrev, predictors_noCaseData)
inter_log = _scaled_interactions(abbrev, log_predictors)
inter_log_noCases = _scaled_interactions(abbrev, log_predictors_noCaseData)

In [10]:
# Remove the column labelled "X" from each interaction frame. "X" is the label
# used for generated terms that are not an x<n> placeholder (presumably the
# constant bias term -- TODO confirm).
inter, inter_noCases, inter_log, inter_log_noCases = (
    frame.drop(columns="X")
    for frame in (inter, inter_noCases, inter_log, inter_log_noCases)
)

In [11]:
# Each candidate design matrix paired with the label used when reporting results.
# `x_data` and `labels` are parallel lists derived from these pairs.
feature_sets = [
    ("no case data", abbrev[predictors_noCaseData]),
    ("with case data", abbrev[predictors]),
    ("log predictors, no case data", abbrev[log_predictors_noCaseData]),
    ("log predictors, with case data", abbrev[log_predictors]),
    ("no case data, with interactions", inter_noCases),
    ("with case data, with interactions", inter),
    ("log predictors, no case data, with interactions", inter_log_noCases),
    ("log predictors, with case data, with interactions", inter_log),
]
x_data = [frame for label, frame in feature_sets]
labels = [label for label, frame in feature_sets]

## Decision Trees

In [36]:
# Grid-search a decision tree against the raw "deaths" column, once per feature
# set in x_data, recording the best parameters and best mean CV R^2 of each search.
dtr = DecisionTreeRegressor(random_state=1001)
params = {
    "max_depth": [None, 5, 10, 20, 50, 75, 100, 150],
    "min_samples_split": [2, 5, 10, 20, 50, 75],
}
r2_scorer = make_scorer(r2_score)
search = GridSearchCV(dtr, params, scoring=r2_scorer, cv=10)

best_params, best_score = [], []
target = abbrev["deaths"]
for features in x_data:
    search.fit(features, target)
    best_params.append(search.best_params_)
    best_score.append(search.best_score_)



In [37]:
# One entry per feature set: padded label with its score, the chosen
# parameters, then a blank separator line.
for label, score, chosen in zip(labels, best_score, best_params):
    summary = "{:50} : {:>5.3}".format(label, score)
    print(summary, chosen, "", sep="\n")

no case data                                       : 0.209
{'max_depth': 5, 'min_samples_split': 75}

with case data                                     : 0.177
{'max_depth': 10, 'min_samples_split': 75}

log predictors, no case data                       : -17.3
{'max_depth': 10, 'min_samples_split': 2}

log predictors, with case data                     : 0.177
{'max_depth': 10, 'min_samples_split': 75}

no case data, with interactions                    : -53.5
{'max_depth': 5, 'min_samples_split': 2}

with case data, with interactions                  : 0.0834
{'max_depth': 20, 'min_samples_split': 20}

log predictors, no case data, with interactions    : -17.5
{'max_depth': 10, 'min_samples_split': 50}

log predictors, with case data, with interactions  : 0.126
{'max_depth': None, 'min_samples_split': 5}



### Try with a log-transformed target

I couldn't get a pipeline to work with `TransformedTargetRegressor` + `DecisionTreeRegressor()`, so I created a new column holding the manually transformed dependent variable.

In [45]:
# Store log(1 + deaths) as its own column so it can be used as the regression target.
death_counts = abbrev["deaths"]
abbrev["log_deaths"] = np.log1p(death_counts)

In [49]:
# Same decision-tree grid search as for raw deaths, but fitted against the
# "log_deaths" column and with a wider min_samples_split grid.
dtr = DecisionTreeRegressor(random_state=1001)
params = {
    "max_depth": [None, 5, 10, 20, 50, 75, 100, 150],
    "min_samples_split": [2, 5, 10, 20, 50, 75, 100, 150],
}
r2_scorer = make_scorer(r2_score)
search = GridSearchCV(dtr, params, scoring=r2_scorer, cv=10)

best_params, best_score = [], []
target = abbrev["log_deaths"]
for features in x_data:
    search.fit(features, target)
    best_params.append(search.best_params_)
    best_score.append(search.best_score_)



In [50]:
# One entry per feature set: padded label with its score, the chosen
# parameters, then a blank separator line.
for label, score, chosen in zip(labels, best_score, best_params):
    summary = "{:50} : {:>5.3}".format(label, score)
    print(summary, chosen, "", sep="\n")

no case data                                       : 0.581
{'max_depth': 5, 'min_samples_split': 20}

with case data                                     : 0.578
{'max_depth': 5, 'min_samples_split': 20}

log predictors, no case data                       : 0.581
{'max_depth': 5, 'min_samples_split': 20}

log predictors, with case data                     : 0.578
{'max_depth': 5, 'min_samples_split': 20}

no case data, with interactions                    : 0.548
{'max_depth': 5, 'min_samples_split': 150}

with case data, with interactions                  : 0.541
{'max_depth': 5, 'min_samples_split': 150}

log predictors, no case data, with interactions    : 0.565
{'max_depth': None, 'min_samples_split': 100}

log predictors, with case data, with interactions  : 0.565
{'max_depth': 5, 'min_samples_split': 100}



## Random Forest

In [None]:
# Grid-search a random forest against the raw "deaths" column, once per feature
# set in x_data, then print the best score and parameters for each.
best_params = list()
best_score = list()

rfr = RandomForestRegressor(random_state=1001)
params = {'n_estimators':[50,100,150,250,500], 
          'max_depth':[None,5,10,15,25,50,100]}
r2_scorer = make_scorer(r2_score)
# 35 parameter combinations x 10 folds are fitted for every feature set, so the
# fits are spread over all available cores (n_jobs=-1). The estimator's
# random_state is fixed, so running in parallel does not change the scores.
search = GridSearchCV(rfr, params, scoring=r2_scorer, cv=10, n_jobs=-1)

for dat in x_data:
    search.fit(dat, abbrev["deaths"])
    best_params.append(search.best_params_)
    best_score.append(search.best_score_)
    
for a, b, c in zip(labels, best_score, best_params):
    print("{:50} : {:>5.3}".format(a, b))
    print(c)
    print()



In [54]:
# using transformed target/dependent variable
#
# Grid-search a random forest against "log_deaths", once per feature set in
# x_data, then print the best score and parameters for each.

best_params = list()
best_score = list()

rfr = RandomForestRegressor(random_state=1001)
params = {'n_estimators':[50,100,150,250,500], 
          'max_depth':[None,5,10,15,25,50,100], 
          'min_samples_split':[2,5,10,20,50,100]}
r2_scorer = make_scorer(r2_score)
# 210 parameter combinations x 10 folds are fitted for every feature set; run
# serially this did not finish (the cell was interrupted). n_jobs=-1 spreads the
# fits over all available cores. The estimator's random_state is fixed, so
# running in parallel does not change the scores.
search = GridSearchCV(rfr, params, scoring=r2_scorer, cv=10, n_jobs=-1)

for dat in x_data:
    search.fit(dat, abbrev["log_deaths"])
    best_params.append(search.best_params_)
    best_score.append(search.best_score_)
    
for a, b, c in zip(labels, best_score, best_params):
    print("{:50} : {:>5.3}".format(a, b))
    print(c)
    print()

KeyboardInterrupt: 