In [41]:
import pandas as pd
import numpy as np 
import re
from sklearn.preprocessing import StandardScaler#, PolynomialFeatures
import statsmodels.formula.api as sm
#import statsmodels.api as sm
#from sklearn.metrics import make_scorer, r2_score, mean_squared_error, median_absolute_error

In [2]:
abbrev = pd.read_csv("data/abbrev.csv")
abbrev = abbrev.drop('Unnamed: 0', axis=1)
predictors = ['household_size', 'empl_agriculture', 'empl_professional','empl_social', 'empl_services', 'empl_manufacturing', 'empl_retail',
              'prc_fam_poverty', 'avg_income', 'prc_public_transp', 'population', 'pop_65_plus', 'health_ins', 'area', 
              'domestic_passengers', 'intl_passengers', 'prc_obese', 'ten_plus', 'order', 'density', 'cases_march1',
             'cases_march15']
# omitting april1 cases data, since it's pretty much perfectly correlated...

predictors_noCaseData = predictors.copy()
predictors_noCaseData.remove('cases_march1')
predictors_noCaseData.remove('cases_march15')

abbrev.loc[abbrev["intl_passengers"] == 0, "intl_passengers"] = 0.0000001
abbrev.loc[abbrev["domestic_passengers"] == 0, "domestic_passengers"] = 0.0000001
abbrev.loc[abbrev["cases_march1"] == 0, "cases_march1"] = 0.0000001
abbrev.loc[abbrev["cases_march15"] == 0, "cases_march15"] = 0.0000001

transform = ["population", "density", "intl_passengers", "domestic_passengers", "area", 
            "cases_march1", "cases_march15"]
for field in transform:
    abbrev["log_"+field] = np.log1p(abbrev[field])
    
log_predictors = predictors.copy()
for field in transform:
    log_predictors.remove(field)
    log_predictors.append("log_"+field)
    
log_predictors_noCaseData = log_predictors.copy()
log_predictors_noCaseData.remove('log_cases_march1')
log_predictors_noCaseData.remove('log_cases_march15')

In [3]:
indp_vars = [predictors_noCaseData, predictors, 
             log_predictors_noCaseData, log_predictors]
labels = ["no case data", "with case data", 
          "log predictors, no case data", 
          "log predictors, with case data"]

In [5]:
abbrev["log_deaths"] = np.log1p(abbrev["deaths"])

In [47]:
original = abbrev.copy()
all_predictors = set()
for x in indp_vars:
    all_predictors = all_predictors.union(x)

In [48]:
all_predictors = list(all_predictors)
scaler = StandardScaler()
scaler.fit(abbrev[all_predictors])
abbrev[all_predictors] = scaler.transform(abbrev[all_predictors])

In [49]:
abbrev.describe()

Unnamed: 0,household_size,empl_agriculture,empl_professional,empl_social,empl_services,empl_manufacturing,empl_retail,prc_fam_poverty,avg_income,prc_public_transp,...,density,death_prc,log_population,log_density,log_intl_passengers,log_domestic_passengers,log_area,log_cases_march1,log_cases_march15,log_deaths
count,823.0,823.0,823.0,823.0,823.0,823.0,823.0,823.0,823.0,823.0,...,823.0,823.0,823.0,823.0,823.0,823.0,823.0,823.0,823.0,823.0
mean,-1.5631480000000002e-17,9.618336e-17,-2.695292e-16,9.204194e-16,-1.220166e-16,1.1601360000000002e-17,2.441681e-16,-9.712765e-17,-3.283454e-16,-1.510875e-17,...,-4.3167850000000004e-18,5.1e-05,-1.392163e-16,-2.69799e-16,1.699734e-16,9.065248000000001e-17,-1.476205e-15,-3.29863e-16,2.158392e-18,1.713659
std,1.000608,1.000608,1.000608,1.000608,1.000608,1.000608,1.000608,1.000608,1.000608,1.000608,...,1.000608,0.000112,1.000608,1.000608,1.000608,1.000608,1.000608,1.000608,1.000608,1.505851
min,-2.763242,-0.713819,-2.817129,-5.030197,-3.468572,-1.894605,-5.734296,-1.826346,-2.246612,-0.4516647,...,-0.3973592,0.0,-1.266775,-3.346377,-0.504286,-1.136562,-4.334651,-0.1155313,-0.611927,0.0
25%,-0.7217483,-0.5129159,-0.6764601,-0.631902,-0.6104182,-0.7365944,-0.621426,-0.7363487,-0.6654138,-0.3811994,...,-0.3175188,5e-06,-0.7926724,-0.611587,-0.504286,-1.136562,-0.4934726,-0.1155313,-0.611927,0.693147
50%,-0.1613383,-0.3283267,-0.1428487,-0.1449382,-0.1322056,-0.1707755,-0.04889213,-0.1439586,-0.1787571,-0.2872456,...,-0.2403517,1.7e-05,-0.2285975,-0.06160307,-0.504286,0.5890021,-0.07283898,-0.1155313,-0.611927,1.386294
75%,0.4791302,0.1355948,0.5145289,0.5198946,0.4287507,0.5613396,0.5505121,0.5550616,0.4794669,-0.02887279,...,-0.07206297,4.7e-05,0.582663,0.5392498,-0.504286,0.8614245,0.3525009,-0.1155313,0.5693262,2.639057
max,6.08323,8.75908,5.592866,4.703169,8.005615,5.621149,4.852515,4.832118,5.12025,12.67038,...,15.91125,0.001367,4.383411,3.806463,2.496994,1.278003,4.172625,16.44155,5.797484,9.348187


## Linear Model

In [280]:
def get_predictor_combo(pred_set):
    pred_formula = ""
    ind_vars = list()
    for col in pred_set:
        if col != "deaths" and col != "log_deaths":
            ind_vars.append(col)
    return " + ".join(ind_vars)

def get_linear_formula(pred_set, dep_var_name):
    pred_formula = get_predictor_combo(pred_set)
    form = dep_var_name + " ~ " + pred_formula
    return re.sub(" +", " ", form)

def get_form_w_2ndO_interactions(pred_set, dep_var_name):
    main_effects = get_predictor_combo(pred_set)
    interactions = set()
    for x in range(len(pred_set)):
        for y in range(x + 1, len(pred_set)):
            a = pred_set[x] + ":" + pred_set[y]
            b = pred_set[y] + ":" + pred_set[x]
            if a not in interactions and b not in interactions:
                interactions.add(a)
    pred_formula = main_effects if len(interactions) == 0 else main_effects + " + " + " + ".join(list(interactions))
    form = dep_var_name + " ~ " + pred_formula
    return re.sub(" +", " ", form)

def get_var_names_from_formula(formula):
    indp = re.sub("^.+ ~ ", "", formula)
    return indp.split(" + ")

In [117]:
formulas = list()
for var_set in indp_vars:
    formulas.append(get_linear_formula(var_set, "deaths"))

lrms = list()
for formula in formulas:
    lrm = sm.ols(formula = formula, data=abbrev).fit()
    lrms.append(lrm)

In [118]:
print(lrms[0].summary())

                            OLS Regression Results                            
Dep. Variable:                 deaths   R-squared:                       0.541
Model:                            OLS   Adj. R-squared:                  0.530
Method:                 Least Squares   F-statistic:                     47.26
Date:                Thu, 23 Apr 2020   Prob (F-statistic):          8.42e-121
Time:                        11:22:21   Log-Likelihood:                -5794.9
No. Observations:                 823   AIC:                         1.163e+04
Df Residuals:                     802   BIC:                         1.173e+04
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept              38.0717    

In [55]:
print(lrms[1].summary())

                            OLS Regression Results                            
Dep. Variable:                 deaths   R-squared:                       0.729
Model:                            OLS   Adj. R-squared:                  0.722
Method:                 Least Squares   F-statistic:                     97.92
Date:                Thu, 23 Apr 2020   Prob (F-statistic):          1.60e-209
Time:                        10:36:21   Log-Likelihood:                -5577.7
No. Observations:                 823   AIC:                         1.120e+04
Df Residuals:                     800   BIC:                         1.131e+04
Df Model:                          22                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept              38.0717    

In [56]:
print(lrms[2].summary())

                            OLS Regression Results                            
Dep. Variable:                 deaths   R-squared:                       0.347
Model:                            OLS   Adj. R-squared:                  0.331
Method:                 Least Squares   F-statistic:                     21.31
Date:                Thu, 23 Apr 2020   Prob (F-statistic):           3.74e-61
Time:                        10:37:40   Log-Likelihood:                -5939.9
No. Observations:                 823   AIC:                         1.192e+04
Df Residuals:                     802   BIC:                         1.202e+04
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Intercept                 

In [57]:
print(lrms[3].summary())

                            OLS Regression Results                            
Dep. Variable:                 deaths   R-squared:                       0.361
Model:                            OLS   Adj. R-squared:                  0.344
Method:                 Least Squares   F-statistic:                     20.56
Date:                Thu, 23 Apr 2020   Prob (F-statistic):           1.89e-63
Time:                        10:38:28   Log-Likelihood:                -5930.9
No. Observations:                 823   AIC:                         1.191e+04
Df Residuals:                     800   BIC:                         1.202e+04
Df Model:                          22                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Intercept                 

### Linear Model, Transformed Target

In [119]:
formulas = list()
for var_set in indp_vars:
    formulas.append(get_linear_formula(var_set, "log_deaths"))

lrms = list()
for formula in formulas:
    lrm = sm.ols(formula = formula, data=abbrev).fit()
    lrms.append(lrm)

In [120]:
print(lrms[0].summary())

                            OLS Regression Results                            
Dep. Variable:             log_deaths   R-squared:                       0.627
Model:                            OLS   Adj. R-squared:                  0.618
Method:                 Least Squares   F-statistic:                     67.53
Date:                Thu, 23 Apr 2020   Prob (F-statistic):          1.43e-156
Time:                        11:22:54   Log-Likelihood:                -1097.9
No. Observations:                 823   AIC:                             2238.
Df Residuals:                     802   BIC:                             2337.
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept               1.7137    

In [75]:
print(lrms[1].summary())

                            OLS Regression Results                            
Dep. Variable:             log_deaths   R-squared:                       0.630
Model:                            OLS   Adj. R-squared:                  0.619
Method:                 Least Squares   F-statistic:                     61.82
Date:                Thu, 23 Apr 2020   Prob (F-statistic):          9.21e-156
Time:                        10:51:03   Log-Likelihood:                -1095.4
No. Observations:                 823   AIC:                             2237.
Df Residuals:                     800   BIC:                             2345.
Df Model:                          22                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept               1.7137    

In [76]:
print(lrms[2].summary())

                            OLS Regression Results                            
Dep. Variable:             log_deaths   R-squared:                       0.662
Model:                            OLS   Adj. R-squared:                  0.654
Method:                 Least Squares   F-statistic:                     78.67
Date:                Thu, 23 Apr 2020   Prob (F-statistic):          1.62e-173
Time:                        10:51:59   Log-Likelihood:                -1057.4
No. Observations:                 823   AIC:                             2157.
Df Residuals:                     802   BIC:                             2256.
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Intercept                 

In [77]:
print(lrms[3].summary())

                            OLS Regression Results                            
Dep. Variable:             log_deaths   R-squared:                       0.667
Model:                            OLS   Adj. R-squared:                  0.658
Method:                 Least Squares   F-statistic:                     72.77
Date:                Thu, 23 Apr 2020   Prob (F-statistic):          6.96e-174
Time:                        10:53:08   Log-Likelihood:                -1051.9
No. Observations:                 823   AIC:                             2150.
Df Residuals:                     800   BIC:                             2258.
Df Model:                          22                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Intercept                 

### Linear Model, Main Effects + 2nd Order Interactions

In [121]:
formulas = list()
for var_set in indp_vars:
    formulas.append(get_form_w_2ndO_interactions(var_set, "deaths"))

lrms = list()
for formula in formulas:
    lrm = sm.ols(formula = formula, data=abbrev).fit()
    lrms.append(lrm)

In [122]:
print(lrms[0].summary())

                            OLS Regression Results                            
Dep. Variable:                 deaths   R-squared:                       0.994
Model:                            OLS   Adj. R-squared:                  0.991
Method:                 Least Squares   F-statistic:                     452.2
Date:                Thu, 23 Apr 2020   Prob (F-statistic):               0.00
Time:                        11:23:19   Log-Likelihood:                -4039.5
No. Observations:                 823   AIC:                             8499.
Df Residuals:                     613   BIC:                             9489.
Df Model:                         209                                         
Covariance Type:            nonrobust                                         
                                             coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------

In [155]:
print("{} variables with significant coefficients\n".format(len(lrms[0].pvalues[lrms[0].pvalues < 0.05])))

names = get_var_names_from_formula(formulas[0])
names.insert(0, "Intercept")

print("{:50} {:>6} {:>8}".format("VARIABLE", "P-VAL", "COEFF"))
for a,b,c in zip(lrms[0].pvalues, names, lrms[0].params):
    if abs(a) < 0.05:
        print("{:50} {:>6.3f} {:>8.3f}".format(b, a, c))

52 variables with significant coefficients

VARIABLE                                            P-VAL    COEFF
Intercept                                           0.000   28.996
household_size                                      0.036    6.784
empl_agriculture                                    0.001  -25.746
empl_professional                                   0.003  -12.556
prc_fam_poverty                                     0.000   18.967
avg_income                                          0.023   12.593
prc_public_transp                                   0.016  -23.098
population                                          0.000   80.370
domestic_passengers                                 0.001  -32.301
intl_passengers                                     0.000  122.723
ten_plus                                            0.024    8.986
empl_services:domestic_passengers                   0.000   33.290
empl_manufacturing:prc_public_transp                0.003  -27.734
household_size:dom

In [199]:
def print_result(model, form):
    print(model.summary())
    
    print("\n{} variables with significant coefficients\n".format(len(model.pvalues[model.pvalues < 0.05])))
          
    names = get_var_names_from_formula(form)
    names.insert(0, "Intercept")
    print("{:50} {:>6} {:>8}".format("VARIABLE", "P-VAL", "COEFF"))
    for a,b,c in zip(model.pvalues, names, model.params):
        if abs(a) < 0.05:
            print("{:50} {:>6.3f} {:>8.3f}".format(b, a, c))

def print_results(models, forms, num):
    print_result(models[num], forms[num])

In [200]:
print_results(lrms, formulas, 1)

# use jupyter notebook cell options to scroll the output (right click and "enable scrolling for outputs")

                            OLS Regression Results                            
Dep. Variable:             log_deaths   R-squared:                       0.796
Model:                            OLS   Adj. R-squared:                  0.710
Method:                 Least Squares   F-statistic:                     9.288
Date:                Thu, 23 Apr 2020   Prob (F-statistic):          6.34e-106
Time:                        12:13:24   Log-Likelihood:                -850.36
No. Observations:                 823   AIC:                             2189.
Df Residuals:                     579   BIC:                             3339.
Df Model:                         243                                         
Covariance Type:            nonrobust                                         
                                             coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------

In [164]:
print_results(lrms, formulas, 2)

                            OLS Regression Results                            
Dep. Variable:                 deaths   R-squared:                       0.950
Model:                            OLS   Adj. R-squared:                  0.932
Method:                 Least Squares   F-statistic:                     55.01
Date:                Thu, 23 Apr 2020   Prob (F-statistic):          7.71e-301
Time:                        11:43:12   Log-Likelihood:                -4885.1
No. Observations:                 823   AIC:                         1.019e+04
Df Residuals:                     612   BIC:                         1.119e+04
Df Model:                         210                                         
Covariance Type:            nonrobust                                         
                                                  coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------

In [165]:
print_results(lrms, formulas, 3)

                            OLS Regression Results                            
Dep. Variable:                 deaths   R-squared:                       0.975
Model:                            OLS   Adj. R-squared:                  0.964
Method:                 Least Squares   F-statistic:                     92.04
Date:                Thu, 23 Apr 2020   Prob (F-statistic):               0.00
Time:                        11:44:22   Log-Likelihood:                -4598.9
No. Observations:                 823   AIC:                             9688.
Df Residuals:                     578   BIC:                         1.084e+04
Df Model:                         244                                         
Covariance Type:            nonrobust                                         
                                                  coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------

## Linear Model, Main Effects + 2nd Order Interactions, Transformed Target

In [166]:
formulas = list()
for var_set in indp_vars:
    formulas.append(get_form_w_2ndO_interactions(var_set, "log_deaths"))

lrms = list()
for formula in formulas:
    lrm = sm.ols(formula = formula, data=abbrev).fit()
    lrms.append(lrm)

In [167]:
print_results(lrms, formulas, 0)

                            OLS Regression Results                            
Dep. Variable:             log_deaths   R-squared:                       0.782
Model:                            OLS   Adj. R-squared:                  0.708
Method:                 Least Squares   F-statistic:                     10.52
Date:                Thu, 23 Apr 2020   Prob (F-statistic):          3.33e-115
Time:                        11:45:46   Log-Likelihood:                -877.49
No. Observations:                 823   AIC:                             2175.
Df Residuals:                     613   BIC:                             3165.
Df Model:                         209                                         
Covariance Type:            nonrobust                                         
                                             coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------

In [168]:
print_results(lrms, formulas, 1)

                            OLS Regression Results                            
Dep. Variable:             log_deaths   R-squared:                       0.796
Model:                            OLS   Adj. R-squared:                  0.710
Method:                 Least Squares   F-statistic:                     9.288
Date:                Thu, 23 Apr 2020   Prob (F-statistic):          6.34e-106
Time:                        11:46:43   Log-Likelihood:                -850.36
No. Observations:                 823   AIC:                             2189.
Df Residuals:                     579   BIC:                             3339.
Df Model:                         243                                         
Covariance Type:            nonrobust                                         
                                             coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------

In [169]:
print_results(lrms, formulas, 2)

                            OLS Regression Results                            
Dep. Variable:             log_deaths   R-squared:                       0.776
Model:                            OLS   Adj. R-squared:                  0.699
Method:                 Least Squares   F-statistic:                     10.08
Date:                Thu, 23 Apr 2020   Prob (F-statistic):          2.43e-111
Time:                        11:47:35   Log-Likelihood:                -888.97
No. Observations:                 823   AIC:                             2200.
Df Residuals:                     612   BIC:                             3194.
Df Model:                         210                                         
Covariance Type:            nonrobust                                         
                                                  coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------

In [337]:
def backward_selected(data, formula):
    """adapted from https://planspace.org/20150423-forward_selection_with_statsmodels/
    """
    had_reduction = True
    remaining = set(get_var_names_from_formula(formula))
    removed = list()
    
    current_score, best_new_score = -np.inf, -np.inf
    working_formula = re.sub(" +", " ", formula)
    
    a = 0
    while had_reduction and a < 5:# and current_score == best_new_score:
        scores_with_candidates = []
        for candidate in remaining:
            temp_formula = re.sub(("\+? " + candidate.strip() + " "), "", working_formula)
            temp_formula = re.sub(("\+ +" + candidate + "$"), "", working_formula)
            score = sm.ols(temp_formula, data).fit().rsquared_adj
            scores_with_candidates.append((score, candidate))
        scores_with_candidates.sort()
        best_new_score, best_candidate = scores_with_candidates.pop()
        if current_score < best_new_score:
            remaining.remove(best_candidate)
            had_reduction = True
            removed.append((best_candidate, best_new_score))
            current_score = best_new_score
            working_formula = re.sub("\+? " + best_candidate + " ", " ", working_formula)
            working_formula = re.sub("\+? " + best_candidate + "$", " ", working_formula)
            working_formula = re.sub("~ \+", "~", working_formula)
        else:
            had_reduction = False
        a = a + 1
    
    formula = working_formula
    model = sm.ols(formula, data).fit()
    print("{} variable(s) removed".format(len(removed)))
    return (model, removed, formula)

In [331]:
# iteratively remove non-significant variables
def remove_nonsig(data, form):
    removed = list()
    removed_intercept = False
    max_pvalue = 0.9999
    working_formula = form
    working_formula = re.sub("~ \+", "~", working_formula)
    
    total_vars = len(get_var_names_from_formula(formula))
    while max_pvalue >= 0.05 and total_vars >= 1:
        mod = sm.ols(working_formula, data).fit()
        
        max_pvalue = max(mod.pvalues)
        feats = get_var_names_from_formula(working_formula)
        if not removed_intercept:
            feats.insert(0, "Intercept")
        var_name = feats[list(mod.pvalues).index(max(mod.pvalues))]
        removed.append((var_name.strip(), max_pvalue))
        if var_name == "Intercept":
            print("intercept\t", removed_intercept)
            removed_intercept = True
            matched = re.match("\d$", working_formula)
            if matched:
                continue
            else:
                working_formula = working_formula + " -1"
        else:
            working_formula = re.sub("\+? "+var_name + " ", " ", working_formula)
            working_formula = re.sub("\+? "+var_name+"$", "", working_formula).strip()
            working_formula = re.sub("~ \+", "~", working_formula)
        total_vars = len(get_var_names_from_formula(working_formula))
        #print("\t", len(working_formula), "\t", working_formula[0:30])
        
    print("{} variables removed".format(len(removed)))
    mod = sm.ols(working_formula, data).fit()    
    return (mod, removed, working_formula)       

In [338]:
step, path, new_form = backward_selected(abbrev, formulas[2])

1 variable(s) removed


In [339]:
red, path2, new_form2 = remove_nonsig(abbrev, new_form)

172 variables removed


In [341]:
print(red.summary())

                            OLS Regression Results                            
Dep. Variable:             log_deaths   R-squared:                       0.724
Model:                            OLS   Adj. R-squared:                  0.711
Method:                 Least Squares   F-statistic:                     55.65
Date:                Thu, 23 Apr 2020   Prob (F-statistic):          3.29e-192
Time:                        14:46:35   Log-Likelihood:                -974.48
No. Observations:                 823   AIC:                             2025.
Df Residuals:                     785   BIC:                             2204.
Df Model:                          37                                         
Covariance Type:            nonrobust                                         
                                         coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
Inte

In [342]:
step, path, new_form = backward_selected(abbrev, formulas[0])
red, path2, new_form2 = remove_nonsig(abbrev, new_form)
print(red.summary())

1 variable(s) removed
160 variables removed
                            OLS Regression Results                            
Dep. Variable:             log_deaths   R-squared:                       0.738
Model:                            OLS   Adj. R-squared:                  0.721
Method:                 Least Squares   F-statistic:                     44.35
Date:                Thu, 23 Apr 2020   Prob (F-statistic):          2.18e-190
Time:                        14:48:24   Log-Likelihood:                -953.63
No. Observations:                 823   AIC:                             2007.
Df Residuals:                     773   BIC:                             2243.
Df Model:                          49                                         
Covariance Type:            nonrobust                                         
                                             coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------