In [41]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from math import sqrt
from IPython.display import display_html
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.ensemble import RandomForestRegressor
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.stats import chi2_contingency
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn import datasets
import pickle

pd.set_option('display.float_format', lambda x: '%.5f' % x)

pd.set_option("display.max_rows", 8)
pd.set_option("display.max_columns", 20)

def frange(start, stop, step):
    i = start
    while i < stop:
        yield round(i, ndigits=2)
        i += step
        
def save_model(model, filepath):
    with open(filepath, 'wb') as file:
        pickle.dump(model, file)
        
def load_model(filepath):
    with open(filepath, 'rb') as file:
        return pickle.load(file)

# Data Preperation

***I remove excess columns that ***

In [2]:
cases_US = pd.read_csv("../data/US_all_vars.csv").iloc[:,1:]
cases_US['Date'] = pd.to_datetime(cases_US['Date'], cache=True)
cases_US['Date'] = cases_US['Date'].apply(lambda x:x.toordinal())
cases_US = cases_US.drop(['FIPS', 'Country_Region', 'Total_Cases', 'County', 'State', 'County_FIPS',
                          'Phase.0', 'Phase.1', 'Phase.2', 'Phase.3', 'Abbreviation', 'Month',
                          'Lat', 'Long'],axis=1).dropna()

In [3]:
pd.set_option("display.max_rows", None)
def calc_vif(X, thresh=5.0):
    # Calculating VIF
    X_numeric = X.select_dtypes(['float64', 'int64'])
    vif = pd.DataFrame()
    variables = list(range(X_numeric.shape[1]))
    dropped = pd.DataFrame(columns=['variable','VIF'])
    
    while True:
        vif = pd.DataFrame()
        vif["variable"] = X_numeric.iloc[:, variables].columns
        vif["VIF"] = [variance_inflation_factor(X_numeric.iloc[:, variables].values, i) 
                  for i in range(X_numeric.iloc[:, variables].shape[1])]
        max_i = vif['VIF'].idxmax()
        if(vif['VIF'].max() > thresh):
            dropped = dropped.append({
                'variable': X_numeric.iloc[:, variables].columns[max_i],
                'VIF': vif.loc[max_i, "VIF"]
            }, ignore_index=True)
            variables.pop(max_i)
            continue
        break

    return vif, dropped

vif, dropped = calc_vif(cases_US)

In [4]:
df1_styler = (vif.style
                  .set_table_attributes("style='display:inline'")
                  .set_caption('US VIF Values'))
df2_styler = (dropped.style
                  .set_table_attributes("style='display:inline'")
                  .set_caption('Variables removed for high multicolinearity'))

display_html(df1_styler._repr_html_()+df2_styler._repr_html_(), raw=True)

Unnamed: 0,variable,VIF
0,Current_Phase,1.079262
1,Cases_2W,2.592251
2,Cases_Delta,2.342335
3,Protest_Count,3.35517
4,Perc.Over.65,3.693477
5,Perc.Black,1.337205
6,Perc.Native,1.439563
7,Perc.Asian,2.611873
8,Perc.Pac.Island,1.729726
9,Perc.Mixed,4.953321

Unnamed: 0,variable,VIF
0,Date,2068758.493965
1,Perc.White,656.103652
2,Perc.Female,263.35422
3,Avg.Person.Per.Household,36.226167
4,Avg_Temp,20.635085
5,Perc.Foreign.Born,7.35821


In [5]:
drop = list(dropped['variable'])
drop.append('Cases_2W')
x, y = cases_US.drop(drop,axis=1).select_dtypes(['float64', 'int64']), cases_US['Cases_2W']

# 70% training and 30% test
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3,random_state=109) 

In [6]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'bootstrap': bootstrap}

rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100,
                               cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 40 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 52.3min
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed: 191.7min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators='warn',
                                                   n_jobs=None, oob_score=False,
                                                   random_state=N

In [17]:
%store rf_random

Stored 'rf_random' (RandomizedSearchCV)


In [18]:
%store -r rf_random
model = rf_random.best_estimator_

In [43]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    print('Model Performance')
    print('Average RMSE: {:0.4f} cases.'.format(sqrt(mean_squared_error(predictions, test_labels))))
    print('Average MAE: {:0.4f} cases.'.format(mean_absolute_error(predictions, test_labels)))
    print('R2 = {:0.2f}%.'.format(r2_score(predictions, test_labels)))

evaluate(model, X_test, y_test)

Model Performance
Average RMSE: 357.8441 cases.
Average MAE: 41.0963 cases.
R2 = 0.96%.


In [53]:
# save_model(rf_random, "../models/random_forest_cv.sav")
# temp = load_model("../models/random_forest_cv.sav")
temp.cv_results_

KeyError: 'split0_test_precision'

In [27]:
importances = list(model.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(list(cases_US.columns), importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: Restriction Rating   Importance: 0.32
Variable: Date                 Importance: 0.18
Variable: Governer.Party       Importance: 0.16
Variable: Perc.Female          Importance: 0.11
Variable: Perc.Native          Importance: 0.08
Variable: Avg_Temp             Importance: 0.03
Variable: Perc.Black           Importance: 0.03
Variable: Current_Phase        Importance: 0.02
Variable: Cases_2W             Importance: 0.02
Variable: Perc.Over.65         Importance: 0.02
Variable: Perc.White           Importance: 0.02
Variable: Cases_Delta          Importance: 0.01
Variable: Protest_Count        Importance: 0.01
