# Modeling

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import shap
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV, RepeatedKFold
from sklearn.metrics import r2_score, mean_squared_error as mse, mean_absolute_error as mae, SCORERS
from sklearn.dummy import DummyRegressor
from catboost import Pool, CatBoostRegressor, cv
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from hyperopt import fmin, tpe, hp, Trials, space_eval

In [None]:
#load dataframe REGULAR
#parse datetime column
#df=pd.read_csv('COVID19_modeling.csv', parse_dates=[0])
#df.set_index('date', inplace= True)
#df.drop(columns='Unnamed: 0', inplace=True)

In [2]:
# Load the one-hot-encoded modeling data; the 'date' column is parsed as datetimes.
# The 'Unnamed: 0' column is discarded and 'date' becomes the index.
df_ohe = (
    pd.read_csv('ohe_data.csv', parse_dates=['date'])
    .drop(columns='Unnamed: 0')
    .set_index('date')
)

In [3]:
df_ohe.head()

Unnamed: 0_level_0,Avg_Temp(F),Conf_Cases,day_of_week,day_of_year,Year,Month,Day,new_case_percent_pop*,state_id_CT,state_id_MA,state_id_ME,state_id_VT
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2020-03-01,26.42,1.0,6,61,2020,3,1,0.014225,0,1,0,0
2020-03-02,36.5,1.0,0,62,2020,3,2,0.014225,0,1,0,0
2020-03-03,55.94,1.0,1,63,2020,3,3,0.014225,0,1,0,0
2020-03-04,46.94,2.0,2,64,2020,3,4,0.02845,0,1,0,0
2020-03-05,42.98,8.0,3,65,2020,3,5,0.113799,0,1,0,0


## Train-test split

Before training any models on the data, let's do a train-test split to keep training and testing data consistent and separated.

In [None]:
# Create features
#X, y = df_ohe.drop(columns=['Conf_Cases', 'new_case_percent_pop*']), df_ohe['new_case_percent_pop*']
# Make test and train split
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)

In [4]:
len(df_ohe)

2224

In [5]:
bins = np.linspace(0, 2224, 200)

In [6]:
# Create features
X, y = df_ohe.drop(columns=['Conf_Cases', 'new_case_percent_pop*']), df_ohe['new_case_percent_pop*']

In [7]:
# Bin the continuous target so it can drive a stratified train/test split.
# Quantile (equal-frequency) bins are derived from y itself; the `bins` array
# defined earlier has edges spanning 0..len(df_ohe), which is the row count and
# not the range of the target. duplicates='drop' merges bins whose edges
# coincide (e.g. many identical target values), so every bin stays populated.
# NOTE(review): assumes y contains no NaN values - confirm.
y_binned = pd.qcut(y, q=10, labels=False, duplicates='drop').to_numpy()

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y_binned, random_state=42)

In [None]:
# Make test and train split
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify =y_binned, random_state=42)

## Using the mean as a baseline prediction model

Previously, we determined the R2 score of using the mean to predict COVID19 cases for each individual state. Let's do the same thing now that we have all the states in one DataFrame, so that we will have a baseline "dummy" model to compare our future optimized models to. 

In [None]:
# Baseline: a DummyRegressor with default settings, "trained" on the training split
dummy_mean = DummyRegressor().fit(X_train, y_train)
# .score() reports R2 on the held-out split
score_dummy = dummy_mean.score(X_test, y_test)
print("The R2 score of using the mean to predict COVID19 cases in our states is:", score_dummy)

Let's store the evaluation metric values for the dummy regressor so we can compare them with our future models. 

In [None]:
# Baseline predictions on the test split and the metrics used to compare later models
dummy_pred = dummy_mean.predict(X_test)
dummy_r2 = r2_score(y_test, dummy_pred)
dummy_mse = mse(y_test, dummy_pred)
# RMSE is the square root of the MSE computed on the line above
dummy_rmse = np.sqrt(dummy_mse)

## Tuning the top performing models for ensemble model

In the pre-processing step, we determined (with the help of Pycaret) that our top performing models were **CatBoost Regressor**, **Random Forest Regressor**, and **Extra Trees Regressor**. Let's now fine tune the hyperparameters of each of these models, in preparation for feeding them into the pipeline of the Voting Regressor. 

**NOTE TO SELF: Determine hyperparameters for each model, but pass unfitted (tuned) models to the VotingRegressor**


### 1. Random Forest Regressor
With a few exceptions, a RandomForestRegressor has all the hyperparameters of a DecisionTreeRegressor (to control how trees are grown), plus all the hyperparameters of a BaggingRegressor to control the ensemble itself. I'll explore a couple of different ways to determine optimal hyperparameters and will choose those that produce the best R2 and MSE for the final ensemble model (a VotingRegressor).

In [None]:
# Define parameters to search for GridSearchCV
basic_param_grid = {'n_estimators': [100, 300, 500, 900, 1200],
              'max_depth': [3, 5, 20, 50, 100],
              }
# Instantiate RandomForestRegressor
basic_rf = RandomForestRegressor(random_state=42)
cv_rf = GridSearchCV(basic_rf, basic_param_grid, cv = 5)
cv_rf_fit = cv_rf.fit(X_train, y_train)

In [None]:
print('The optimal max_depth for the RandomForestRegressor is: {}'.format(cv_rf_fit.best_params_['max_depth']))
print('The optimal n_estimators for the RandomForestRegressor is: {}'.format(cv_rf_fit.best_params_['n_estimators']))

In [None]:
# Refit a RandomForestRegressor with fixed hyperparameters and predict on the test split.
# NOTE(review): max_depth=50 / n_estimators=300 are hard-coded here - presumably
# copied from an earlier grid-search run; confirm they match cv_rf_fit.best_params_.
basic_rf = RandomForestRegressor(n_estimators=300, max_depth=50, random_state=42).fit(X_train, y_train)
basic_rf_pred = basic_rf.predict(X_test)

In [None]:
# Calculate evaluation metrics on basic_rf
basic_rf_r2 = r2_score(y_test, basic_rf_pred)
basic_rf_mse = mse(y_test, basic_rf_pred)
basic_rf_rmse = np.sqrt(basic_rf_mse)

In [None]:
rf_results = pd.DataFrame({'Model':['dummy_reg', 'basic_rf'], 'R2': [dummy_r2, basic_rf_r2], 'MSE':[dummy_mse, basic_rf_mse], 'RMSE':[dummy_rmse, basic_rf_rmse]})

In [None]:
rf_results

***

#### Using RandomizedSearchCV to find optimal hyperparameter values

In [None]:
# Candidate hyperparameter values for the randomized search over RandomForestRegressor.

# Number of trees in random forest: 100, 200, ..., 1000
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
# Number of features to consider at every split.
# 1.0 (= all features) is what 'auto' meant for regressors; the 'auto' alias is
# deprecated in scikit-learn 1.1 and removed in 1.3, so the explicit value is used.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree: 20, 30, ..., 120, plus None (unlimited depth)
max_depth = [int(x) for x in np.linspace(20, 120, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 10]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
# Use the random grid to search for the best hyperparameters.
# Base model to tune; seeded so that repeated runs of the search are comparable.
rf = RandomForestRegressor(random_state=42)
# Randomized search: 100 sampled parameter combinations, 5-fold cross-validation,
# all available cores (n_jobs=-1).
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 5, verbose=2, n_jobs=-1, random_state=42)
# Fit the random search model
rf_random.fit(X_train, y_train)

In [None]:
rf_random.best_params_

In [None]:
# Instantiate the RandomForestRegressor with the hyperparameters chosen by the randomized search.
# max_features=1.0 (all features) is the explicit equivalent of the deprecated
# 'auto' alias for regressors.
rf2 = RandomForestRegressor(n_estimators= 900,
                             min_samples_split= 2,
                             min_samples_leaf= 2,
                             max_features= 1.0,
                             max_depth= 90,
                             bootstrap= True, 
                             random_state=42)
rf2.fit(X_train, y_train)
rf_pred2 = rf2.predict(X_test)

In [None]:
rf_random_r2 = r2_score(y_test, rf_pred2)
rf_random_mse = mse(y_test, rf_pred2)
rf_random_rmse = np.sqrt(mse(y_test, rf_pred2))

In [None]:
rf_results.loc[len(rf_results)]= ['rf_randomCV', 
                                 r2_score(y_test, rf_pred2),
                                 mse(y_test, rf_pred2),
                                 np.sqrt(mse(y_test, rf_pred2))]

In [None]:
rf_results

***

### Using GridSearchCV to find the optimal hyperparameter values

In [None]:
# Create the parameter grid based on the results of random search.
# max_features uses 1.0 (all features), the explicit equivalent of the
# deprecated 'auto' alias for regressors.
GS_param_grid = {
    'bootstrap': [True],
    'max_depth': [70, 80, 90, 100, 110],
    'max_features': [1.0],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [2, 3],
    'n_estimators': [700, 800, 900, 1000, 1100]
}

In [None]:
# Create a base model; seeded so candidate scores differ only because of the
# hyperparameters, not because of forest randomness.
rf = RandomForestRegressor(random_state=42)
# Instantiate the grid search model (5-fold CV, all cores)
grid_search = GridSearchCV(estimator = rf, param_grid = GS_param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2)

In [None]:
# Fit the grid search to the data
grid_search.fit(X_train, y_train)

In [None]:
grid_search.best_params_

In [None]:
# Instantiate the RandomForestRegressor with the hyperparameters chosen by the grid search.
# max_features=1.0 replaces the deprecated 'auto' alias (same behaviour for
# regressors); random_state makes the reported metrics reproducible.
rf3 = RandomForestRegressor(n_estimators= 800,
                             min_samples_split= 3,
                             min_samples_leaf= 2,
                             max_features= 1.0,
                             max_depth= 90,
                             bootstrap= True,
                             random_state=42)
rf3.fit(X_train, y_train)
rf_pred3 = rf3.predict(X_test)

In [None]:
# Determine R2 score
print('The R2 score for the RandomForestRegressor with GSCV tuning is {}'.format(r2_score(y_test, rf_pred3)))

In [None]:
print('The MSE of the RandomForestRegressor with GSCV tuning is {}'.format(mse(y_test, rf_pred3)))

In [None]:
rf_results.loc[len(rf_results)]= ['rf_gridCV', 
                                 r2_score(y_test, rf_pred3),
                                 mse(y_test, rf_pred3),
                                 np.sqrt(mse(y_test, rf_pred3))]

In [None]:
rf_results

### Bayesian Optimization with hyperopt

In [None]:
#pip install hyperopt

In [None]:
trials = Trials()

In [None]:
space = {
    "n_estimators": hp.choice("n_estimators", [400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400]),
    "max_depth": hp.quniform("max_depth", 30, 120, 10),
}

In [None]:
# define objective function

def hyperparameter_tuning(params):
    """hyperopt objective for tuning a RandomForestRegressor.

    Parameters
    ----------
    params : dict
        Hyperparameters sampled from the hyperopt search space; passed to
        RandomForestRegressor as keyword arguments.

    Returns
    -------
    float
        The negated mean cross-validated R2 on the training split
        (hyperopt minimises, so a higher R2 gives a lower loss).
    """
    params = dict(params)  # work on a copy; don't mutate hyperopt's sample
    # hp.quniform yields floats (e.g. 80.0) but these parameters must be integers
    for name in ("n_estimators", "max_depth"):
        if params.get(name) is not None:
            params[name] = int(params[name])
    # Seed the forest so differences between trials come from the hyperparameters
    params.setdefault("random_state", 42)
    rf_reg = RandomForestRegressor(**params,n_jobs=-1)
    r2 = cross_val_score(rf_reg, X_train, y_train ,scoring="r2").mean()
    return -r2

In [None]:
# Run 100 TPE-guided evaluations of the objective over `space`
best = fmin(
    fn=hyperparameter_tuning,
    space = space, 
    algo=tpe.suggest, 
    max_evals=100, 
    trials=trials
)

# fmin reports hp.choice parameters as the *index* of the selected option;
# space_eval maps the result back to the actual hyperparameter values.
print("Best: {}".format(space_eval(space, best)))

In [None]:
# Instantiate the RandomForestRegressor with the hyperparameters found by hyperopt.
# random_state is fixed so the reported metrics are reproducible.
rf4 = RandomForestRegressor(n_estimators= 400,
                             max_depth= 80,
                             random_state=42)

rf4.fit(X_train, y_train)
rf_pred4 = rf4.predict(X_test)

In [None]:
rf_results.loc[len(rf_results)]= ['rf_bayes_hyperopt', 
                                 r2_score(y_test, rf_pred4),
                                 mse(y_test, rf_pred4),
                                 np.sqrt(mse(y_test, rf_pred4))]

In [None]:
rf_results

**?** $\star$ It looks like the optimal hyperparameters chosen by RandomizedSearchCV performed the best, both in terms of the R2 and mean squared error, so we'll use those hyperparameters (as defined in `rf2`) in our final Voting Regressor ensemble.

***

## 2. Extra Trees Regressor

### Using basic hyperparameter tuning only

In [None]:
# Define parameters to search for GridSearchCV
param_grid = {'n_estimators': [100, 300, 500, 900, 1200],
              'max_depth': [3, 5, 20, 50, 100],
              }
# Instantiate ExtraTreesRegressor
et = ExtraTreesRegressor(random_state=42)
cv_et = GridSearchCV(et, param_grid, cv = 5)
cv_et_fit = cv_et.fit(X_train, y_train)

#Print the best parameter
print(cv_et_fit.best_params_)

In [None]:
print('The optimal max_depth for the ExtraTreesRegressor is: {}'.format(cv_et_fit.best_params_['max_depth']))
print('The optimal n_estimators for the ExtraTreesRegressor is: {}'.format(cv_et_fit.best_params_['n_estimators']))

In [None]:
# Instantiate the ExtraTreesRegressor with fixed hyperparameters.
# (The original cell built a RandomForestRegressor here, so the "basic_et"
# results were actually those of a random forest.)
et_basic = ExtraTreesRegressor(max_depth = 50, n_estimators = 1200, random_state=42)
et_basic.fit(X_train, y_train)
et_basic_pred = et_basic.predict(X_test)

In [None]:
# Determine R2 score
print('The R2 score for the basic Extra Trees Regressor is {}'.format(r2_score(y_test, et_basic_pred)))

In [None]:
print('The MSE of the basic ExtraTreesRegressor is {}'.format(mse(y_test, et_basic_pred)))

In [None]:
basic_et_r2 = r2_score(y_test, et_basic_pred)
basic_et_mse = mse(y_test, et_basic_pred)
basic_et_rmse = np.sqrt(mse(y_test, et_basic_pred))

In [None]:
et_results = pd.DataFrame({'Model':['dummy_reg', 'basic_et'], 'R2': [dummy_r2, basic_et_r2], 'MSE':[dummy_mse, basic_et_mse], 'RMSE':[dummy_rmse, basic_et_rmse]})

In [None]:
et_results

***

### Using RandomizedSearchCV to find optimal hyperparameter values

In [None]:
# Candidate hyperparameter values for the randomized search over ExtraTreesRegressor.

# Number of trees in the ensemble: 200, 400, ..., 2000
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 1.0 (= all features) is the explicit equivalent of the deprecated 'auto'
# alias for regressors.
max_features = [1.0]
# Maximum number of levels in tree: 30, 40, ..., 140, plus None (unlimited depth)
max_depth = [int(x) for x in np.linspace(30, 140, num = 12)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 10]
# Method of selecting samples for training each tree
bootstrap = [True]

# Create the random grid
random_grid_et = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Show the Extra Trees grid just built (not the random-forest grid `random_grid`)
print(random_grid_et)

In [None]:
# Use the Extra Trees random grid to search for the best hyperparameters.
# First create the base model to tune
et = ExtraTreesRegressor(random_state=42)
# Randomized search: 100 sampled parameter combinations, 5-fold cross-validation,
# all available cores. The distributions come from random_grid_et - the grid
# built for Extra Trees - rather than the random-forest grid `random_grid`.
et2 = RandomizedSearchCV(estimator = et, param_distributions = random_grid_et, n_iter = 100, cv = 5, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
et2.fit(X_train, y_train)

In [None]:
et2.best_params_

In [None]:
# Instantiate the ExtraTreesRegressor with the hyperparameters chosen by the randomized search.
# max_features=1.0 replaces the deprecated 'auto' alias (same behaviour for
# regressors); random_state makes the reported metrics reproducible.
et_random = ExtraTreesRegressor(n_estimators= 300,
                             min_samples_split= 2,
                             min_samples_leaf= 1,
                             max_features= 1.0,
                             max_depth= 120,
                             bootstrap= True,
                             random_state=42)
et_random.fit(X_train, y_train)
et_random_pred = et_random.predict(X_test)

In [None]:
# Determine R2 score
print('The R2 score for the ExtraTreesRegressor with RSCV tuning is {}'.format(r2_score(y_test, et_random_pred)))

In [None]:
print('The MSE of the ExtraTreesRegressor with RSCV tuning is {}'.format(mse(y_test, et_random_pred)))

In [None]:
et_results.loc[len(et_results)]= ['et_randomCV', 
                                 r2_score(y_test, et_random_pred),
                                 mse(y_test, et_random_pred),
                                 np.sqrt(mse(y_test, et_random_pred))]

In [None]:
et_results

#### Using GridSearchCV to find the optimal hyperparameter values

In [None]:
# Create the parameter grid based on the results of random search.
# max_depth must be an integer: range() yields 40, 50, ..., 150 as ints, whereas
# np.linspace(40, 150, 12) produced the same values as floats.
# max_features=1.0 is the explicit equivalent of the deprecated 'auto' alias.
GS_param_grid_et = {
    'bootstrap': [True],
    'max_depth': list(range(40, 151, 10)),
    'max_features': [1.0],
    'min_samples_leaf': [2, 3],
    'min_samples_split': [2, 3, 4],
    'n_estimators': [300, 500, 800, 1000, 1200, 1600]
}

In [None]:
# Create a base model
et = ExtraTreesRegressor()
# Instantiate the grid search model
et3 = GridSearchCV(estimator = et, param_grid = GS_param_grid_et, 
                          cv = 5, n_jobs = -1, verbose = 2)

In [None]:
# Fit the grid search to the data
et3.fit(X_train, y_train)

In [None]:
et3.best_params_

In [None]:
# Instantiate the ExtraTreesRegressor with the hyperparameters chosen by the grid search.
# max_features=1.0 replaces the deprecated 'auto' alias (same behaviour for
# regressors); random_state makes the reported metrics reproducible.
et_grid = ExtraTreesRegressor(n_estimators= 500,
                             min_samples_split= 4,
                             min_samples_leaf= 2,
                             max_features= 1.0,
                             max_depth= 70,
                             bootstrap= True,
                             random_state=42)
et_grid.fit(X_train, y_train)
et_grid_pred = et_grid.predict(X_test)

In [None]:
# Determine R2 score
print('The R2 score for the Extra Trees Regressor with GSCV is {}'.format(r2_score(y_test, et_grid_pred)))

In [None]:
print('The MSE of the ExtraTreesRegressor with GSCV is {}'.format(mse(y_test, et_grid_pred)))

In [None]:
et_results.loc[len(et_results)]= ['et_gridCV', 
                                 r2_score(y_test, et_grid_pred),
                                 mse(y_test, et_grid_pred),
                                 np.sqrt(mse(y_test, et_grid_pred))]

In [None]:
et_results 

### Bayesian Optimization with hyperopt

In [None]:
trials_et = Trials()

In [None]:
space_et = {
    "n_estimators": hp.choice("n_estimators", [400, 500, 600, 700, 800, 900, 1000, 1100, 1200]),
    "max_depth": hp.quniform("max_depth", 30, 120, 10),
}

In [None]:
def hyperparameter_tuning_et(params):
    """hyperopt objective for tuning an ExtraTreesRegressor.

    The objective used in the random-forest section builds a
    RandomForestRegressor, so reusing it here would tune the wrong model.

    Parameters
    ----------
    params : dict
        Hyperparameters sampled from `space_et`.

    Returns
    -------
    float
        The negated mean cross-validated R2 on the training split.
    """
    params = dict(params)  # work on a copy; don't mutate hyperopt's sample
    # hp.quniform yields floats (e.g. 80.0) but these parameters must be integers
    for name in ("n_estimators", "max_depth"):
        if params.get(name) is not None:
            params[name] = int(params[name])
    params.setdefault("random_state", 42)
    et_candidate = ExtraTreesRegressor(**params, n_jobs=-1)
    return -cross_val_score(et_candidate, X_train, y_train, scoring="r2").mean()


best_et = fmin(
    fn=hyperparameter_tuning_et,
    space = space_et, 
    algo=tpe.suggest, 
    max_evals=100, 
    trials=trials_et
)

# Report the Extra Trees result (best_et), decoding hp.choice indices back to
# the actual hyperparameter values.
print("Best: {}".format(space_eval(space_et, best_et)))

In [None]:
# Instantiate RFR with optimal hyperparameters
et4 = ExtraTreesRegressor(n_estimators= 400,
                             max_depth= 80)
                             
et4.fit(X_train, y_train)
et_pred4 = et4.predict(X_test)

In [None]:
# Append the hyperopt-tuned Extra Trees metrics as the next row of et_results.
# The row label must come from et_results itself; len(rf_results) belongs to a
# different table and can skip or overwrite a row.
et_results.loc[len(et_results)]= ['et_bayes_hyperopt', 
                                 r2_score(y_test, et_pred4),
                                 mse(y_test, et_pred4),
                                 np.sqrt(mse(y_test, et_pred4))]

In [None]:
et_results

It seems that here the parameters suggested by RandomizedSearchCV (the `et_randomCV` row, fitted as `et_random`) produced the best MSE and R2, so we'll use the hyperparameters from `et_random`.

## 3. CatBoost Regressor

CatBoost is an open-sourced gradient boosting library. 
* 1. run CatBoost without tuning any hyperparameters to get baseline mse and r2
* 2. RandomizedSearchCV for hyperparameters
* 3. GridSearchCV for hyperparameters

* [Pool documentation](https://catboost.ai/en/docs/concepts/python-reference_pool)
* [CatboostRegressor documentation](https://catboost.ai/en/docs/concepts/python-reference_catboostregressor)

### WITHOUT POOL

In [None]:
cat_features_names = ['Month', 'day_of_week', 'state_id_MA', 'state_id_CT', 'state_id_VT', 'state_id_ME'] # here we specify names of categorical features
cat_features = [X.columns.get_loc(col) for col in cat_features_names]
print(cat_features)

In [None]:
# Baseline CatBoostRegressor configuration (no hyperparameter tuning).
# This `params` dict is also reused by later cells in this notebook.
params = {'loss_function':'RMSE',
          'eval_metric':'R2',
          'cat_features': cat_features,
          'verbose': 200,
          'early_stopping_rounds': 200,
          'random_seed': 42
         }
cbc_1 = CatBoostRegressor(**params)
# NOTE(review): the test split is passed as eval_set together with
# use_best_model=True and early stopping, so the test data takes part in model
# selection; metrics computed on X_test afterwards are likely optimistic.
# Consider carving a validation split out of the training data instead.
cbc_1.fit(X_train, y_train,
          eval_set=(X_test, y_test),
          use_best_model=True,
          plot=True
         );

In [None]:
pred_cbc1= cbc_1.predict(X_test)

In [None]:
r2_cbc1 = r2_score(y_test, pred_cbc1)
rmse_cbc1 = np.sqrt(mse(y_test, pred_cbc1))
mse_cbc1 = mse(y_test, pred_cbc1)

In [None]:
print('Testing performance')
print('RMSE: {:.2f}'.format(rmse_cbc1))
print('MSE: {:.2f}'.format(mse_cbc1))
print('R2: {:.2f}'.format(r2_cbc1))

### WITH POOL

In [None]:
train_dataset_cbc = Pool(X_train, y_train) 
test_dataset_cbc = Pool(X_test, y_test)

In [None]:
model_cbc2 = CatBoostRegressor(loss_function='RMSE', random_seed=42)

In [None]:
grid = {'iterations': [100, 150, 200],
        'learning_rate': [0.03, 0.1],
        'depth': [2, 4, 6, 8],
        'l2_leaf_reg': [0.2, 0.5, 1, 3]}

In [None]:
# Grid-search over `grid` on the training Pool built in this section.
# (`train_dataset` is only defined further down the notebook, so referencing it
# here fails on a fresh kernel.)
model_cbc2.grid_search(grid, train_dataset_cbc)

In [None]:
pred_cbc2 = model_cbc2.predict(X_test)
rmse_cbc2 = (np.sqrt(mse(y_test, pred_cbc2)))
r2_cbc2 = r2_score(y_test, pred_cbc2)
print('Testing performance')
print('RMSE: {:.2f}'.format(rmse_cbc2))
print('R2: {:.2f}'.format(r2_cbc2))

In [None]:
model_cbc2_params = {'depth': 8,
  'iterations': 150,
  'learning_rate': 0.1,
  'l2_leaf_reg': 0.5}

In [None]:
print(model_cbc2_params)

### WITH CV

In [None]:
model_cbc3 = CatBoostRegressor(random_seed=42)

In [None]:
# Search grid for CatBoost's grid_search: every entry must be a list of candidate values.
# Fixed settings (loss_function='RMSE', eval_metric='R2', cat_features) are not
# search dimensions and were invalid as scalar/string grid entries; configure
# them on the estimator itself if they are needed.
grid = {'iterations': [100, 150, 200],
        'learning_rate': [0.03, 0.1, 0.3],
        'depth': [4, 6, 8, 10, 12],
        'l2_leaf_reg': [0.2, 0.5, 1, 3],
        }

In [None]:
# Grid-search on the training Pool defined in the "WITH POOL" section.
# (`train_dataset` is only created in the following cell, so using it here
# fails on a fresh kernel.)
model_cbc3.grid_search(grid, train_dataset_cbc)

In [None]:
train_dataset = Pool(X_train, y_train) 
test_dataset = Pool(X_test, y_test)

In [None]:
# Fixed CatBoost settings for the cross-validated grid search
params_cv = {'loss_function':'RMSE',
          'eval_metric':'R2',
          'cat_features': cat_features, 
          'early_stopping_rounds': 200,
          'verbose': 200,
          'random_seed': 42
         }
# Build the estimator from params_cv (the dict defined just above) rather than
# the unrelated global `params`.
# NOTE(review): cat_features is set on the model while the Pool later passed to
# grid_search is built without cat_features - confirm CatBoost accepts this.
model_cv = CatBoostRegressor(**params_cv)

In [None]:
# Search grid for CatBoost's grid_search.
# Tree depth is capped at 16 by CatBoost, so the former candidates 50 and 100
# were invalid; the grid now stays within the supported range.
grid = {'iterations': [100, 150, 200],
        'learning_rate': [0.001, 0.03, 0.1],
        'depth': [2, 4, 6, 8, 10],
        'l2_leaf_reg': [0.2, 0.5, 1, 3]}

In [None]:
res = model_cv.grid_search(grid, train_dataset)

In [None]:
res

In [None]:
params_final = {'loss_function':'RMSE',
          'eval_metric':'R2',
          'cat_features': cat_features, 
          'early_stopping_rounds': 200,
          'verbose': 200,
          'random_seed': 42,
          'max_depth' : 8,
          'iterations' : 150,
          'learning_rate' : 0.1,
          'l2_leaf_reg': 0.5
         }

In [None]:
# Build the tuned model from params_final (which carries the selected depth,
# iterations, learning_rate and l2_leaf_reg), not the untuned baseline `params`.
cb_model_tuned = CatBoostRegressor(**params_final)

In [None]:
cb_model_tuned.fit(X_train, y_train,
                   eval_set=(X_test, y_test))

In [None]:
pred_z = cb_model_tuned.predict(X_test)

In [None]:
rmse_z = (np.sqrt(mse(y_test, pred_z)))
r2_z = r2_score(y_test, pred_z)
print('Testing performance')
print('RMSE: {:.2f}'.format(rmse_z))
print('R2: {:.2f}'.format(r2_z))

In [None]:
# Evaluate the grid-searched CatBoost model on the test split.
# The original cell referenced `model`, which is never defined in this notebook.
# NOTE(review): model_cv (the estimator refitted by grid_search above) is
# presumably the intended model - confirm.
pred = model_cv.predict(X_test)
rmse = (np.sqrt(mse(y_test, pred)))
r2 = r2_score(y_test, pred)
print('Testing performance')
print('RMSE: {:.2f}'.format(rmse))
print('R2: {:.2f}'.format(r2))

In [None]:
y_test.dtypes

In [None]:
X_train.head()

### SOMETHING ELSE WITH CV

In [None]:
train_data = Pool(data=X_train,
                  label=y_train,
                  cat_features=cat_features
                 )

test_data = Pool(data=X_test,
                  label=y_test,
                  cat_features=cat_features
                 )

In [None]:
# CatBoost settings; cat_features is omitted because the Pool objects passed to
# fit() already carry the categorical-feature information.
params = {'loss_function':'RMSE',
          'eval_metric':'R2',
          'early_stopping_rounds': 200,
          'verbose': 200,
          'random_seed': 42
         }

# RMSE / R2 are regression settings, so the regressor is used here
# (CatBoostClassifier is also not imported in this notebook).
cbc_7 = CatBoostRegressor(**params)
cbc_7.fit(train_data, # instead of X_train, y_train
          eval_set=test_data, # the Pool built above, instead of (X_valid, y_valid)
          use_best_model=True, 
          plot=True
         );

In [None]:
#print(cbc_7.get_all_params())

In [None]:
'''
params = {'loss_function':'RMSE',
          'eval_metric':'R2',
          'verbose': 200,
          'random_seed': 42
         }

all_train_data = Pool(data=X_train,
                      label=y_train,
                      cat_features=cat_features
                     )

scores = cv(
   params=params,
   dtrain=train_data,
   iterations=None,
   num_boost_round=None,
   fold_count=5,
   nfold=None,
   inverted=False,
   partition_random_seed=0,
   seed=42,
   shuffle=True,
   stratified=None,
   as_pandas=True,
   metric_period=None,
   verbose=None,
   verbose_eval=None,
   plot=True,
   early_stopping_rounds=200,
   folds=None,
   type='Classical',
   return_models=False)
'''

In [None]:
#scores

In [None]:
cbc_7.get_feature_importance(prettified=True)

In [None]:
from sklearn.inspection import permutation_importance

In [None]:
df_ohe_features = df_ohe.drop(columns=['Conf_Cases', 'new_case_percent_pop*'])

In [None]:
# Horizontal bar chart of the tuned CatBoost model's feature importances,
# sorted ascending so the most important feature is drawn at the top.
# `model` was never defined in this notebook; cb_model_tuned is the fitted,
# tuned CatBoost regressor.
sorted_feature_importance = cb_model_tuned.feature_importances_.argsort()
plt.barh(df_ohe_features.columns[sorted_feature_importance], 
        cb_model_tuned.feature_importances_[sorted_feature_importance], 
        color='turquoise')
plt.xlabel('CatBoost Feature Importance')

## Shap value plot

Optimized regression model SHAP value plots:
    * Each dot represents one feature's SHAP value for one sample
    * Right $\Rightarrow$ feature increased the predicted value
    * Left $\Rightarrow$ feature decreased the predicted value
    * Red $\Rightarrow$ high feature value
    * Blue $\Rightarrow$ low feature value

for more info on shap plots and how to interpret: [this article](https://towardsdatascience.com/explain-your-model-with-the-shap-values-bc36aac4de3d)

In [None]:
# SHAP summary for the tuned CatBoost model on the test split.
# `model` was never defined in this notebook; cb_model_tuned is used instead.
explainer = shap.TreeExplainer(cb_model_tuned)
shap_values = explainer.shap_values(X_test)
# SHAP value columns follow X_test's column order, so the labels must be the
# columns in their original order - not re-ordered by feature importance.
shap.summary_plot(shap_values, X_test, feature_names = list(X_test.columns))

***

In [None]:
#load dataframe OHE
#parse datetime column
df_cb = pd.read_csv('ohe_data2.csv', parse_dates=['date'])
df_cb.drop(columns='Unnamed: 0', inplace=True)
df_cb.set_index('date', inplace=True)

In [None]:
df_cb.drop(columns=['Conf_Cases'], inplace=True)

In [None]:
# Create features and target from df_cb, the frame loaded from 'ohe_data2.csv'
# in this section (with 'Conf_Cases' already dropped). The original cell read
# from df_ohe, which left df_cb unused and kept 'Conf_Cases' in the features.
X2, y2 = df_cb.drop(columns=['new_case_percent_pop*']), df_cb['new_case_percent_pop*']

In [None]:
# Sort y
y_sorted2 = sorted(y2)

In [None]:
# Save your y values in a new ndarray, broken down by the bins created above.
y_binned2 = np.digitize(y2, bins)

In [None]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.2, stratify=y_binned2, random_state=42)

In [None]:
train_dataset2 = Pool(X_train2, y_train2) 
test_dataset2 = Pool(X_test2, y_test2)

In [None]:
ohe_all_model = CatBoostRegressor(loss_function='RMSE', random_seed=42)

In [None]:
ohe_all_model.grid_search(grid, train_dataset2, cv=5)

In [None]:
# Evaluate the grid-searched model on its own test split
pred2 = ohe_all_model.predict(X_test2)
rmse = (np.sqrt(mse(y_test2, pred2)))
r2_2 = r2_score(y_test2, pred2)
print('Testing performance')
print('RMSE: {:.2f}'.format(rmse))
# Report this model's R2 (r2_2), not the stale `r2` left over from an earlier cell
print('R2: {:.2f}'.format(r2_2))

In [None]:
# Horizontal bar chart of ohe_all_model's feature importances, sorted ascending.
# Labels come from X_train2.columns - the columns the model was trained on - so
# that each importance is paired with the right feature name.
sorted_feature_importance = ohe_all_model.feature_importances_.argsort()
plt.barh(X_train2.columns[sorted_feature_importance], 
        ohe_all_model.feature_importances_[sorted_feature_importance], 
        color='turquoise')
plt.xlabel('CatBoost Feature Importance')

In [None]:
# SHAP summary for ohe_all_model, the model trained on the X2 feature set.
# (`model` is undefined in this notebook and X_test2 belongs to ohe_all_model.)
explainer2 = shap.TreeExplainer(ohe_all_model)
shap_values2 = explainer2.shap_values(X_test2)
# Labels must follow X_test2's column order, not the importance-sorted order
shap.summary_plot(shap_values2, X_test2, feature_names = list(X_test2.columns))

## For catboost use: 'model'

****

#### Voting Regressor

In [None]:
# Unfitted base estimators for the VotingRegressor.
# max_features=1.0 (all features) replaces the deprecated 'auto' alias, and
# random_state is fixed so the ensemble's metrics are reproducible.

# RandomForestRegressor
rf_reg = RandomForestRegressor(bootstrap = True,
                             max_depth = 70,
                             max_features = 1.0,
                             min_samples_leaf = 2,
                             min_samples_split = 2,
                             n_estimators = 600,
                             random_state = 42
)

# ExtraTreesRegressor
et_reg = ExtraTreesRegressor(n_estimators = 1700,
                             min_samples_split = 3,
                             min_samples_leaf = 1,
                             max_features = 1.0,
                             max_depth = 70,
                             bootstrap = True,
                             random_state = 42)

# CatBoostRegressor: an unfitted estimator built from the tuned parameters in
# params_final. (`model`, used originally, is never defined in this notebook.)
cb_reg = CatBoostRegressor(**params_final)

In [None]:
# Create an unweighted VotingRegressor over the three tuned base regressors
# (no `weights` argument is passed, so none of the models is favoured)

clf_voting = VotingRegressor(estimators = [('rf_est', rf_reg), ('et_est', et_reg),
                                           ('cb_est', cb_reg)])
# Fit on the training split and predict on the test split

clf_voting.fit(X_train, y_train)
pred_voting = clf_voting.predict(X_test)

In [None]:
print('Unweighted VotingReg R2: ', r2_score(y_test, pred_voting))
print('Unweighted VotingReg MSE: ', mse(y_test, pred_voting))
print('Unweighted VotingReg RMSE: ', np.sqrt(mse(y_test, pred_voting)))

In [None]:
# Create empty lists that will store the different weights and their scores

weights1 = []
weights2 = []
weights3 = []
scores = []

# The fitted base estimators do not depend on the voting weights, so fit the
# ensemble once and cache each estimator's test predictions instead of
# refitting all three models for every one of the 9 x 9 x 9 weight combinations.
reg_voting = VotingRegressor(estimators = [('rf_est', rf_reg), ('et_est', et_reg),
                                           ('cb_est', cb_reg)])
reg_voting.fit(X_train, y_train)
# transform() returns one column of predictions per fitted estimator
base_preds = reg_voting.transform(X_test)

# Evaluate different combinations of weights.
# NOTE(review): weights are scored on the test split, so the best score found
# here is an optimistic estimate; a separate validation split would be safer.
weight_grid = np.arange(0.1,1, 0.1)
for i in weight_grid:
    for j in weight_grid:
        for k in weight_grid:
            # Weighted average of the cached per-estimator predictions
            pred = np.average(base_preds, axis=1, weights=[i, j, k])
            score = r2_score(y_test, pred)
            scores.append(score)
            weights1.append(i)
            weights2.append(j)
            weights3.append(k)

In [None]:
# Save the results in a data frame

test_scores = pd.DataFrame({
    'Weight1': weights1,
    'Weight2': weights2,
    'Weight3': weights3,
    'Test Score': scores,
})

# Create an additional column to save the sum of all the weights

test_scores['sum_weights'] = test_scores['Weight1'].add(test_scores['Weight2']).add(test_scores['Weight3'])

# Keep only the rows whose weights sum to one. The weights are floats built
# with np.arange, so sums such as 0.1 + 0.2 + 0.7 can differ from 1 by a
# rounding error; np.isclose avoids dropping those rows.

condition = np.isclose(test_scores['sum_weights'], 1)

test_scores = test_scores.loc[condition]

In [None]:
# Sort the values to see the different test scores depending on the weights
test_scores.sort_values(by = 'Test Score', ascending = False).head(15)

In [None]:
# Create the final VotingRegressor with fixed weights of 0.2 (random forest),
# 0.4 (extra trees) and 0.4 (CatBoost).
# NOTE(review): these weights are hard-coded - presumably taken from the weight
# search above; confirm they match its best row.

reg_voting = VotingRegressor(estimators = [('rf_est', rf_reg), ('et_est', et_reg),
                                           ('cb_est', cb_reg)], weights = [0.2, 0.4, 0.4])

# Fit on the training split and predict on the test split

reg_voting.fit(X_train, y_train)
pred_voting = reg_voting.predict(X_test)

In [None]:
R2_final = r2_score(y_test, pred_voting)
MSE_final = mse(y_test, pred_voting)
RMSE_final = np.sqrt(mse(y_test, pred_voting))

In [None]:
print('Final VotingRegressor R2: ', R2_final)
print('Final VotingRegressor MSE: ', MSE_final)
print('Final VotingRegressor RMSE: ', RMSE_final)

## Feature importance

In [None]:
X_train

In [None]:
# VotingRegressor has no feature_importances_ attribute of its own, so combine
# the importances of its fitted base estimators. Each estimator's importances
# are first rescaled to sum to 1 (the libraries may report them on different
# scales) and then averaged using the ensemble's voting weights.
vote_weights = (np.ones(len(reg_voting.estimators_))
                if reg_voting.weights is None
                else np.asarray(reg_voting.weights, dtype=float))
per_model_importances = []
for est in reg_voting.estimators_:
    importances = np.asarray(est.feature_importances_, dtype=float)
    per_model_importances.append(importances / importances.sum())
voting_importances = np.average(per_model_importances, axis=0, weights=vote_weights)

for i, item in enumerate(voting_importances):
    print("{0:s}: {1:.2f}".format(X.columns[i], item))