# Training (ML models)

In [117]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error, make_scorer

from sklearn.model_selection import TimeSeriesSplit, KFold
from sklearn.model_selection import cross_val_score
from sklearn.base import clone

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor 
from sklearn.linear_model import LinearRegression 
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.svm import SVR

import lightgbm as lgb

import pickle

In [2]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn diagnostics (e.g. chained-assignment
# or convergence warnings) raised by later cells — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [3]:
# Load the training table; the first CSV column becomes the row index.
df_new = pd.read_csv('train.csv', index_col=0)

In [4]:
# Preview the loaded table.
df_new

Unnamed: 0,date,cases_new,state_id,cases_recovered,cases_death,cases_active,checkins,unique_ind,cumul_full,pop,percent_vax
0,2021-02-24,3545,1,3331,12,30572,19689234,7518288,3,32657400,0.00
1,2021-02-25,1924,1,3752,13,28738,20130990,7666194,5,32657400,0.00
2,2021-02-26,2253,1,3085,10,27903,20975140,7806991,8,32657400,0.00
3,2021-02-27,2364,1,3320,10,26937,22978495,7891208,14,32657400,0.00
4,2021-02-28,2437,1,3251,9,26118,22162834,7669665,19,32657400,0.00
...,...,...,...,...,...,...,...,...,...,...,...
5180,2021-12-25,802,11,1151,6,12390,7272213,3243895,4759977,6538000,0.73
5181,2021-12-25,183,12,111,1,2453,496077,260964,877237,1259300,0.70
5182,2021-12-25,159,15,326,2,3482,5404399,2379561,3019764,1773700,1.70
5183,2021-12-25,2,16,10,0,62,71513,28947,79555,99600,0.80


In [5]:
# Restrict the data to a single state and to the columns used for modelling.
state = 17
columns = ['cases_new','cases_active','checkins','unique_ind','percent_vax']
# One .loc selection (rows and columns together) followed by .copy(), so that `df` is an
# independent frame: later cells add columns to it, which must not be a write into a
# view of df_new.
df = df_new.loc[df_new['state_id'] == state, columns].copy()
df

Unnamed: 0,cases_new,cases_active,checkins,unique_ind,percent_vax
320,1,51,195780,113866,0.00
336,2,48,204087,118093,0.00
352,0,40,242938,136399,0.00
368,4,41,263675,142624,0.00
384,3,41,276508,151866,0.00
...,...,...,...,...,...
5120,35,464,274414,164392,1.33
5136,20,459,269936,162751,1.33
5152,29,455,275508,165831,1.33
5168,23,424,305446,180501,1.33


In [6]:
# Number of past rows turned into lag features.
n_days = 10

# For every lag k in 1..n_days add cn_k (cases_new k rows earlier) and ci_k (checkins
# k rows earlier). Only the two source columns are shifted, and all new columns are
# attached in a single assign() instead of being written one by one into `df`.
# NOTE(review): shift() is positional, so this assumes one row per consecutive day in
# chronological order — confirm against the source data.
lagged = {}
for lag in range(1, n_days + 1):
    lagged['cn_' + str(lag)] = df['cases_new'].shift(lag)
    lagged['ci_' + str(lag)] = df['checkins'].shift(lag)

# The first n_days rows have incomplete history (NaN lags) and are dropped.
df = df.assign(**lagged).dropna()


In [7]:
# Remove the same-day 'checkins' and 'unique_ind' columns from the table.
df = df.drop(columns=['checkins', 'unique_ind'])

In [8]:
# Preview the feature table.
df

Unnamed: 0,cases_new,cases_active,percent_vax,cn_1,ci_1,cn_2,ci_2,cn_3,ci_3,cn_4,...,cn_6,ci_6,cn_7,ci_7,cn_8,ci_8,cn_9,ci_9,cn_10,ci_10
480,2,39,0.00,4.0,257609.0,7.0,218576.0,3.0,215573.0,1.0,...,3.0,276508.0,4.0,263675.0,0.0,242938.0,2.0,204087.0,1.0,195780.0
496,2,39,0.00,2.0,267004.0,4.0,257609.0,7.0,218576.0,3.0,...,6.0,222965.0,3.0,276508.0,4.0,263675.0,0.0,242938.0,2.0,204087.0
512,1,28,0.00,2.0,281002.0,2.0,267004.0,4.0,257609.0,7.0,...,1.0,213292.0,6.0,222965.0,3.0,276508.0,4.0,263675.0,0.0,242938.0
528,3,24,0.00,1.0,231706.0,2.0,281002.0,2.0,267004.0,4.0,...,3.0,215573.0,1.0,213292.0,6.0,222965.0,3.0,276508.0,4.0,263675.0
544,0,23,0.00,3.0,230360.0,1.0,231706.0,2.0,281002.0,2.0,...,7.0,218576.0,3.0,215573.0,1.0,213292.0,6.0,222965.0,3.0,276508.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5120,35,464,1.33,30.0,274139.0,18.0,257589.0,35.0,236159.0,28.0,...,22.0,259230.0,27.0,265316.0,32.0,258194.0,15.0,270629.0,29.0,271727.0
5136,20,459,1.33,35.0,274414.0,30.0,274139.0,18.0,257589.0,35.0,...,25.0,267262.0,22.0,259230.0,27.0,265316.0,32.0,258194.0,15.0,270629.0
5152,29,455,1.33,20.0,269936.0,35.0,274414.0,30.0,274139.0,18.0,...,28.0,278291.0,25.0,267262.0,22.0,259230.0,27.0,265316.0,32.0,258194.0
5168,23,424,1.33,29.0,275508.0,20.0,269936.0,35.0,274414.0,30.0,...,35.0,236159.0,28.0,278291.0,25.0,267262.0,22.0,259230.0,27.0,265316.0


In [9]:
# Name of the column the models are trained to predict.
target = 'cases_new'

In [10]:
# Split the table into the target series and the feature frame (every other column).
# No hold-out split is made here; evaluation is done by cross-validation.
# NOTE(review): the features include same-day 'cases_active' and 'percent_vax' —
# confirm these are known at prediction time and do not leak the target.
y_train = df[target]
X_train = df.drop(columns=target)

In [61]:
# Random seed for reproducibility; it only takes effect where it is passed explicitly
# (e.g. random_state=seed) to an estimator or splitter.
seed = 1273

In [94]:
def train(model, X, y, cv=None):
    """Cross-validate ``model`` and return one fitted clone and one RMSE per fold.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn style estimator. It is cloned for every fold, so
        the object passed in is never fitted itself.
    X : pandas.DataFrame
        Feature table; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Target values aligned row-for-row with ``X``.
    cv : cross-validation splitter, optional
        Object whose ``split(X)`` yields ``(train_indices, test_indices)``
        pairs of positional indices. Defaults to ``KFold(n_splits=10)``.
        That default does not shuffle, so each fold is a contiguous block of
        rows and, for chronologically ordered rows, most folds are scored by a
        model that was also fitted on later rows. Pass e.g.
        ``TimeSeriesSplit(n_splits=10)`` to score only on rows that come after
        the training rows.

    Returns
    -------
    models : list
        The fitted clone of ``model`` for each fold, in split order.
    scores : list of float
        Root-mean-squared error of each clone on its held-out fold.
    """
    if cv is None:
        cv = KFold(n_splits=10)

    models = []
    scores = []

    for fit_idx, eval_idx in cv.split(X):
        # Fresh, unfitted copy so that folds never share learned state.
        fold_model = clone(model)
        X_fit, X_eval = X.iloc[fit_idx], X.iloc[eval_idx]
        y_fit, y_eval = y.iloc[fit_idx], y.iloc[eval_idx]
        fold_model.fit(X_fit, y_fit)

        # Square root of the MSE gives the RMSE, in the units of the target.
        rmse = mean_squared_error(y_eval, fold_model.predict(X_eval)) ** 0.5

        models.append(fold_model)
        scores.append(rmse)

    return models, scores

## 6.3 Decision Tree Regressor

In [103]:
# Decision tree with default hyper-parameters; random_state is fixed so that the
# per-fold scores are reproducible across runs.
decision_tree = DecisionTreeRegressor(random_state=seed)
dt_models, dt_results = train(decision_tree, X_train, y_train)

In [108]:
# One RMSE per cross-validation fold for the decision tree.
np.array(dt_results)

array([ 7.74381474,  9.20326029, 13.8552036 , 46.80562644, 30.30401514,
       40.05728656, 11.23878752, 14.05530946, 28.99881091, 19.67406835])

## 6.4 Random Forest

In [130]:
# Random forest of 10 trees; random_state is fixed so that the bootstrap samples and
# hence the per-fold scores are reproducible across runs.
random_forest = RandomForestRegressor(n_estimators=10, random_state=seed)
rf_models, rf_results = train(random_forest, X_train, y_train)

In [131]:
# One RMSE per cross-validation fold for the random forest.
np.array(rf_results)

array([ 6.76192773,  6.29102535, 10.87897054, 16.0369781 , 31.36030931,
       27.64238375, 11.70676197, 11.4620062 , 22.71856297, 15.97938327])

## 6.5 BaggingRegressor

In [111]:
# Bagging ensemble with default hyper-parameters; random_state is fixed so that the
# resampling and hence the per-fold scores are reproducible across runs.
bagging = BaggingRegressor(random_state=seed)
bagging_models, bagging_results = train(bagging, X_train, y_train)

In [112]:
# One RMSE per cross-validation fold for the bagging ensemble.
np.array(bagging_results)

array([ 7.58764346,  5.53133498,  9.78197322, 18.8031469 , 26.48229597,
       26.41445529, 11.54471069,  9.96383114, 22.35770329, 14.28149273])

## 6.6 ExtraTreesRegressor

In [113]:
# Extra-trees ensemble with default hyper-parameters; random_state is fixed so that
# the randomised splits and hence the per-fold scores are reproducible across runs.
etr = ExtraTreesRegressor(random_state=seed)
etr_models, etr_results = train(etr, X_train, y_train)

In [114]:
# One RMSE per cross-validation fold for the extra-trees ensemble.
np.array(etr_results)

array([ 6.09040831,  5.91046868, 10.46561385, 14.15727375, 30.44593695,
       25.72566926, 10.36218689, 10.92010294, 22.19589617, 13.0552085 ])

## Linear Regression

In [118]:
# Ordinary least-squares baseline with default settings, cross-validated via train().
lr = LinearRegression()
lr_models, lr_results = train(lr, X_train, y_train)

In [120]:
# One RMSE per cross-validation fold for the linear regression.
np.array(lr_results)

array([ 5.63023459,  6.23228354, 11.02249386,  8.23768149, 25.84349626,
       28.82823537,  8.61154818, 11.18586362, 21.83067374,  9.0560776 ])

## MLPRegressor

In [146]:
# Multi-layer perceptron trained with the L-BFGS solver; random_state is fixed so that
# the weight initialisation and hence the per-fold scores are reproducible across runs.
# NOTE(review): the features are fed in unscaled and the lag columns differ by several
# orders of magnitude (case counts vs. check-in counts) — this probably hurts the
# network; consider standardising the inputs.
mlpr = MLPRegressor(solver = 'lbfgs', random_state=seed)
mlpr_models, mlpr_results = train(mlpr, X_train, y_train)

In [147]:
# One RMSE per cross-validation fold for the MLP.
np.array(mlpr_results)

array([ 136.86999832,  121.07160438, 1276.41444409,  812.61411957,
         82.65364801,  102.75809199,   17.63404112,   31.22411292,
         82.90440034,   68.79227545])

## KNeighborsRegressor

In [133]:
# k-nearest-neighbours regressor with default settings, cross-validated via train().
# NOTE(review): the features are unscaled, so the large check-in lag columns presumably
# dominate the neighbour distances — consider standardising the inputs.
knr = KNeighborsRegressor()
knr_models, knr_results = train(knr, X_train, y_train)

In [134]:
# One RMSE per cross-validation fold for the k-nearest-neighbours regressor.
np.array(knr_results)

array([27.17712273, 22.60233026, 18.8625555 , 40.72594587, 43.16561903,
       23.95510168, 14.6130787 , 17.35014658, 32.98965355, 14.10380237])

## SVR

In [137]:
# Support vector regressor with gamma='auto', cross-validated via train().
# NOTE(review): the features are unscaled; kernel methods are usually sensitive to
# feature scale — consider standardising the inputs.
svr = SVR(gamma='auto')
svr_models, svr_results = train(svr, X_train, y_train)

In [138]:
# One RMSE per cross-validation fold for the support vector regressor.
np.array(svr_results)

array([23.60720229, 19.35458602, 12.63240084, 14.4359676 , 45.13053403,
       32.53147547,  7.8650008 , 12.0407321 , 33.7676697 , 15.58731041])

In [24]:
# Serialise the feature table to an in-memory bytes object (nothing is written to disk).
picklestring = pickle.dumps(df)

In [26]:
# Size of the pickled table in bytes.
len(picklestring)

57704