# Training (ML models)

In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error, make_scorer

from sklearn.model_selection import TimeSeriesSplit, KFold
from sklearn.model_selection import cross_val_score
from sklearn.base import clone

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor 
from sklearn.linear_model import LinearRegression 
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.svm import SVR

import lightgbm as lgb

import pickle
import time

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
df_new = pd.read_csv('train_states.csv', index_col=0)

In [4]:
df_new

Unnamed: 0,date,cases_new,state_id,cases_recovered,cases_death,cases_active,checkins,unique_ind,cumul_full,pop,percent_vax,temp
0,2021-02-24,318,2,518,0,6156,2074008,927705,0,3781000,0.00,25.80
1,2021-02-24,17,3,187,0,578,817989,413976,0,2185100,0.00,28.80
2,2021-02-24,53,4,100,0,698,278048,150824,0,1906700,0.00,25.70
3,2021-02-24,26,5,37,0,363,576245,259793,0,932700,0.00,27.50
4,2021-02-24,1392,6,119,0,2208,639575,304428,0,1128800,0.00,27.50
...,...,...,...,...,...,...,...,...,...,...,...,...
4875,2021-12-25,802,11,1151,6,12390,7272213,3243895,4759977,6538000,0.73,28.35
4876,2021-12-25,183,12,111,1,2453,496077,260964,877237,1259300,0.70,27.05
4877,2021-12-25,159,15,326,2,3482,5404399,2379561,3019764,1773700,1.70,28.70
4878,2021-12-25,2,16,10,0,62,71513,28947,79555,99600,0.80,27.70


In [5]:
def prepare_death_data(state):
    # prepare the columns
    columns = ['cases_new','cases_active','cases_death']
    df = df_new[df_new['state_id']==state]
    df = df[columns]
    
    # transform the time series data
    n_days = 5
    column_arrangement = []
    for i in range(n_days):
        shifted_df = df.shift(i+1)
        df['cn_'+str(i+1)] = shifted_df['cases_new']      
        column_arrangement.append('cn_'+str(i+1))
    
    # averaging the number of cases
    df_cases_new = df['cases_new']
    df['cases_avg'] = 0
    for i in range(n_days):
        df['cases_avg'] += df_cases_new.shift(i+1)
    df['cases_avg'] = df['cases_avg']/n_days
    
    # averaging the temperature
    df_temp = df['cases_death']
    df['cases_death_avg'] = 0
    for i in range(n_days):
        df['cases_death_avg'] += df_temp.shift(i+1)
    df['cases_death_avg'] = df['cases_death_avg']/n_days
    
    # averaging the active cases
    df_cases_active = df['cases_active']
    df['cases_active_avg'] = 0
    for i in range(n_days):
        df['cases_active_avg'] += df_cases_active.shift(i+1)
    df['cases_active_avg'] = df['cases_active_avg']/n_days
    
    df = df.dropna()
    
    # remove unused columns
    df = df.drop(columns=['cases_new', 'cases_death', 'cases_active'])
    
    # rearrange columns
    column_arrangement += ['cases_active_avg']  
    
    # prepare test-train data
    target = 'cases_death_avg'    
    y = df[target]
    X = df[column_arrangement]
    
    return X, y

In [6]:
state = 11
X_train, y_train = prepare_death_data(state)

In [7]:
X_train.iloc[0]

cn_1                 914.0
cn_2                 862.0
cn_3                 633.0
cn_4                 666.0
cn_5                 581.0
cases_active_avg    9313.8
Name: 91, dtype: float64

In [8]:
def train(model, X, y):
    models = []
    scores = []
    time_train = []    
    time_predict = []
    split_method = TimeSeriesSplit(n_splits=10)
    #split_method = KFold(n_splits=10)
    
    for train_index, test_index in split_method.split(X):
        model_new = clone(model)
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        t0 = time.time()
        model_new.fit(X_train, y_train)
        time_train.append(time.time()-t0)
        
        t1 = time.time()
        result = mean_squared_error(y_test, model_new.predict(X_test))**0.5
        time_predict.append(time.time()-t1)
        
        models.append(model_new)
        scores.append(result)
        
    return models, scores, time_train, time_predict

In [9]:
# setting the seed to fix the random output
seed=123

## 6.3 Decision Tree Regressor

In [10]:
decision_tree = DecisionTreeRegressor()
dt_models, dt_results, dt_time_train, dt_time_predict = train(decision_tree, X_train, y_train)

In [11]:
print('results:', np.array(dt_results),'\n')
print('training time:', np.array(dt_time_train),'\n')
print('predicting time:', np.array(dt_time_predict))

results: [ 1.0139764   6.5511096   7.4186951  14.00883319 26.80876032 37.92903705
 85.72417439  6.91461148  5.15119547  2.59458125] 

training time: [0.00200033 0.00199986 0.00200248 0.00199914 0.00200009 0.00200248
 0.00099993 0.00199914 0.00200009 0.00300026] 

predicting time: [0.00200033 0.00299931 0.00199866 0.00200081 0.00099969 0.00199652
 0.00200152 0.00200009 0.00200009 0.00100017]


## 6.4 Random Forest

In [12]:
random_forest = RandomForestRegressor(n_estimators=1000)
rf_models, rf_results, rf_time_train, rf_time_predict = train(random_forest, X_train, y_train)

In [13]:
print('results:', np.array(rf_results),'\n')
print('training time:', np.array(rf_time_train),'\n')
print('predicting time:', np.array(rf_time_predict))

results: [ 0.94599162  6.56401003  7.86847896 13.61044751 27.5586345  37.93636231
 82.63351812 13.04394162  4.45686413  2.29111448] 

training time: [1.28945446 1.06207895 1.10108161 1.13908505 1.20708871 1.28509593
 1.34109926 1.39610338 1.47110963 1.55311251] 

predicting time: [0.07400584 0.0710063  0.06900501 0.07100534 0.07100511 0.07100558
 0.07500601 0.06900525 0.06900597 0.07300496]


## 6.5 BaggingRegressor

In [14]:
bagging = BaggingRegressor()
bagging_models, bagging_results, bagging_time_train, bagging_time_predict = train(bagging, X_train, y_train)

In [15]:
print('results:', np.array(bagging_results),'\n')
print('training time:', np.array(bagging_time_train),'\n')
print('predicting time:', np.array(bagging_time_predict))

results: [ 0.96460316  6.4828126   7.90365316 13.31640982 27.56464402 38.85734899
 86.20920566 10.55840967  4.10637445  2.49948735] 

training time: [0.01900053 0.01399994 0.01499987 0.01500106 0.02100134 0.0160017
 0.01800132 0.01800084 0.02100205 0.0220015 ] 

predicting time: [0.00299931 0.0030005  0.00200057 0.00399995 0.00200033 0.0030005
 0.00399995 0.00300026 0.00200009 0.00200129]


## 6.6 ExtraTreesRegressor

In [16]:
etr = ExtraTreesRegressor()
etr_models, etr_results, etr_time_train, etr_time_predict = train(etr, X_train, y_train)

In [17]:
print('results:', np.array(etr_results),'\n')
print('training time:', np.array(etr_time_train),'\n')
print('predicting time:', np.array(etr_time_predict))

results: [ 0.86428502  6.49735938  7.31559765 12.17750077 27.2113162  38.70369654
 80.94272893  7.56382857  4.78878055  2.68597364] 

training time: [0.07300282 0.07700562 0.07800913 0.0790062  0.0870049  0.08700562
 0.09600711 0.09900928 0.09800458 0.10400915] 

predicting time: [0.01100063 0.00800228 0.00899816 0.00900149 0.01000047 0.00800061
 0.01000166 0.00900149 0.00800085 0.00899935]


## Linear Regression

In [18]:
lr = LinearRegression()
lr_models, lr_results, lr_time_train, lr_time_predict = train(lr, X_train, y_train)

In [19]:
print('results:', np.array(lr_results),'\n')
print('training time:', np.array(lr_time_train),'\n')
print('predicting time:', np.array(lr_time_predict))

results: [ 1.03455131  4.644456    3.12049416  9.50582916 27.09458108 37.09515872
 78.02176287 19.84608547 15.843691   19.43218753] 

training time: [0.00199747 0.00199962 0.00200081 0.00199866 0.00100064 0.00099826
 0.00100017 0.00200033 0.00099993 0.00099969] 

predicting time: [0.00200248 0.00099945 0.00100017 0.00099969 0.00200272 0.00099993
 0.00200009 0.00100017 0.00100017 0.0010004 ]


## MLPRegressor

In [20]:
mlpr = MLPRegressor(hidden_layer_sizes=(100,50),solver = 'lbfgs')
mlpr_models, mlpr_results, mlpr_time_train, mlpr_time_predict = train(mlpr, X_train, y_train)

In [21]:
print('results:', np.array(mlpr_results),'\n')
print('training time:', np.array(mlpr_time_train),'\n')
print('predicting time:', np.array(mlpr_time_predict))

results: [ 1.78689914  7.3665087  14.48049617 12.38267864 16.17559263 35.99744396
 76.1341552  11.7012398   8.71340219 16.0415673 ] 

training time: [0.08700562 0.21701574 0.27002096 0.12100983 0.26101971 0.29702306
 0.34802866 0.25002027 0.37702918 0.35302854] 

predicting time: [0.00099993 0.00100112 0.00199866 0.00200009 0.00099874 0.00199914
 0.0009973  0.00199866 0.00099921 0.00099826]


## KNeighborsRegressor

In [22]:
knr = KNeighborsRegressor()
knr_models, knr_results, knr_time_train, knr_time_predict = train(knr, X_train, y_train)

In [23]:
print('results:', np.array(knr_results),'\n')
print('training time:', np.array(knr_time_train),'\n')
print('predicting time:', np.array(knr_time_predict))

results: [ 0.9381779   7.06249772  7.16428591 12.52009703 28.21855129 33.92362926
 81.24478677  7.48557526  4.61425167  2.80847395] 

training time: [0.00200081 0.00099993 0.00100231 0.00200224 0.00100255 0.00099874
 0.002002   0.0010004  0.00099993 0.00100017] 

predicting time: [0.00199747 0.00199986 0.00299931 0.00199842 0.00199652 0.00200033
 0.00199771 0.00199842 0.00200009 0.00200224]


## SVR

In [24]:
svr = SVR(gamma='auto')
svr_models, svr_results, svr_time_train, svr_time_predict = train(svr, X_train, y_train)

In [25]:
print('results:', np.array(svr_results),'\n')
print('training time:', np.array(svr_time_train),'\n')
print('predicting time:', np.array(svr_time_predict))

results: [ 0.47762144  7.93014774 21.19302646 32.4377181  64.65618472 80.92927826
 95.4789565  18.76304375 15.77546149 11.04770799] 

training time: [0.00200057 0.002002   0.00200248 0.00199914 0.00399852 0.00400233
 0.00399876 0.0050025  0.00700021 0.00600076] 

predicting time: [0.00299859 0.00200009 0.00199795 0.00200152 0.00200057 0.00199699
 0.00300074 0.00299764 0.00200009 0.00300026]


## Result Summary

In [26]:
df_result = pd.DataFrame(columns=[1,2,3,4,5,6,7,8,9,10])
df_result.loc['Decision Tree Regressor'] = dt_results
df_result.loc['Random Forest'] = rf_results
df_result.loc['BaggingRegressor'] = bagging_results
df_result.loc['ExtraTreesRegressor'] = etr_results
df_result.loc['Linear Regression'] = lr_results
df_result.loc['MLPRegressor'] = mlpr_results
df_result.loc['KNeighborsRegressor'] = knr_results
df_result.loc['SVR'] = svr_results

df_result.to_csv('death_results.csv')

df_result

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
Decision Tree Regressor,1.013976,6.55111,7.418695,14.008833,26.80876,37.929037,85.724174,6.914611,5.151195,2.594581
Random Forest,0.945992,6.56401,7.868479,13.610448,27.558635,37.936362,82.633518,13.043942,4.456864,2.291114
BaggingRegressor,0.964603,6.482813,7.903653,13.31641,27.564644,38.857349,86.209206,10.55841,4.106374,2.499487
ExtraTreesRegressor,0.864285,6.497359,7.315598,12.177501,27.211316,38.703697,80.942729,7.563829,4.788781,2.685974
Linear Regression,1.034551,4.644456,3.120494,9.505829,27.094581,37.095159,78.021763,19.846085,15.843691,19.432188
MLPRegressor,1.786899,7.366509,14.480496,12.382679,16.175593,35.997444,76.134155,11.70124,8.713402,16.041567
KNeighborsRegressor,0.938178,7.062498,7.164286,12.520097,28.218551,33.923629,81.244787,7.485575,4.614252,2.808474
SVR,0.477621,7.930148,21.193026,32.437718,64.656185,80.929278,95.478956,18.763044,15.775461,11.047708


In [27]:
df_time_train = pd.DataFrame(columns=[1,2,3,4,5,6,7,8,9,10])
df_time_train.loc['Decision Tree Regressor'] = dt_time_train
df_time_train.loc['Random Forest'] = rf_time_train
df_time_train.loc['BaggingRegressor'] = bagging_time_train
df_time_train.loc['ExtraTreesRegressor'] = etr_time_train
df_time_train.loc['Linear Regression'] = lr_time_train
df_time_train.loc['MLPRegressor'] = mlpr_time_train
df_time_train.loc['KNeighborsRegressor'] = knr_time_train
df_time_train.loc['SVR'] = svr_time_train

df_time_train.to_csv('death_train_time.csv')


df_time_train

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
Decision Tree Regressor,0.002,0.002,0.002002,0.001999,0.002,0.002002,0.001,0.001999,0.002,0.003
Random Forest,1.289454,1.062079,1.101082,1.139085,1.207089,1.285096,1.341099,1.396103,1.47111,1.553113
BaggingRegressor,0.019001,0.014,0.015,0.015001,0.021001,0.016002,0.018001,0.018001,0.021002,0.022002
ExtraTreesRegressor,0.073003,0.077006,0.078009,0.079006,0.087005,0.087006,0.096007,0.099009,0.098005,0.104009
Linear Regression,0.001997,0.002,0.002001,0.001999,0.001001,0.000998,0.001,0.002,0.001,0.001
MLPRegressor,0.087006,0.217016,0.270021,0.12101,0.26102,0.297023,0.348029,0.25002,0.377029,0.353029
KNeighborsRegressor,0.002001,0.001,0.001002,0.002002,0.001003,0.000999,0.002002,0.001,0.001,0.001
SVR,0.002001,0.002002,0.002002,0.001999,0.003999,0.004002,0.003999,0.005002,0.007,0.006001


In [28]:
df_time_predict = pd.DataFrame(columns=[1,2,3,4,5,6,7,8,9,10])
df_time_predict.loc['Decision Tree Regressor'] = dt_time_predict
df_time_predict.loc['Random Forest'] = rf_time_predict
df_time_predict.loc['BaggingRegressor'] = bagging_time_predict
df_time_predict.loc['ExtraTreesRegressor'] = etr_time_predict
df_time_predict.loc['Linear Regression'] = lr_time_predict
df_time_predict.loc['MLPRegressor'] = mlpr_time_predict
df_time_predict.loc['KNeighborsRegressor'] = knr_time_predict
df_time_predict.loc['SVR'] = svr_time_predict

df_time_predict.to_csv('death_predict_time.csv')

df_time_predict

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
Decision Tree Regressor,0.002,0.002999,0.001999,0.002001,0.001,0.001997,0.002002,0.002,0.002,0.001
Random Forest,0.074006,0.071006,0.069005,0.071005,0.071005,0.071006,0.075006,0.069005,0.069006,0.073005
BaggingRegressor,0.002999,0.003,0.002001,0.004,0.002,0.003,0.004,0.003,0.002,0.002001
ExtraTreesRegressor,0.011001,0.008002,0.008998,0.009001,0.01,0.008001,0.010002,0.009001,0.008001,0.008999
Linear Regression,0.002002,0.000999,0.001,0.001,0.002003,0.001,0.002,0.001,0.001,0.001
MLPRegressor,0.001,0.001001,0.001999,0.002,0.000999,0.001999,0.000997,0.001999,0.000999,0.000998
KNeighborsRegressor,0.001997,0.002,0.002999,0.001998,0.001997,0.002,0.001998,0.001998,0.002,0.002002
SVR,0.002999,0.002,0.001998,0.002002,0.002001,0.001997,0.003001,0.002998,0.002,0.003
