# Training (ML models)

In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error, make_scorer

from sklearn.model_selection import TimeSeriesSplit, KFold
from sklearn.model_selection import cross_val_score
from sklearn.base import clone

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor 
from sklearn.linear_model import LinearRegression 
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.svm import SVR

import lightgbm as lgb

import pickle
import time

In [2]:
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas/scikit-learn warnings raised by later cells.
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Load the per-state training table; the first CSV column becomes the index.
df_new = pd.read_csv('train_states.csv', index_col=0)

In [4]:
# Display the loaded table (pandas truncates the rendering to the first and last rows).
df_new

Unnamed: 0,date,cases_new,state_id,cases_recovered,cases_death,cases_active,checkins,unique_ind,cumul_full,pop,percent_vax,temp
0,2021-02-24,318,2,518,0,6156,2074008,927705,0,3781000,0.00,25.80
1,2021-02-24,17,3,187,0,578,817989,413976,0,2185100,0.00,28.80
2,2021-02-24,53,4,100,0,698,278048,150824,0,1906700,0.00,25.70
3,2021-02-24,26,5,37,0,363,576245,259793,0,932700,0.00,27.50
4,2021-02-24,1392,6,119,0,2208,639575,304428,0,1128800,0.00,27.50
...,...,...,...,...,...,...,...,...,...,...,...,...
4875,2021-12-25,802,11,1151,6,12390,7272213,3243895,4759977,6538000,0.73,28.35
4876,2021-12-25,183,12,111,1,2453,496077,260964,877237,1259300,0.70,27.05
4877,2021-12-25,159,15,326,2,3482,5404399,2379561,3019764,1773700,1.70,28.70
4878,2021-12-25,2,16,10,0,62,71513,28947,79555,99600,0.80,27.70


In [5]:
def prepare_cases_data(state, n_days=5, data=None):
    """Build the lagged feature matrix and the target series for one state.

    Rows of the input table whose ``state_id`` equals ``state`` are selected,
    and for every remaining row the following are derived from the
    ``n_days`` rows that precede it (the row itself is excluded):

    * ``cn_1`` .. ``cn_<n_days>``: ``cases_new`` lagged by 1 .. ``n_days`` rows
    * ``cases_active_avg``, ``checkins_avg``, ``temp_avg``: mean of the
      previous ``n_days`` values of ``cases_active``, ``checkins``, ``temp``
    * ``cases_avg`` (the target): mean of the previous ``n_days`` values of
      ``cases_new``

    ``percent_vax`` is passed through unchanged. Rows containing any NaN
    (in particular the first ``n_days`` rows, which lack a full history) are
    dropped.

    NOTE(review): the target ``cases_avg`` is exactly the mean of the
    ``cn_*`` columns returned in ``X``, i.e. a linear function of the
    features. Confirm whether a forward-looking target was intended.

    Args:
        state: Value matched against the ``state_id`` column.
        n_days: Number of preceding rows used for lags and averages
            (defaults to 5, the previously hard-coded window).
        data: Table to read from; defaults to the notebook-level ``df_new``.
            It must provide ``state_id``, ``cases_new``, ``cases_active``,
            ``checkins``, ``percent_vax`` and ``temp``. It is never modified.

    Returns:
        ``(X, y)``: the feature DataFrame and the target Series, both
        keeping the index labels of the selected rows.

    Raises:
        ValueError: If ``n_days`` is smaller than 1.
    """
    if n_days < 1:
        raise ValueError('n_days must be at least 1')

    source = df_new if data is None else data
    raw_columns = ['cases_new', 'cases_active', 'checkins', 'percent_vax', 'temp']

    # Work on an independent copy so that new columns are never written
    # through a filtered view of the source table.
    df = source.loc[source['state_id'] == state, raw_columns].copy()

    lags = range(1, n_days + 1)

    def trailing_mean(series):
        # Mean of the previous n_days values; NaN until a full window exists.
        return sum(series.shift(k) for k in lags) / n_days

    cases_new = df['cases_new']

    # Lagged daily case counts, most recent first.
    lag_names = ['cn_' + str(k) for k in lags]
    derived = {name: cases_new.shift(k) for name, k in zip(lag_names, lags)}

    # Trailing averages of the remaining signals.
    derived['cases_active_avg'] = trailing_mean(df['cases_active'])
    derived['checkins_avg'] = trailing_mean(df['checkins'])
    derived['temp_avg'] = trailing_mean(df['temp'])

    target = 'cases_avg'
    derived[target] = trailing_mean(cases_new)

    # Drop incomplete rows while the raw columns are still present, so a
    # missing raw value also removes the row.
    df = df.assign(**derived).dropna()

    feature_columns = lag_names + ['cases_active_avg', 'checkins_avg', 'temp_avg', 'percent_vax']

    y = df[target]
    X = df[feature_columns]

    return X, y

In [6]:
# state_id whose rows are used for every experiment below.
state = 11
# Lagged features (X_train) and trailing-average target (y_train) for that state.
X_train, y_train = prepare_cases_data(state)

In [7]:
# Inspect the first feature row to check the column order.
X_train.iloc[0]

cn_1                    914.00
cn_2                    862.00
cn_3                    633.00
cn_4                    666.00
cn_5                    581.00
cases_active_avg       9313.80
checkins_avg        4751988.00
temp_avg                 28.69
percent_vax               0.00
Name: 91, dtype: float64

In [8]:
def train(model, X, y, n_splits=10):
    """Evaluate ``model`` with time-ordered cross-validation.

    For every split produced by ``TimeSeriesSplit(n_splits)``, a fresh clone
    of ``model`` is fitted on the training rows and scored on the test rows
    with the root mean squared error.

    Args:
        model: Unfitted scikit-learn estimator; it is cloned per fold and
            never fitted itself.
        X: Feature table, indexed positionally with ``.iloc``.
        y: Target values aligned with ``X``, indexed with ``.iloc``.
        n_splits: Number of folds (defaults to 10, the previously
            hard-coded value).

    Returns:
        ``(models, scores, time_train, time_predict)``: four lists with one
        entry per fold, holding the fitted clone, its RMSE on the test rows,
        the seconds spent in ``fit`` and the seconds spent in ``predict``.
    """
    models = []
    scores = []
    time_train = []
    time_predict = []
    splitter = TimeSeriesSplit(n_splits=n_splits)

    for train_index, test_index in splitter.split(X):
        fold_model = clone(model)
        X_fit, X_eval = X.iloc[train_index], X.iloc[test_index]
        y_fit, y_eval = y.iloc[train_index], y.iloc[test_index]

        # perf_counter is the clock intended for measuring short durations.
        fit_started = time.perf_counter()
        fold_model.fit(X_fit, y_fit)
        time_train.append(time.perf_counter() - fit_started)

        # Time the prediction alone; the metric is computed after the clock
        # stops so it does not count towards the prediction time.
        predict_started = time.perf_counter()
        predictions = fold_model.predict(X_eval)
        time_predict.append(time.perf_counter() - predict_started)

        rmse = mean_squared_error(y_eval, predictions) ** 0.5

        models.append(fold_model)
        scores.append(rmse)

    return models, scores, time_train, time_predict

In [9]:
# Shared random seed, meant to make the stochastic estimators repeatable.
seed = 123

## Decision Tree Regressor

In [10]:
# Pass the shared seed so repeated runs grow the same tree.
decision_tree = DecisionTreeRegressor(random_state=seed)
dt_models, dt_results, dt_time_train, dt_time_predict = train(decision_tree, X_train, y_train)

In [11]:
# Per-fold RMSE, fit time and prediction time for the decision tree.
print(f"results: {np.array(dt_results)} \n")
print(f"training time: {np.array(dt_time_train)} \n")
print(f"predicting time: {np.array(dt_time_predict)}")

results: [  70.17369983  779.96523474  330.96640761 1111.38663873 1657.05343663
 1650.27075265  516.49577962  244.36710696  193.46211803  152.67273642] 

training time: [0.00399995 0.00100017 0.00099945 0.00200081 0.00199175 0.00200105
 0.002002   0.00199962 0.00199962 0.00199842] 

predicting time: [0.00199962 0.00200009 0.00300193 0.00100112 0.00099993 0.00099802
 0.00199914 0.0010004  0.00200009 0.00300121]


## Random Forest

In [12]:
# Pass the shared seed so the bootstrap samples and feature draws repeat across runs.
random_forest = RandomForestRegressor(n_estimators=2000, random_state=seed)
rf_models, rf_results, rf_time_train, rf_time_predict = train(random_forest, X_train, y_train)

In [13]:
# Per-fold RMSE, fit time and prediction time for the random forest.
print(f"results: {np.array(rf_results)} \n")
print(f"training time: {np.array(rf_time_train)} \n")
print(f"predicting time: {np.array(rf_time_predict)}")

results: [  55.20454026  743.65927218  422.47841585 1097.64779656 2132.43118022
  772.98632698  399.22917863  258.16494742   46.09381712   98.83190277] 

training time: [2.80620766 2.35117483 2.54118896 2.34817505 2.64419603 2.97622228
 3.20123768 3.46325636 3.39025307 3.49025917] 

predicting time: [0.13900995 0.14801121 0.14501023 0.13400984 0.15101147 0.14100957
 0.15301108 0.14501071 0.15501022 0.14501119]


## BaggingRegressor

In [14]:
# Pass the shared seed so the bootstrap resampling repeats across runs.
bagging = BaggingRegressor(random_state=seed)
bagging_models, bagging_results, bagging_time_train, bagging_time_predict = train(bagging, X_train, y_train)

In [15]:
# Per-fold RMSE, fit time and prediction time for the bagging ensemble.
print(f"results: {np.array(bagging_results)} \n")
print(f"training time: {np.array(bagging_time_train)} \n")
print(f"predicting time: {np.array(bagging_time_predict)}")

results: [  60.96144501  744.97363969  374.78389143 1073.71117311 2172.9459758
  705.22840727  283.64640669  280.6079242    57.30739     111.96764063] 

training time: [0.01700354 0.0160017  0.01700115 0.01900077 0.02099991 0.01700115
 0.01800132 0.01900125 0.02100229 0.02100158] 

predicting time: [0.00299716 0.00299883 0.00200009 0.00400019 0.00200033 0.00300026
 0.00200033 0.00200009 0.00299978 0.00400019]


## ExtraTreesRegressor

In [16]:
# Pass the shared seed so the randomised split thresholds repeat across runs.
etr = ExtraTreesRegressor(random_state=seed)
etr_models, etr_results, etr_time_train, etr_time_predict = train(etr, X_train, y_train)

In [17]:
# Per-fold RMSE, fit time and prediction time for the extra-trees ensemble.
print(f"results: {np.array(etr_results)} \n")
print(f"training time: {np.array(etr_time_train)} \n")
print(f"predicting time: {np.array(etr_time_predict)}")

results: [  54.77807804  732.03870277  416.92458956 1204.17282198 1756.76065599
  607.90828962  368.49035709  144.41315184   73.52503093   85.71945633] 

training time: [0.07000256 0.08500671 0.08100486 0.09100556 0.08800673 0.09900856
 0.10100746 0.10200787 0.12400866 0.12400818] 

predicting time: [0.01000094 0.01000047 0.00800037 0.00900102 0.00800061 0.01099992
 0.00800085 0.01100087 0.01100183 0.0080018 ]


## Linear Regression

In [18]:
# Ordinary least squares baseline with default settings.
# NOTE(review): the target built by prepare_cases_data is the mean of the cn_* feature columns,
# so a linear model can reproduce it almost exactly — read the near-zero RMSE with that in mind.
lr = LinearRegression()
lr_models, lr_results, lr_time_train, lr_time_predict = train(lr, X_train, y_train)

In [19]:
# Per-fold RMSE, fit time and prediction time for linear regression.
print(f"results: {np.array(lr_results)} \n")
print(f"training time: {np.array(lr_time_train)} \n")
print(f"predicting time: {np.array(lr_time_predict)}")

results: [2.29964475e-11 8.95970497e-11 1.06572533e-10 4.35395577e-11
 1.94057113e-11 5.79656390e-11 7.66886487e-11 1.54122175e-09
 2.45346066e-10 5.70481371e-11] 

training time: [0.00700116 0.00199938 0.00100017 0.00199962 0.00099921 0.00200176
 0.0010004  0.00100017 0.00200033 0.00100017] 

predicting time: [0.00200009 0.00200033 0.00200248 0.00099921 0.00199986 0.00099874
 0.00200081 0.00199842 0.00100017 0.00200272]


## MLPRegressor

In [20]:
# Two hidden layers (50 and 10 units) trained with L-BFGS; the shared seed
# fixes the initial weights so repeated runs give the same network.
mlpr = MLPRegressor(hidden_layer_sizes=(50,10),solver = 'lbfgs', random_state=seed)
mlpr_models, mlpr_results, mlpr_time_train, mlpr_time_predict = train(mlpr, X_train, y_train)

In [21]:
# Per-fold RMSE, fit time and prediction time for the MLP.
print(f"results: {np.array(mlpr_results)} \n")
print(f"training time: {np.array(mlpr_time_train)} \n")
print(f"predicting time: {np.array(mlpr_time_predict)}")

results: [  17.79269506  183.2899848    35.66337973  499.51298643   38.90639779
  103.73795864   44.84765847 1487.78415818    7.33067832   84.02747529] 

training time: [0.06489277 0.10600805 0.10400724 0.05600595 0.14101076 0.12801123
 0.10600781 0.01099944 0.17101288 0.13800955] 

predicting time: [0.00199842 0.00200033 0.00200319 0.0009985  0.00200033 0.00199819
 0.00200176 0.00200009 0.00200057 0.00100017]


## KNeighborsRegressor

In [22]:
# k-nearest-neighbours regressor with default settings.
# NOTE(review): features are passed unscaled, so large-magnitude columns such as checkins_avg
# presumably dominate the distance computation — confirm whether scaling was intended.
knr = KNeighborsRegressor()
knr_models, knr_results, knr_time_train, knr_time_predict = train(knr, X_train, y_train)

In [23]:
# Per-fold RMSE, fit time and prediction time for k-nearest neighbours.
print(f"results: {np.array(knr_results)} \n")
print(f"training time: {np.array(knr_time_train)} \n")
print(f"predicting time: {np.array(knr_time_predict)}")

results: [ 105.3002052   730.55926765  682.59642612 1176.00618799 4515.87618202
 3377.14259217 1770.1339271   136.17980837  269.89240589  220.87722697] 

training time: [0.00200248 0.00100064 0.00099993 0.0019989  0.00099897 0.00200009
 0.00200295 0.00100183 0.00199938 0.00100064] 

predicting time: [0.00299883 0.00199866 0.00300217 0.00200009 0.00200009 0.0010004
 0.00199819 0.00199866 0.00099874 0.00199962]


## SVR

In [24]:
# Support vector regressor with gamma='auto' and otherwise default settings.
# NOTE(review): features are passed unscaled — confirm whether scaling was intended for this kernel model.
svr = SVR(gamma='auto')
svr_models, svr_results, svr_time_train, svr_time_predict = train(svr, X_train, y_train)

In [25]:
# Per-fold RMSE, fit time and prediction time for the SVR.
print(f"results: {np.array(svr_results)} \n")
print(f"training time: {np.array(svr_time_train)} \n")
print(f"predicting time: {np.array(svr_time_predict)}")

results: [ 186.27848825  900.06770033 1575.05189039 2305.24813961 5370.5362219
 4012.32356474  548.9081769   819.93109196  556.97383142  406.28661111] 

training time: [0.00299859 0.00200057 0.00199842 0.00199962 0.0039959  0.00499988
 0.00600028 0.00500059 0.00800014 0.00700164] 

predicting time: [0.00300097 0.00200009 0.00200033 0.00200105 0.00200129 0.00200057
 0.00200033 0.00200009 0.00200009 0.00199914]


## Result Summary

In [26]:
# Per-fold RMSE of every model: one row per model, one column per fold.
rmse_by_model = {
    'Decision Tree Regressor': dt_results,
    'Random Forest': rf_results,
    'BaggingRegressor': bagging_results,
    'ExtraTreesRegressor': etr_results,
    'Linear Regression': lr_results,
    'MLPRegressor': mlpr_results,
    'KNeighborsRegressor': knr_results,
    'SVR': svr_results,
}

# Build the table in one call instead of growing an empty frame row by row,
# and number the fold columns from the data rather than hard-coding ten.
df_result = pd.DataFrame.from_dict(
    rmse_by_model,
    orient='index',
    columns=list(range(1, len(dt_results) + 1)),
)

df_result.to_csv('cases_results.csv')

df_result

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
Decision Tree Regressor,70.1737,779.9652,330.9664,1111.387,1657.053,1650.271,516.4958,244.3671,193.4621,152.6727
Random Forest,55.20454,743.6593,422.4784,1097.648,2132.431,772.9863,399.2292,258.1649,46.09382,98.8319
BaggingRegressor,60.96145,744.9736,374.7839,1073.711,2172.946,705.2284,283.6464,280.6079,57.30739,111.9676
ExtraTreesRegressor,54.77808,732.0387,416.9246,1204.173,1756.761,607.9083,368.4904,144.4132,73.52503,85.71946
Linear Regression,2.299645e-11,8.959705e-11,1.065725e-10,4.353956e-11,1.940571e-11,5.796564e-11,7.668865e-11,1.541222e-09,2.453461e-10,5.704814e-11
MLPRegressor,17.7927,183.29,35.66338,499.513,38.9064,103.738,44.84766,1487.784,7.330678,84.02748
KNeighborsRegressor,105.3002,730.5593,682.5964,1176.006,4515.876,3377.143,1770.134,136.1798,269.8924,220.8772
SVR,186.2785,900.0677,1575.052,2305.248,5370.536,4012.324,548.9082,819.9311,556.9738,406.2866


In [27]:
# Per-fold fit time (seconds) of every model: one row per model, one column per fold.
fit_seconds_by_model = {
    'Decision Tree Regressor': dt_time_train,
    'Random Forest': rf_time_train,
    'BaggingRegressor': bagging_time_train,
    'ExtraTreesRegressor': etr_time_train,
    'Linear Regression': lr_time_train,
    'MLPRegressor': mlpr_time_train,
    'KNeighborsRegressor': knr_time_train,
    'SVR': svr_time_train,
}

# Build the table in one call instead of growing an empty frame row by row,
# and number the fold columns from the data rather than hard-coding ten.
df_time_train = pd.DataFrame.from_dict(
    fit_seconds_by_model,
    orient='index',
    columns=list(range(1, len(dt_time_train) + 1)),
)

df_time_train.to_csv('cases_train_time.csv')

df_time_train

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
Decision Tree Regressor,0.004,0.001,0.000999,0.002001,0.001992,0.002001,0.002002,0.002,0.002,0.001998
Random Forest,2.806208,2.351175,2.541189,2.348175,2.644196,2.976222,3.201238,3.463256,3.390253,3.490259
BaggingRegressor,0.017004,0.016002,0.017001,0.019001,0.021,0.017001,0.018001,0.019001,0.021002,0.021002
ExtraTreesRegressor,0.070003,0.085007,0.081005,0.091006,0.088007,0.099009,0.101007,0.102008,0.124009,0.124008
Linear Regression,0.007001,0.001999,0.001,0.002,0.000999,0.002002,0.001,0.001,0.002,0.001
MLPRegressor,0.064893,0.106008,0.104007,0.056006,0.141011,0.128011,0.106008,0.010999,0.171013,0.13801
KNeighborsRegressor,0.002002,0.001001,0.001,0.001999,0.000999,0.002,0.002003,0.001002,0.001999,0.001001
SVR,0.002999,0.002001,0.001998,0.002,0.003996,0.005,0.006,0.005001,0.008,0.007002


In [28]:
# Per-fold prediction time (seconds) of every model: one row per model, one column per fold.
predict_seconds_by_model = {
    'Decision Tree Regressor': dt_time_predict,
    'Random Forest': rf_time_predict,
    'BaggingRegressor': bagging_time_predict,
    'ExtraTreesRegressor': etr_time_predict,
    'Linear Regression': lr_time_predict,
    'MLPRegressor': mlpr_time_predict,
    'KNeighborsRegressor': knr_time_predict,
    'SVR': svr_time_predict,
}

# Build the table in one call instead of growing an empty frame row by row,
# and number the fold columns from the data rather than hard-coding ten.
df_time_predict = pd.DataFrame.from_dict(
    predict_seconds_by_model,
    orient='index',
    columns=list(range(1, len(dt_time_predict) + 1)),
)

df_time_predict.to_csv('cases_predict_time.csv')

df_time_predict

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
Decision Tree Regressor,0.002,0.002,0.003002,0.001001,0.001,0.000998,0.001999,0.001,0.002,0.003001
Random Forest,0.13901,0.148011,0.14501,0.13401,0.151011,0.14101,0.153011,0.145011,0.15501,0.145011
BaggingRegressor,0.002997,0.002999,0.002,0.004,0.002,0.003,0.002,0.002,0.003,0.004
ExtraTreesRegressor,0.010001,0.01,0.008,0.009001,0.008001,0.011,0.008001,0.011001,0.011002,0.008002
Linear Regression,0.002,0.002,0.002002,0.000999,0.002,0.000999,0.002001,0.001998,0.001,0.002003
MLPRegressor,0.001998,0.002,0.002003,0.000998,0.002,0.001998,0.002002,0.002,0.002001,0.001
KNeighborsRegressor,0.002999,0.001999,0.003002,0.002,0.002,0.001,0.001998,0.001999,0.000999,0.002
SVR,0.003001,0.002,0.002,0.002001,0.002001,0.002001,0.002,0.002,0.002,0.001999
