# Training (ML models)

In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error, make_scorer

from sklearn.model_selection import TimeSeriesSplit, KFold
from sklearn.model_selection import cross_val_score
from sklearn.base import clone

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor 
from sklearn.linear_model import LinearRegression 
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.svm import SVR

import lightgbm as lgb

import pickle

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
df_new = pd.read_csv('train_states.csv', index_col=0)

In [4]:
df_new

Unnamed: 0,date,cases_new,state_id,cases_recovered,cases_death,cases_active,checkins,unique_ind,cumul_full,pop,percent_vax,temp
0,2021-02-24,318,2,518,0,6156,2074008,927705,0,3781000,0.00,25.80
1,2021-02-24,17,3,187,0,578,817989,413976,0,2185100,0.00,28.80
2,2021-02-24,53,4,100,0,698,278048,150824,0,1906700,0.00,25.70
3,2021-02-24,26,5,37,0,363,576245,259793,0,932700,0.00,27.50
4,2021-02-24,1392,6,119,0,2208,639575,304428,0,1128800,0.00,27.50
...,...,...,...,...,...,...,...,...,...,...,...,...
4875,2021-12-25,802,11,1151,6,12390,7272213,3243895,4759977,6538000,0.73,28.35
4876,2021-12-25,183,12,111,1,2453,496077,260964,877237,1259300,0.70,27.05
4877,2021-12-25,159,15,326,2,3482,5404399,2379561,3019764,1773700,1.70,28.70
4878,2021-12-25,2,16,10,0,62,71513,28947,79555,99600,0.80,27.70


In [5]:
def prepare_cases_data(state):
    # prepare the columns
    columns = ['cases_new','cases_active','checkins','percent_vax','temp']
    df = df_new[df_new['state_id']==state]
    df = df[columns]
    
    # transform the time series data
    n_days = 5
    column_arrangement = []
    for i in range(n_days):
        shifted_df = df.shift(i+1)
        df['cn_'+str(i+1)] = shifted_df['cases_new']        
        #df['ca_'+str(i+1)] = shifted_df['cases_active']
        column_arrangement.append('cn_'+str(i+1))
    
    # averaging the number of cases
    df_cases_new = df['cases_new']
    df['cases_avg'] = 0
    for i in range(n_days):
        df['cases_avg'] += df_cases_new.shift(i+1)
    df['cases_avg'] = df['cases_avg']/n_days
    
    # averaging the temperature
    df_temp = df['temp']
    df['temp_avg'] = 0
    for i in range(n_days):
        df['temp_avg'] += df_temp.shift(i+1)
    df['temp_avg'] = df['temp_avg']/n_days
    
    # averaging the checkins
    df_checkins = df['checkins']
    df['checkins_avg'] = 0
    for i in range(n_days):
        df['checkins_avg'] += df_checkins.shift(i+1)
    df['checkins_avg'] = df['checkins_avg']/n_days
    
    # averaging the checkins
    df_cases_active = df['cases_active']
    df['cases_active_avg'] = 0
    for i in range(n_days):
        df['cases_active_avg'] += df_cases_active.shift(i+1)
    df['cases_active_avg'] = df['cases_active_avg']/n_days
    
    df = df.dropna()
    
    # remove unused columns
    df = df.drop(columns=['cases_new', 'temp', 'cases_active', 'checkins'])
    
    # rearrange columns
    column_arrangement += ['cases_active_avg', 'checkins_avg', 'temp_avg', 'percent_vax']  
    
    # prepare test-train data
    target = 'cases_avg'    
    y = df[target]
    X = df[column_arrangement]
    
    return X, y

In [6]:
def prepare_death_data(state):
    # prepare the columns
    columns = ['cases_new','cases_active','cases_death']
    df = df_new[df_new['state_id']==state]
    df = df[columns]
    
    # transform the time series data
    n_days = 5
    column_arrangement = []
    for i in range(n_days):
        shifted_df = df.shift(i+1)
        df['cn_'+str(i+1)] = shifted_df['cases_new']      
        column_arrangement.append('cn_'+str(i+1))
    
    # averaging the number of cases
    df_cases_new = df['cases_new']
    df['cases_avg'] = 0
    for i in range(n_days):
        df['cases_avg'] += df_cases_new.shift(i+1)
    df['cases_avg'] = df['cases_avg']/n_days
    
    # averaging the temperature
    df_temp = df['cases_death']
    df['cases_death_avg'] = 0
    for i in range(n_days):
        df['cases_death_avg'] += df_temp.shift(i+1)
    df['cases_death_avg'] = df['cases_death_avg']/n_days
    
    # averaging the active cases
    df_cases_active = df['cases_active']
    df['cases_active_avg'] = 0
    for i in range(n_days):
        df['cases_active_avg'] += df_cases_active.shift(i+1)
    df['cases_active_avg'] = df['cases_active_avg']/n_days
    
    df = df.dropna()
    
    # remove unused columns
    df = df.drop(columns=['cases_new', 'cases_death', 'cases_active'])
    
    # rearrange columns
    column_arrangement += ['cases_active_avg']  
    
    # prepare test-train data
    target = 'cases_death_avg'    
    y = df[target]
    X = df[column_arrangement]
    
    return X, y

In [7]:
prepare_death_data(2)

(       cn_1   cn_2   cn_3   cn_4   cn_5  cases_active_avg
 80    340.0  314.0  276.0  257.0  318.0            5211.8
 96    490.0  340.0  314.0  276.0  257.0            4924.2
 112   163.0  490.0  340.0  314.0  276.0            4691.8
 128   376.0  163.0  490.0  340.0  314.0            4544.8
 144   255.0  376.0  163.0  490.0  340.0            4474.2
 ...     ...    ...    ...    ...    ...               ...
 4800  355.0  297.0  407.0  399.0  471.0            6383.8
 4816  378.0  355.0  297.0  407.0  399.0            6334.8
 4832  467.0  378.0  355.0  297.0  407.0            6293.6
 4848  398.0  467.0  378.0  355.0  297.0            6234.6
 4864  396.0  398.0  467.0  378.0  355.0            6163.8
 
 [300 rows x 6 columns],
 80      0.6
 96      0.6
 112     0.6
 128     0.6
 144     0.4
        ... 
 4800    3.2
 4816    2.8
 4832    3.2
 4848    3.2
 4864    3.6
 Name: cases_death_avg, Length: 300, dtype: float64)

In [8]:
def train(model, X, y):
    models = []
    scores = []
    split_method = TimeSeriesSplit(n_splits=10)
    #split_method = KFold(n_splits=10)
    
    for train_index, test_index in split_method.split(X):
        model_new = clone(model)
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        model_new.fit(X_train, y_train)
        result = mean_squared_error(y_test, model_new.predict(X_test))**0.5
        
        models.append(model_new)
        scores.append(result)
        
    return models, scores

## Train for all states

In [9]:
#regressor = RandomForestRegressor(n_estimators=100)
#regressor = LinearRegression()
regressor = MLPRegressor(hidden_layer_sizes=(50,10),solver = 'lbfgs')


In [10]:
models = {'cases':{}, 'death':{}}

for state in range(2,18):
    # get the data for each state
    X, y = prepare_cases_data(state)
    
    # train some models
    state_models, state_results = train(regressor, X, y)
    
    # getting the model with the lowest RSME for each state
    i = state_results.index(min(state_results))
    best_state_model = state_models[i]
    models['cases'][state] = best_state_model

    
for state in range(2,18):
    # get the data for each state
    X, y = prepare_death_data(state)
    
    # train some models
    state_models, state_results = train(regressor, X, y)
    
    # getting the model with the lowest RSME for each state
    i = state_results.index(min(state_results))
    best_state_model = state_models[i]
    models['death'][state] = best_state_model

In [11]:
models

{'cases': {2: MLPRegressor(hidden_layer_sizes=(50, 10), solver='lbfgs'),
  3: MLPRegressor(hidden_layer_sizes=(50, 10), solver='lbfgs'),
  4: MLPRegressor(hidden_layer_sizes=(50, 10), solver='lbfgs'),
  5: MLPRegressor(hidden_layer_sizes=(50, 10), solver='lbfgs'),
  6: MLPRegressor(hidden_layer_sizes=(50, 10), solver='lbfgs'),
  7: MLPRegressor(hidden_layer_sizes=(50, 10), solver='lbfgs'),
  8: MLPRegressor(hidden_layer_sizes=(50, 10), solver='lbfgs'),
  9: MLPRegressor(hidden_layer_sizes=(50, 10), solver='lbfgs'),
  10: MLPRegressor(hidden_layer_sizes=(50, 10), solver='lbfgs'),
  11: MLPRegressor(hidden_layer_sizes=(50, 10), solver='lbfgs'),
  12: MLPRegressor(hidden_layer_sizes=(50, 10), solver='lbfgs'),
  13: MLPRegressor(hidden_layer_sizes=(50, 10), solver='lbfgs'),
  14: MLPRegressor(hidden_layer_sizes=(50, 10), solver='lbfgs'),
  15: MLPRegressor(hidden_layer_sizes=(50, 10), solver='lbfgs'),
  16: MLPRegressor(hidden_layer_sizes=(50, 10), solver='lbfgs'),
  17: MLPRegressor(hidde

## Pack the model

In [12]:
model_file = "covid_cases_regressor.pickle"
with open(model_file, 'wb') as f:
    pickle.dump(models, f)