In [None]:
import tensorflow as tf
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.preprocessing as scl
from sklearn import tree
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from IPython.display import clear_output

# Raise the default resolution of every Matplotlib figure created below.
plt.rcParams['figure.dpi'] = 300
# IPython magic: ask the inline backend to emit figures in 'retina' format.
%config InlineBackend.figure_format = 'retina'

In [None]:
def extract_stat_country(COVID_data,country_name,stat_name,N_stat,type_country,
                         since=None,date_column='date'):
    """Collect one statistic for several countries into a 2-D array.

    Parameters
    ----------
    COVID_data : pandas.DataFrame
        Table holding one row per (country, date).
    country_name : sequence of str
        Countries to extract, in the order the output rows should follow.
    stat_name : str
        Column holding the statistic to extract.
    N_stat : int
        Expected number of rows per country once the date filter is applied.
    type_country : str
        Column holding the country name.
    since : datetime-like, optional
        Keep rows whose date is on or after this value. When omitted, the
        notebook-level ``start_time`` is used, so five-argument calls behave
        as before.
    date_column : str, optional
        Column holding the ``%Y-%m-%d`` date strings (default ``'date'``).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(country_name), N_stat)``.

    Raises
    ------
    ValueError
        If a country does not have exactly ``N_stat`` rows after filtering.
    """
    # Fall back to the notebook global only when no cutoff was passed in.
    threshold = start_time if since is None else since

    N_country = len(country_name)
    stat_COVID_countries = np.empty([N_country,N_stat])

    for ind_country, name in enumerate(country_name):
        country_rows = COVID_data.loc[COVID_data[type_country] == name]
        country_dates = pd.to_datetime(country_rows[date_column],format='%Y-%m-%d')
        values = country_rows.loc[country_dates >= threshold, stat_name].to_numpy()

        # Check the length explicitly: NumPy would otherwise broadcast a
        # single remaining value across the whole row without complaint.
        if len(values) != N_stat:
            raise ValueError(
                "{!r} has {} rows for {!r} after the date filter, expected {}".format(
                    name, len(values), stat_name, N_stat))

        stat_COVID_countries[ind_country,:] = values

    return stat_COVID_countries

## 1. Read the COVID19 data

In [None]:
# Location of the COVID-19 table: directory prefix followed by the file name.
dir_name, file_name = "./DATA/", "covid-data.csv"
path_name = f"{dir_name}{file_name}"

# The file is decoded as EUC-KR while being parsed into a DataFrame.
COVID_data = pd.read_csv(path_name, encoding='euc-kr')

In [None]:
# Preview only the first rows instead of rendering the whole table.
COVID_data.head()

## 2. Select Countries [Training & Validation]

In [None]:
# Earlier European selection, kept for reference:
#   total      : United Kingdom, Germany, France, Spain, Italy,
#                Netherlands, Belgium, Greece, Romania, Austria
#   train      : United Kingdom, Germany, Italy, Netherlands, Belgium
#   validation : Romania

# Every country whose statistics get extracted.
# NOTE: the spelling `toal_country` is referenced by later cells.
toal_country = [
    "United States",
    "India",
    "Russia",
    "Brazil",
    "France",
    "United Kingdom",
    "Italy",
    "Spain",
    "Germany",
    "Japan",
]

# Countries whose series serve as model inputs.
train_country = [
    "United States",
    "India",
    "Russia",
    "Brazil",
    "France",
]

# Country whose series serves as the model target.
validation_country = ["Japan"]

# Columns to extract: [0] is the primary statistic, [1] the secondary one.
statistic_type = ["new_cases", "new_deaths"]

## 3. Extract Statistics into a Single Array

In [None]:
# Only observations dated on or after this day are kept.
start_time = pd.to_datetime('2020-03-01',format='%Y-%m-%d')

# The first statistic name identifies which column layout the CSV uses:
#   statistic name -> (country column, date column)
column_layout = {
    "newConfirmed": ("Country", "Date"),
    "new_cases":    ("location", "date"),
}
if statistic_type[0] not in column_layout:
    # Fail here with a clear message instead of a NameError further down.
    raise ValueError(
        "Unsupported statistic {!r}; expected one of {}".format(
            statistic_type[0], sorted(column_layout)))
type_country, date_column = column_layout[statistic_type[0]]

# The first listed country provides the reference date axis and series length.
tmp_COVID_country = COVID_data.loc[COVID_data[type_country] == toal_country[0]]
date_country      = tmp_COVID_country[date_column]

ind_date = pd.to_datetime(date_country,format='%Y-%m-%d') >= start_time
COVID_country = tmp_COVID_country[ind_date]
stat_COVID_country = COVID_country[statistic_type[0]]

N_country = len(train_country)
N_stat    = len(stat_COVID_country)

# One (country, time) array per country group and statistic.
total_stat_1_COVID_countries = extract_stat_country(
    COVID_data, toal_country, statistic_type[0], N_stat, type_country)
total_stat_2_COVID_countries = extract_stat_country(
    COVID_data, toal_country, statistic_type[1], N_stat, type_country)

train_stat_1_COVID_countries = extract_stat_country(
    COVID_data, train_country, statistic_type[0], N_stat, type_country)
train_stat_2_COVID_countries = extract_stat_country(
    COVID_data, train_country, statistic_type[1], N_stat, type_country)

validation_stat_1_COVID_countries = extract_stat_country(
    COVID_data, validation_country, statistic_type[0], N_stat, type_country)
validation_stat_2_COVID_countries = extract_stat_country(
    COVID_data, validation_country, statistic_type[1], N_stat, type_country)

## 4. Time Series of New Confirmed Cases for Training Countries

In [None]:
# First date of the validation window. `date` and `N_ratio` are only assigned
# in later cells, so the value is derived here from names that already exist
# in a top-to-bottom run. 0.8 is the training fraction and must stay equal to
# `ratio_data` in the split cell below.
date_country[ind_date].values[int(N_stat*0.8)]

In [None]:
N_point = 10

N_date = len(date_country)
date = date_country[ind_date].values
# max(1, ...) keeps the slice step valid when there are fewer rows than N_point.
date_name = date[1:N_date:max(1, int(N_date/N_point))]

# One colour per training country, taken from Matplotlib's active colour
# cycle; no earlier cell of the notebook defines `color_name`.
color_name = plt.rcParams['axes.prop_cycle'].by_key()['color']

fig, axes = plt.subplots(2,1,figsize=(10,7),constrained_layout=True)

for ind_country in range(N_country):
    tmp_stat_1 = train_stat_1_COVID_countries[ind_country,:]
    # Wrap around so more countries than colours cannot raise an IndexError.
    country_color = color_name[ind_country % len(color_name)]

    # Top panel: raw values (the axis is switched to log scale below).
    axes[0].plot(date,tmp_stat_1,
                 '.',linewidth=1,color=country_color,
                 label=train_country[ind_country],alpha=0.3)

    # Bottom panel: each series divided by its own maximum (NaNs ignored).
    axes[1].plot(date,tmp_stat_1/np.nanmax(tmp_stat_1),
                 '.',linewidth=1,color=country_color,
                 label=train_country[ind_country],alpha=0.3)

axes[0].set_title(statistic_type[0]+" Log Scale",fontsize=15)
axes[1].set_title(statistic_type[0]+" Linear Scale",fontsize=15)
axes[0].set_yscale("log")

for ind_axes in range(2):
    axes[ind_axes].set_xticks(date_name)
    axes[ind_axes].set_xticklabels(date_name,rotation=45)
    axes[ind_axes].legend()

# Annotate the two windows below the bottom panel with a label flanked by
# outward-pointing arrows. Centre, gap and arrow length are hard-coded
# positions along `date`.
window_annotations = [
    # (centre index, gap to arrow start, arrow length, colour, label)
    (160, 20, 140, 'red',  'Training'),
    (370, 25,  20, 'blue', 'Validation'),
]
for center, gap, length, arrow_color, label in window_annotations:
    for direction in (+1, -1):
        axes[1].arrow(date[center + direction*gap], -0.3, direction*length, 0,
                      head_width = 0.07, head_length = 5.5,
                      linewidth = 2., alpha = 0.8, color = arrow_color,
                      length_includes_head = True)
    axes[1].text(date[center], -0.3, label, color = arrow_color, size = 12.5,
                 alpha = 0.8, horizontalalignment = 'center',
                 verticalalignment = 'center')

In [None]:
# Fraction of the time axis used for fitting; the remainder is held out.
ratio_data = 0.8
N_ratio    = int(N_stat*ratio_data)

# Cut the (time, country) arrays at row N_ratio: rows before it are used for
# fitting, rows from it onwards for validation. Inputs come from the training
# countries, outputs from the validation country.
train_input_set_COVID, validation_input_set_COVID = np.split(
    train_stat_1_COVID_countries.T, [N_ratio])
train_output_set_COVID, validation_output_set_COVID = np.split(
    validation_stat_1_COVID_countries.T, [N_ratio])

# Min-max scaling is fitted on the fitting rows only and then applied to both
# input sets.
type_scaler = scl.MinMaxScaler().fit(train_input_set_COVID)
train_input, validation_input = map(
    type_scaler.transform,
    (train_input_set_COVID, validation_input_set_COVID))

# Targets are used unscaled.
train_output, validation_output = (train_output_set_COVID,
                                   validation_output_set_COVID)

## 5. Train the Decision Tree Model using Train & Validation Data

In [None]:
fig, axes = plt.subplots(1,1,figsize=(10,5))

# Plotting helpers that do not depend on the model, computed once.
N_point = 10
N_date = len(date_country)
date = date_country[ind_date].values
train_dates = date_country[ind_date][0:N_ratio]
# max(1, ...) keeps the slice step valid when N_ratio < N_point.
date_name = date[0:N_ratio:max(1, int(N_ratio/N_point))]

# Observed target series, drawn once underneath the model curves.
axes.plot(train_dates,train_output,
          'k.-',label='Data',markersize=12.5,alpha=0.75)

for iteration, max_depth in enumerate([2,5,10], start=1):
    # Fixed random_state makes tie-breaking between equally good splits
    # repeatable, so re-running the cell gives the same tree.
    clf               = tree.DecisionTreeRegressor(max_depth=max_depth,
                                                   random_state=0)
    clf               = clf.fit(train_input, train_output)
    model_prediction  = clf.predict(train_input)

    axes.plot(train_dates,model_prediction,
              '.-',markersize=7.5,alpha=0.6,
              label='Decision Tree (max_depth = {})'.format(max_depth))

    # Pearson correlation between fitted and observed values; both sides are
    # flattened so their shapes always agree.
    corr_model  = np.corrcoef(np.ravel(model_prediction),
                              np.ravel(train_output))[0][1]

    print("Corr Skill (max_depth={}): {:4.2f}".format(max_depth,corr_model))

axes.set_ylabel("newConfirmed")
axes.set_xticks(date_name)
axes.set_xticklabels(date_name,rotation=45)
axes.set_title("Decision Tree Regression (Train)")
axes.legend()

In [None]:
fig, axes = plt.subplots(1,1,figsize=(10,5))

# Plotting helpers that do not depend on the model, computed once.
N_point = 10
N_date = len(date_country)
date = date_country[ind_date].values
validation_dates = date_country[ind_date][N_ratio:]
# max(1, ...) keeps the slice step valid when the window is shorter than N_point.
date_name = date[N_ratio:N_date:max(1, int((N_date-N_ratio)/N_point))]

# Observed target series, drawn once underneath the model curves.
axes.plot(validation_dates,validation_output,
          'k.-',label='Data',markersize=12.5,alpha=0.75)

for iteration, max_depth in enumerate([2,5,10], start=1):
    # Fixed random_state makes tie-breaking between equally good splits
    # repeatable, so re-running the cell gives the same tree.
    clf               = tree.DecisionTreeRegressor(max_depth=max_depth,
                                                   random_state=0)
    clf               = clf.fit(train_input, train_output)
    model_prediction  = clf.predict(validation_input)

    axes.plot(validation_dates,model_prediction,
              '.-',markersize=7.5,alpha=0.6,
              label='Decision Tree (max_depth = {})'.format(max_depth))

    # Pearson correlation between predicted and observed values; both sides
    # are flattened so their shapes always agree.
    corr_model  = np.corrcoef(np.ravel(model_prediction),
                              np.ravel(validation_output))[0][1]

    print("Corr Skill (max_depth={}): {:4.2f}".format(max_depth,corr_model))

axes.set_ylabel("newConfirmed")
axes.set_xticks(date_name)
axes.set_xticklabels(date_name,rotation=45)
axes.set_title("Decision Tree Regression (Validation)")
axes.legend()
axes.grid(alpha=0.1)

## 6. Train the SVM Model using Train & Validation Data

In [None]:
kernel_type = ['rbf','linear','poly']
fig, axes = plt.subplots(1,1,figsize=(10,5))

N_point = 10
N_date = len(date_country)
date = date_country[ind_date].values
validation_dates = date_country[ind_date][N_ratio:]
# max(1, ...) keeps the slice step valid when the window is shorter than N_point.
date_name = date[N_ratio:N_date:max(1, int((N_date-N_ratio)/N_point))]

# The regularisation strength is the same for every kernel, so compute it once.
L_norm = np.linalg.norm(train_output,1)
# SVR expects a 1-D target; flatten the column vector before fitting.
train_target = np.ravel(train_output)

# Observed target series, drawn once underneath the model curves.
axes.plot(validation_dates,validation_output,
          'k.-',label='Data',markersize=12.5,alpha=0.75)

for it, kernel in enumerate(kernel_type):
    # `svr_rbf` holds the model for the current kernel, whichever it is.
    svr_rbf = SVR(kernel=kernel, C=L_norm, gamma='auto', epsilon=1)
    svr_rbf.fit(train_input,train_target)
    model_prediction = svr_rbf.predict(validation_input)

    axes.plot(validation_dates,model_prediction,
              '.-',markersize=7.5,alpha=0.6,
              label='SVM (kernel={})'.format(kernel))

    # Pearson correlation between predicted and observed values; both sides
    # are flattened so their shapes always agree.
    corr_model  = np.corrcoef(np.ravel(model_prediction),
                              np.ravel(validation_output))[0][1]

    print("Corr Skill (kernel={}): {:4.2f}".format(kernel,
                                                   corr_model))

axes.set_ylabel("newConfirmed")
axes.set_xticks(date_name)
axes.set_xticklabels(date_name,rotation=45)
axes.set_title("Support Vector Machine Regression (Validation)")
axes.legend()
axes.grid(alpha=0.1)

## 7. Train the KNN Model using Train & Validation Data

In [None]:
n_neighbors = [2,5,10]
fig, axes = plt.subplots(1,1,figsize=(10,5))

N_point = 10
N_date = len(date_country)
date = date_country[ind_date].values
validation_dates = date_country[ind_date][N_ratio:]
# max(1, ...) keeps the slice step valid when the window is shorter than N_point.
date_name = date[N_ratio:N_date:max(1, int((N_date-N_ratio)/N_point))]

# Observed target series, drawn once underneath the model curves.
axes.plot(validation_dates,validation_output,
          'k.-',label='Data',markersize=12.5,alpha=0.75)

for it, k in enumerate(n_neighbors):
    neigh = KNeighborsRegressor(n_neighbors=k)
    neigh.fit(train_input,train_output)
    model_prediction = neigh.predict(validation_input)

    axes.plot(validation_dates,model_prediction,
              '.-',markersize=7.5,alpha=0.6,
              label='KNN (n-neighbor={})'.format(k))

    # Score the KNN prediction itself; scoring the SVR model left over from
    # the previous section would print the same value for every k. Both sides
    # are flattened so their shapes always agree.
    corr_model  = np.corrcoef(np.ravel(model_prediction),
                              np.ravel(validation_output))[0][1]

    print("Corr Skill (n_neighbors={}): {:4.2f}".format(k,
                                                   corr_model))
axes.set_ylabel("newConfirmed")
axes.set_xticks(date_name)
axes.set_xticklabels(date_name,rotation=45)
axes.set_title("KNN Regression (Validation)")
axes.legend()
axes.grid(alpha=0.1)