In [1]:
import os
import sys
# Run from the project root so the relative 'data/...' paths used later resolve.
# The location can be overridden with the COVID19_PROJECT_DIR environment variable;
# when the directory does not exist on this machine the current working directory
# is left untouched instead of aborting the whole notebook at the first cell.
PROJECT_DIR = os.environ.get('COVID19_PROJECT_DIR', '/Users/somanathnanda/baba/Covid-19/')
if os.path.isdir(PROJECT_DIR):
    os.chdir(PROJECT_DIR)

import pandas as pd
pd.options.display.max_columns=None
import numpy as np
import plotly.graph_objects as go
import plotly.express as px

from plotly.offline import init_notebook_mode, iplot
import warnings

# `display` and `HTML` are public API of IPython.display; importing them from
# IPython.core.display is deprecated and fails on recent IPython releases.
from IPython.display import display, HTML
# Let the notebook use the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))
# NOTE: this silences every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
init_notebook_mode(connected=True)

import tensorflow as tf
tf.get_logger().setLevel('INFO')
tf.autograph.set_verbosity(0)


from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.callbacks import Callback, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
import time
# Reference point for the run-time report in the last cell. process_time() counts
# CPU time of this process, not wall-clock time.
start = time.process_time()

In [2]:
def plot_timeseries(x_axis,data,name):
    """Show a Plotly figure with one line trace per column of `data`.

    Args:
        x_axis: x values shared by every trace.
        data: DataFrame; each column becomes a trace named after the column.
        name: figure title.
    """
    traces = [
        go.Scatter(x=x_axis, y=data.iloc[:, position], name=data.columns[position])
        for position in range(data.shape[1])
    ]
    figure = go.Figure(traces)
    text_font = dict(family="Courier New, monospace", color="#7f7f6a")
    figure.update_layout(title=name, font=text_font)
    figure.show()
def plot_lines(df, title='', annotations=None):
    """Render every column of `df` as a lines+markers trace over the frame's index.

    Args:
        df: DataFrame whose index supplies the x values and whose columns
            supply one trace each (trace name = column label).
        title: figure title.
        annotations: passed through unchanged as the layout's `annotations`.
    """
    traces = [
        go.Scatter(x=df.index, y=df[column], mode='lines+markers', name=column)
        for column in df.columns
    ]
    figure_spec = {
        'data': traces,
        'layout': {
            'title': title,
            'showlegend': True,
            'annotations': annotations,
            'xaxis': {'title': 'Time Steps'},
        },
    }
    iplot(figure_spec, show_link=True)

In [3]:
# Load the three time-series CSVs (confirmed, recovered, deaths). The paths are
# relative, so they resolve against the current working directory.
confirmed_data_df=pd.read_csv('data/time_series_2019-ncov-Confirmed.csv')
recovered_data_df=pd.read_csv('data/time_series_2019-ncov-Recovered.csv')
death_data_df=pd.read_csv('data/time_series_2019-ncov-Deaths.csv')

# Result tables, populated by the per-country training loop further down.
country_pred_df=pd.DataFrame()
model_performance_df=pd.DataFrame()

def _country_totals(data_df, country_name):
    """Sum all rows of one country in `data_df` into a single Series indexed by the remaining column labels."""
    country_rows = data_df[data_df['Country/Region'] == country_name]
    if country_rows.empty:
        # Same exception type the previous groupby().get_group() lookup raised.
        raise KeyError(country_name)
    # Drop every non-count column by name. 'Country/Region' is removed explicitly
    # so that only numeric columns are summed; summing it as well would
    # concatenate the country strings and turn the whole result into object dtype.
    counts = country_rows.drop(columns=['Province/State', 'Country/Region', 'Lat', 'Long'])
    return counts.sum(axis=0, skipna=True)

def get_time_series_data(country_name):
    """Build the confirmed/recovered/death time series for one country.

    Reads the module-level frames `confirmed_data_df`, `recovered_data_df` and
    `death_data_df`, and sums the rows (e.g. several provinces) belonging to
    `country_name` column by column.

    Args:
        country_name: value to match in the 'Country/Region' column.

    Returns:
        DataFrame with numeric columns 'confirmed_count', 'recovered_count' and
        'death_count', indexed by the column labels of the confirmed frame that
        remain after the descriptive columns are dropped.

    Raises:
        KeyError: if `country_name` does not occur in one of the source frames.
        ValueError: if the three sources do not yield series of equal length.
    """
    confirmed = _country_totals(confirmed_data_df, country_name)
    recovered = _country_totals(recovered_data_df, country_name)
    deaths = _country_totals(death_data_df, country_name)
    # `.values` keeps the original positional pairing of the three series; the
    # confirmed frame's labels are used as the index for all of them.
    ts_df = pd.DataFrame(
        {
            'confirmed_count': confirmed.values,
            'recovered_count': recovered.values,
            'death_count': deaths.values,
        },
        index=confirmed.index,
    )
    return ts_df
def plot(ts_df,country_name):
    """Chart every column of `ts_df` via plot_lines, titled 'count_comparison_<country_name>'."""
    chart_title = 'count_comparison_' + country_name
    plot_lines(ts_df, chart_title)

# split a univariate sequence into samples
def split_sequence(sequence, n_steps):
    """Cut a sequence into sliding windows and the value that follows each window.

    Window i is sequence[i:i + n_steps] and its target is sequence[i + n_steps].

    Args:
        sequence: array-like of observations, ordered in time.
        n_steps: number of consecutive observations per window.

    Returns:
        (X, y) as numpy arrays. X holds one window per row and y the matching
        targets. When the sequence is too short to produce a single window the
        result is still well-shaped: X has shape (0, n_steps) and y shape (0,),
        so callers can index X.shape[1] without special-casing.
    """
    sequence = np.asarray(sequence)
    # Every window needs n_steps inputs plus one following target value.
    n_samples = max(len(sequence) - n_steps, 0)
    if n_samples == 0:
        empty_X = np.empty((0, n_steps), dtype=sequence.dtype)
        empty_y = np.empty((0,), dtype=sequence.dtype)
        return empty_X, empty_y
    X = np.stack([sequence[i:i + n_steps] for i in range(n_samples)])
    # Copy so the targets do not alias the caller's array.
    y = sequence[n_steps:n_steps + n_samples].copy()
    return X, y

def build_model_and_predict(ts_df, country_name, n_steps=4, n_test=4, epochs=4000):
    """Fit an LSTM on one country's confirmed counts and predict the next value.

    The `confirmed_count` column is cut into sliding windows of `n_steps`
    values, each paired with the value that follows it. The last `n_test`
    windows are held out; they serve both as the validation set during training
    and as the data the reported metrics are computed on, so those metrics are
    not an independent test score.

    Args:
        ts_df: DataFrame with a `confirmed_count` column, ordered in time.
        country_name: used for the checkpoint file name (country_name + '.hdf5')
            and for the 'Country' column of the performance row.
        n_steps: window length fed to the model (default 4).
        n_test: number of trailing windows held out for validation (default 4).
        epochs: maximum number of training epochs (default 4000).

    Returns:
        (yhat, model_performance): `yhat` is the output of model.predict for the
        last `n_steps` observations (callers read yhat[0][0]);
        `model_performance` is a one-row DataFrame with columns 'Country',
        'mae', 'mse', 'rmse' and 'accuracy'.

    Raises:
        ValueError: if `n_steps` or `n_test` is below 1, or the series is too
            short to leave at least one training window.
    """
    if n_steps < 1 or n_test < 1:
        raise ValueError('n_steps and n_test must both be at least 1')
    n_features = 1
    series = ts_df.confirmed_count.values.astype('float32')
    X, Y = split_sequence(series, n_steps)
    if len(X) <= n_test:
        raise ValueError(
            'Not enough observations for {}: {} window(s) available, more than {} required'.format(
                country_name, len(X), n_test))
    X = X.reshape((X.shape[0], n_steps, n_features))
    # Hold out the last n_test windows as the validation set.
    X_train, X_test, Y_train, Y_test = X[:-n_test], X[-n_test:], Y[:-n_test], Y[-n_test:]
    callbacks = [
        # save_freq='epoch': val_loss only exists at epoch end. An integer
        # save_freq counts batches, where the monitored val_loss is unavailable.
        ModelCheckpoint(country_name+'.hdf5', save_best_only=True, monitor='val_loss', mode='min', verbose=0, save_freq='epoch'),
        # restore_best_weights so that evaluation and prediction use the best
        # epoch rather than the weights left after `patience` further epochs.
        EarlyStopping(monitor='val_loss', min_delta=0, patience=600, verbose=0, restore_best_weights=True),
        ReduceLROnPlateau(monitor='val_loss',factor=0.2,patience=5,min_lr=0.00001)
    ]
    model = Sequential()
    model.add(LSTM(100, activation='relu', kernel_initializer='he_normal', input_shape=(n_steps, n_features)))
    model.add(Dense(50, activation='relu', kernel_initializer='he_normal'))
    model.add(Dense(50, activation='relu', kernel_initializer='he_normal'))
    model.add(Dense(1))
    # 'accuracy' is kept so the performance table keeps its columns; it is not a
    # meaningful metric for a regression target.
    model.compile(optimizer='adam', loss='mse', metrics=['mae','mse','accuracy'])
    # shuffle=False keeps the windows in chronological order during training.
    model.fit(X_train, Y_train, epochs=epochs, batch_size=32, verbose=0,
              callbacks=callbacks, validation_data=(X_test, Y_test), shuffle=False)
    # evaluate() returns the loss first, then the metrics in compile() order.
    loss, mae, mse, accuracy = model.evaluate(X_test, Y_test, batch_size=32, verbose=0)
    model_performance=pd.DataFrame({'Country':[country_name],'mae':[mae],'mse':[mse],
                                    'rmse':[np.sqrt(mse)],'accuracy':[accuracy]})
    # The most recent n_steps observations form the input for the forecast.
    row = series[-n_steps:].reshape((1, n_steps, n_features))
    yhat = model.predict(row)
    return yhat, model_performance

In [None]:
#country_names=confirmed_data_df['Country/Region'].unique()
country_names=['India','Ireland','Germany','Spain','Australia','Denmark','Switzerland','Belgium','Iran','France','Netherlands','Slovakia','Austria','Israel',
               'Canada','Singapore','Japan','Sweden','Italy','Portugal','Brazil','US','Argentina','Latvia','Norway','Malaysia','United Kingdom']
# Collect the per-country results in plain lists and build the frames once after
# the loop. DataFrame.append was removed in pandas 2.0 and copied the whole frame
# on every call; rebuilding from scratch also makes this cell safe to re-run
# without duplicating rows.
performance_frames=[]
prediction_records=[]
for country_name in country_names:
    ts_df=get_time_series_data(country_name)
    yhat, model_performance = build_model_and_predict(ts_df,country_name)
    prediction=round(yhat[0][0])
    performance_frames.append(model_performance)
    prediction_records.append({'Country':country_name,
                               'LastValue':ts_df['confirmed_count'].iloc[-1],
                               'PredictedValue':prediction})
model_performance_df=pd.concat(performance_frames)
country_pred_df=pd.DataFrame(prediction_records)
    

In [5]:
# Chart the confirmed/recovered/death series of every country, one figure each.
for country_name in country_names:
    ts_df=get_time_series_data(country_name)
    plot(ts_df,country_name)

In [6]:
# Order the countries by predicted count so the tallest bars come first.
country_pred_df=country_pred_df.reset_index(drop=True)
country_pred_df=country_pred_df.sort_values(by='PredictedValue',ascending=False)
fig = go.Figure()
fig.add_trace(go.Bar(x=country_pred_df['Country'].values,
                y=country_pred_df['LastValue'].values,
                name='# Cases Yesterday (Actual)',
                marker_color='rgb(55, 83, 109)'
                ))
fig.add_trace(go.Bar(x=country_pred_df['Country'].values,
                y=country_pred_df['PredictedValue'].values,
                name='# Cases Today (Predicted)',
                marker_color='rgb(26, 118, 255)'
                ))

fig.update_layout(
    title='LSTM Model to Predict Country wise case count for the next day',
    xaxis_tickfont_size=14,
    yaxis=dict(
        # The axis title font is set through title.font; the older `titlefont`
        # attribute is deprecated and rejected by current Plotly releases.
        title=dict(text='No. Of Cases', font=dict(size=16)),
        tickfont_size=14,
    ),
    legend=dict(
        x=0,
        y=1.0,
        bgcolor='rgba(255, 255, 255, 0)',
        bordercolor='rgba(255, 255, 255, 0)'
    ),
    barmode='group',
    bargap=0.15, # gap between bars of adjacent location coordinates.
    bargroupgap=0.1 # gap between bars of the same location coordinate.
)
fig.show()

In [20]:
# Sort alphabetically and move 'Country' from a column into the index.
# Guarded so that running the cell a second time is a no-op instead of a
# KeyError (after the first run 'Country' is no longer a column).
if 'Country' in country_pred_df.columns:
    country_pred_df=country_pred_df.sort_values(by='Country').set_index('Country')

In [21]:
# Display the last observed and the predicted count for each country.
country_pred_df

Unnamed: 0_level_0,LastValue,PredictedValue
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Argentina,158,195.0
Australia,1071,1301.0
Austria,2814,3751.0
Belgium,2815,3342.0
Brazil,1021,1268.0
Canada,1278,1646.0
Denmark,1420,1490.0
France,14431,19375.0
Germany,22213,30957.0
India,330,286.0


In [8]:
# Line chart of last observed vs. predicted counts, one x position per row of the frame.
# NOTE: plot_lines always labels the x-axis 'Time Steps', although the x values
# here come from the frame's index, which is set from the 'Country' column.
plot_lines(country_pred_df,'Yesterday vs Today')

#### Compare the model performance metrics (MAE, MSE, RMSE) for each country

In [9]:
# Move 'Country' from a column into an unnamed index. Guarded so that
# re-running the cell does not fail once the column has already been consumed.
if 'Country' in model_performance_df.columns:
    model_performance_df=model_performance_df.set_index('Country').rename_axis(None)
plot_lines(model_performance_df)

In [10]:
# Display the validation metrics recorded for each country's model.
model_performance_df

Unnamed: 0,mae,mse,rmse,accuracy
India,27.631603,1519.729,38.983708,0.0
Ireland,76.268501,11037.89,105.061366,0.0
Germany,910.125244,2341896.0,1530.325619,0.0
Spain,647.991943,495098.0,703.632028,0.0
Australia,67.827209,7541.45,86.841523,0.0
Denmark,44.40921,2183.351,46.726342,0.0
Switzerland,874.508789,833006.1,912.691692,0.0
Belgium,60.342712,5638.912,75.09269,0.0
Iran,20329.28125,416116300.0,20398.928599,0.0
France,1189.057373,1876714.0,1369.932206,0.0


In [11]:
# `start` was taken with time.process_time() in the first cell, so the figure
# printed here is CPU time used by this process (not wall-clock time), in minutes.
print('Total Time taken : '+str((time.process_time() - start)/60)+' mins')

Total Time taken : 220.01297325 mins
