<a href="https://colab.research.google.com/github/archita17/Covid-Data-Analysis/blob/main/Covid_Data_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import math
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import datetime as dt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
tf=pd.read_csv('case_time_series (1).csv',)
print (tf)

                Date    Date_YMD  ...  Daily Deceased  Total Deceased
0    30 January 2020  2020-01-30  ...               0               0
1    31 January 2020  2020-01-31  ...               0               0
2    1 February 2020  2020-02-01  ...               0               0
3    2 February 2020  2020-02-02  ...               0               0
4    3 February 2020  2020-02-03  ...               0               0
..               ...         ...  ...             ...             ...
494      7 June 2021  2021-06-07  ...            2106          350744
495      8 June 2021  2021-06-08  ...            2222          352966
496      9 June 2021  2021-06-09  ...            6139          359105
497     10 June 2021  2021-06-10  ...            3414          362519
498     11 June 2021  2021-06-11  ...            3996          366515

[499 rows x 8 columns]


In [None]:
tf.columns

Index(['Date', 'Date_YMD', 'Daily Confirmed', 'Total Confirmed',
       'Daily Recovered', 'Total Recovered', 'Daily Deceased',
       'Total Deceased'],
      dtype='object')

In [None]:
tf.shape

(499, 8)

In [None]:
tf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 499 entries, 0 to 498
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Date             499 non-null    object
 1   Date_YMD         499 non-null    object
 2   Daily Confirmed  499 non-null    int64 
 3   Total Confirmed  499 non-null    int64 
 4   Daily Recovered  499 non-null    int64 
 5   Total Recovered  499 non-null    int64 
 6   Daily Deceased   499 non-null    int64 
 7   Total Deceased   499 non-null    int64 
dtypes: int64(6), object(2)
memory usage: 31.3+ KB


In [None]:
# checking missing values in training dataset
tf.isnull().sum()

Date               0
Date_YMD           0
Daily Confirmed    0
Total Confirmed    0
Daily Recovered    0
Total Recovered    0
Daily Deceased     0
Total Deceased     0
dtype: int64

In [None]:
tf.describe()

Unnamed: 0,Daily Confirmed,Total Confirmed,Daily Recovered,Total Recovered,Daily Deceased,Total Deceased
count,499.0,499.0,499.0,499.0,499.0,499.0
mean,58834.324649,7215531.0,55917.977956,6512819.0,734.498998,97768.639279
std,87896.903383,7495691.0,85655.05492,6813472.0,1055.537607,88785.399163
min,0.0,1.0,0.0,0.0,0.0,0.0
25%,8647.0,212031.5,4650.5,102169.5,104.0,5958.5
50%,26227.0,6682322.0,22730.0,5659286.0,384.0,103015.0
75%,67427.5,10833130.0,61851.0,10527280.0,881.0,154484.0
max,414280.0,29358330.0,422391.0,27903070.0,6139.0,366515.0


In [None]:
tf.dtypes

Date               object
Date_YMD           object
Daily Confirmed     int64
Total Confirmed     int64
Daily Recovered     int64
Total Recovered     int64
Daily Deceased      int64
Total Deceased      int64
dtype: object

In [None]:
#Feature Engineering
tf['Date'] = pd.to_datetime(tf['Date'])
tf['Date_YMD'] = pd.to_datetime(tf['Date_YMD'])

In [None]:
tf.dtypes

Date               datetime64[ns]
Date_YMD           datetime64[ns]
Daily Confirmed             int64
Total Confirmed             int64
Daily Recovered             int64
Total Recovered             int64
Daily Deceased              int64
Total Deceased              int64
dtype: object

Performing datewise analysis of data

In [None]:
acctodate=tf.groupby(["Date"]).agg({"Total Confirmed":'sum',"Total Recovered":'sum',"Total Deceased":'sum',"Daily Confirmed":'sum',"Daily Recovered":'sum',"Daily Deceased":'sum'})
acctodate["Days Since"]=acctodate.index-acctodate.index.min()

In [None]:
print("Initial Info")
print("Total number of Confirmed Cases in India: ",acctodate["Total Confirmed"].iloc[-1])
print("Total number of Recovered Cases in India: ",acctodate["Total Recovered"].iloc[-1])
print("Total number of Deaths Cases in India: ",acctodate["Total Deceased"].iloc[-1])
print("Total number of Active Cases around the World: ",(acctodate["Total Confirmed"].iloc[-1]-acctodate["Total Recovered"].iloc[-1]-acctodate["Total Deceased"].iloc[-1]))
print("Total number of Closed Cases around the World: ",acctodate["Total Recovered"].iloc[-1]+acctodate["Total Deceased"].iloc[-1])
print("Approximate number of Confirmed Cases per Day: ",acctodate["Daily Confirmed"].iloc[-1])
print("Approximate number of Recovered Cases per Day: ",acctodate["Daily Recovered"].iloc[-1])
print("Approximate number of Death Cases per Day ",acctodate["Daily Deceased"].iloc[-1])
print("Approximate number of Confirmed Cases per hour: ",np.round(acctodate["Total Confirmed"].iloc[-1]/((acctodate.shape[0])*24)))
print("Approximate number of Recovered Cases per hour: ",np.round(acctodate["Total Recovered"].iloc[-1]/((acctodate.shape[0])*24)))
print("Approximate number of Death Cases per hour: ",np.round(acctodate["Total Deceased"].iloc[-1]/((acctodate.shape[0])*24)))

Initial Info
Total number of Confirmed Cases in India:  29358328
Total number of Recovered Cases in India:  27903071
Total number of Deaths Cases in India:  366515
Total number of Active Cases around the World:  1088742
Total number of Closed Cases around the World:  28269586
Approximate number of Confirmed Cases per Day:  84573
Approximate number of Recovered Cases per Day:  122680
Approximate number of Death Cases per Day  3996
Approximate number of Confirmed Cases per hour:  2451.0
Approximate number of Recovered Cases per hour:  2330.0
Approximate number of Death Cases per hour:  31.0


In [None]:
fig=px.bar(x=acctodate.index,y=acctodate["Total Confirmed"]-acctodate["Total Recovered"]-acctodate["Total Deceased"])
fig.update_layout(title="Distribution of Number of Active Cases", xaxis_title="Date",yaxis_title="Number of Cases",)
fig.show()

In [None]:
fig=px.bar(x=acctodate.index,y=acctodate["Total Recovered"]+acctodate["Total Deceased"])
fig.update_layout(title="Distribution of Number of Closed Cases", xaxis_title="Date",yaxis_title="Number of Cases")
fig.show()

In [None]:
#Calculating the Mortality Rate and Recovery Rate
acctodate["Mortality Rate"]=(acctodate["Total Deceased"]/acctodate["Total Confirmed"])*100
acctodate["Recovery Rate"]=(acctodate["Total Recovered"]/acctodate["Total Confirmed"])*100
acctodate["Active Cases"]=acctodate["Total Confirmed"]-acctodate["Total Recovered"]-acctodate
["Total Deceased"]
acctodate["Closed Cases"]=acctodate["Total Recovered"]+acctodate["Total Deceased"]

print("Average Mortality Rate",acctodate["Mortality Rate"].mean())
print("Average Recovery Rate",acctodate["Recovery Rate"].mean())

#Plotting Mortality and Recovery Rate 
fig = make_subplots(rows=2, cols=1,subplot_titles=("Recovery Rate", "Mortatlity Rate"))
fig.add_trace(go.Scatter(x=acctodate.index, y=acctodate["Recovery Rate"],name="Recovery Rate"), row=1, col=1)
fig.add_trace(go.Scatter(x=acctodate.index, y=acctodate["Mortality Rate"],name="Mortality Rate"),row=2, col=1)
fig.update_layout(height=1000,legend=dict(x=-0.1,y=1.2,traceorder="normal"))
fig.update_xaxes(title_text="Date", row=1, col=1)
fig.update_yaxes(title_text="Recovery Rate", row=1, col=1)
fig.update_xaxes(title_text="Date", row=1, col=2)
fig.update_yaxes(title_text="Mortality Rate", row=1, col=2)
fig.show()

Average Mortality Rate 1.7134288875187451
Average Recovery Rate 70.1719126128283


Build an ML model for predicting the number of total infections in India till 31st of July 2021.

Prediction Using linear regression

In [None]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.metrics import mean_squared_error,r2_score

In [None]:
acctodate["Days Since"]=acctodate.index-acctodate.index[0]
acctodate["Days Since"]=acctodate["Days Since"].dt.days

In [None]:
X_train=acctodate.iloc[:int(acctodate.shape[0]*0.95)]
y_train=acctodate.iloc[int(acctodate.shape[0]*0.95):]
model_scores=[]

In [None]:
lin_reg=LinearRegression(normalize=True)
lin_reg.fit(np.array(X_train["Days Since"]).reshape(-1,1),np.array(X_train["Total Confirmed"]).reshape(-1,1))

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

In [None]:
prediction_valid_linreg=lin_reg.predict(np.array(y_train["Days Since"]).reshape(-1,1))

In [None]:
from sklearn.metrics import mean_squared_error
model_scores.append(np.sqrt(mean_squared_error(y_train["Total Confirmed"],prediction_valid_linreg)))
print("Root Mean Square Error for Linear Regression: ",np.sqrt(mean_squared_error(y_train["Total Confirmed"],prediction_valid_linreg)))

Root Mean Square Error for Linear Regression:  11506955.673774183


In [None]:
plt.figure(figsize=(11,6))
prediction_linreg=lin_reg.predict(np.array(acctodate["Days Since"]).reshape(-1,1))
linreg_output=[]
for i in range(prediction_linreg.shape[0]):
    linreg_output.append(prediction_linreg[i][0])

fig=go.Figure()
fig.add_trace(go.Scatter(x=acctodate.index, y=acctodate["Total Confirmed"],mode='lines+markers',name="Train Data for Confirmed Cases"))
fig.add_trace(go.Scatter(x=acctodate.index, y=linreg_output, mode='lines',name="Linear Regression Best Fit Line", line=dict(color='black', dash='dot')))
fig.update_layout(title="Confirmed Cases Linear Regression Prediction", xaxis_title="Date",yaxis_title="Confirmed Cases",legend=dict(x=0,y=1,traceorder="normal"))
fig.show()

<Figure size 792x432 with 0 Axes>

Prediction using polynomial regression

In [None]:
X_train=acctodate.iloc[:int(acctodate.shape[0]*0.85)]
y_train=acctodate.iloc[int(acctodate.shape[0]*0.85):]
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree = 8) 
p_train=poly.fit_transform(np.array(X_train["Days Since"]).reshape(-1,1))
p_valid=poly.fit_transform(np.array(y_train["Days Since"]).reshape(-1,1))
y=X_train["Total Confirmed"]
linreg=LinearRegression(normalize=True)
linreg.fit(p_train,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

In [None]:
prediction_poly=linreg.predict(p_valid)
rmse_poly=np.sqrt(mean_squared_error(y_train["Total Confirmed"],prediction_poly))
model_scores.append(rmse_poly)
print("Root Mean Squared Error for Polynomial Regression: ",rmse_poly)

Root Mean Squared Error for Polynomial Regression:  21557445.92714709


In [None]:
comp_data=poly.fit_transform(np.array(acctodate["Days Since"]).reshape(-1,1))
plt.figure(figsize=(11,6))
predictions_poly=linreg.predict(comp_data)

fig=go.Figure()
fig.add_trace(go.Scatter(x=acctodate.index, y=acctodate["Total Confirmed"], mode='lines+markers',name="Train Data for Confirmed Cases"))
fig.add_trace(go.Scatter(x=acctodate.index, y=predictions_poly, mode='lines',name="Polynomial Regression Best Fit", line=dict(color='black', dash='dot')))
fig.update_layout(title="Confirmed Cases Polynomial Regression Prediction", xaxis_title="Date",yaxis_title="Confirmed Cases", legend=dict(x=0,y=1,traceorder="normal"))
fig.show()

<Figure size 792x432 with 0 Axes>

In [None]:
new_prediction_poly=[]
for i in range(1,60):
    new_date_poly=poly.fit_transform(np.array(acctodate["Days Since"].max()+i).reshape(-1,1))
    new_prediction_poly.append(linreg.predict(new_date_poly)[0])

Prediction using SVM

In [None]:
train_ml=acctodate.iloc[:int(acctodate.shape[0]*0.95)]
valid_ml=acctodate.iloc[int(acctodate.shape[0]*0.95):]

In [None]:
#Intializing SVR Model
from sklearn.svm import SVR
svm=SVR(C=1,degree=6,kernel='poly',epsilon=0.01)

In [None]:
#Fitting model on the training data
svm.fit(np.array(train_ml["Days Since"]).reshape(-1,1),np.array(train_ml["Total Confirmed"]).reshape(-1,1))

SVR(C=1, cache_size=200, coef0=0.0, degree=6, epsilon=0.01, gamma='scale',
    kernel='poly', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [None]:
prediction_valid_svm=svm.predict(np.array(valid_ml["Days Since"]).reshape(-1,1))
model_scores.append(np.sqrt(mean_squared_error(valid_ml["Total Confirmed"],prediction_valid_svm)))
print("Root Mean Square Error for Support Vectore Machine: ",np.sqrt(mean_squared_error(valid_ml["Total Confirmed"],prediction_valid_svm)))

Root Mean Square Error for Support Vectore Machine:  1256767.3670901721


In [None]:
plt.figure(figsize=(11,6))
prediction_svm=svm.predict(np.array(acctodate["Days Since"]).reshape(-1,1))
fig=go.Figure()
fig.add_trace(go.Scatter(x=acctodate.index, y=acctodate["Total Confirmed"],
                    mode='lines+markers',name="Train Data for Confirmed Cases"))
fig.add_trace(go.Scatter(x=acctodate.index, y=prediction_svm,
                    mode='lines',name="Support Vector Machine Best fit Kernel",
                    line=dict(color='black', dash='dot')))
fig.update_layout(title="Confirmed Cases Support Vectore Machine Regressor Prediction",
                 xaxis_title="Date",yaxis_title="Confirmed Cases",legend=dict(x=0,y=1,traceorder="normal"))
fig.show()

<Figure size 792x432 with 0 Axes>

In [None]:
from datetime import timedelta
new_date=[]
new_prediction_lr=[]
new_prediction_svm=[]
for i in range(1,60):
    new_date.append(acctodate.index[-1]+timedelta(days=i))
    new_prediction_lr.append(lin_reg.predict(np.array(acctodate["Days Since"].max()+i).reshape(-1,1))[0][0])
    new_prediction_svm.append(svm.predict(np.array(acctodate["Days Since"].max()+i).reshape(-1,1))[0])

In [None]:
pd.set_option('display.float_format', lambda x: '%.6f' % x)
model_predictions=pd.DataFrame(zip(new_date,new_prediction_lr,new_prediction_poly,new_prediction_svm),
                               columns=["Dates","Linear Regression Prediction","Polynonmial Regression Prediction","SVM Prediction"])
print(model_predictions)

        Dates  ...  SVM Prediction
0  2021-06-12  ... 32482003.679619
1  2021-06-13  ... 32851863.742817
2  2021-06-14  ... 33225440.957712
3  2021-06-15  ... 33602765.150788
4  2021-06-16  ... 33983866.327668
5  2021-06-17  ... 34368774.673826
6  2021-06-18  ... 34757520.555306
7  2021-06-19  ... 35150134.519444
8  2021-06-20  ... 35546647.295582
9  2021-06-21  ... 35947089.795793
10 2021-06-22  ... 36351493.115606
11 2021-06-23  ... 36759888.534724
12 2021-06-24  ... 37172307.517759
13 2021-06-25  ... 37588781.714949
14 2021-06-26  ... 38009342.962896
15 2021-06-27  ... 38434023.285288
16 2021-06-28  ... 38862854.893637
17 2021-06-29  ... 39295870.188008
18 2021-06-30  ... 39733101.757757
19 2021-07-01  ... 40174582.382261
20 2021-07-02  ... 40620345.031662
21 2021-07-03  ... 41070422.867601
22 2021-07-04  ... 41524849.243959
23 2021-07-05  ... 41983657.707600
24 2021-07-06  ... 42446881.999113
25 2021-07-07  ... 42914556.053557
26 2021-07-08  ... 43386714.001205
27 2021-07-09  ... 4

Holt's Linear Model(Time Series Forecasting)

In [None]:
model_train=acctodate.iloc[:int(acctodate.shape[0]*0.95)]
valid=acctodate.iloc[int(acctodate.shape[0]*0.95):]
y_pred=valid.copy()
from statsmodels.tsa.api import Holt,SimpleExpSmoothing,ExponentialSmoothing
holt=Holt(np.asarray(model_train["Total Confirmed"])).fit(smoothing_level=0.4, smoothing_slope=0.4,optimized=False)
y_pred["Holt"]=holt.forecast(len(valid))
model_scores.append(np.sqrt(mean_squared_error(y_pred["Total Confirmed"],y_pred["Holt"])))
print("Root Mean Square Error Holt's Linear Model: ",np.sqrt(mean_squared_error(y_pred["Total Confirmed"],y_pred["Holt"])))

Root Mean Square Error Holt's Linear Model:  1734033.5169334854


In [None]:
fig=go.Figure()
fig.add_trace(go.Scatter(x=model_train.index, y=model_train["Total Confirmed"],mode='lines+markers',name="Train Data for Confirmed Cases"))
fig.add_trace(go.Scatter(x=valid.index, y=valid["Total Confirmed"], mode='lines+markers',name="Validation Data for Confirmed Cases",))
fig.add_trace(go.Scatter(x=valid.index, y=y_pred["Holt"],mode='lines+markers',name="Prediction of Confirmed Cases",))
fig.update_layout(title="Confirmed Cases Holt's Linear Model Prediction",xaxis_title="Date",yaxis_title="Confirmed Cases",legend=dict(x=0,y=1,traceorder="normal"))
fig.show()

In [None]:
holt_new_date=[]
holt_new_prediction=[]
for i in range(1,60):
    holt_new_date.append(acctodate.index[-1]+timedelta(days=i))
    holt_new_prediction.append(holt.forecast((len(valid)+i))[-1])

model_predictions["Holt's Linear Model Prediction"]=holt_new_prediction
model_predictions

Unnamed: 0,Dates,Linear Regression Prediction,Polynonmial Regression Prediction,SVM Prediction,Holt's Linear Model Prediction
0,2021-06-12,16871822.972927,-19155260.604785,32482003.679619,33095084.960713
1,2021-06-13,16912747.679185,-20547122.698402,32851863.742817,33395124.685721
2,2021-06-14,16953672.385443,-21987956.010198,33225440.957712,33695164.41073
3,2021-06-15,16994597.091701,-23479078.32068,33602765.150788,33995204.135739
4,2021-06-16,17035521.797959,-25021834.089545,33983866.327668,34295243.860748
5,2021-06-17,17076446.504217,-26617594.84753,34368774.673826,34595283.585756
6,2021-06-18,17117371.210475,-28267759.5922,34757520.555306,34895323.310765
7,2021-06-19,17158295.916734,-29973755.187723,35150134.519444,35195363.035774
8,2021-06-20,17199220.622992,-31737036.768657,35546647.295582,35495402.760783
9,2021-06-21,17240145.32925,-33559088.147768,35947089.795793,35795442.485791


Build an ML model for predicting the number of deaths in India till 31st of July 2021.

Holt's Linear Model(Time Series Forecasting)

In [None]:
model_train=acctodate.iloc[:int(acctodate.shape[0]*0.95)]
valid=acctodate.iloc[int(acctodate.shape[0]*0.95):]
y_pred=valid.copy()
from statsmodels.tsa.api import Holt,SimpleExpSmoothing,ExponentialSmoothing
holt=Holt(np.asarray(model_train["Total Deceased"])).fit(smoothing_level=0.4, smoothing_slope=0.4,optimized=False)
y_pred["Holt"]=holt.forecast(len(valid))
model_scores.append(np.sqrt(mean_squared_error(y_pred["Total Deceased"],y_pred["Holt"])))
print("Root Mean Square Error Holt's Linear Model: ",np.sqrt(mean_squared_error(y_pred["Total Deceased"],y_pred["Holt"])))

Root Mean Square Error Holt's Linear Model:  7363.346136221813


In [None]:
fig=go.Figure()
fig.add_trace(go.Scatter(x=model_train.index, y=model_train["Total Deceased"],mode='lines+markers',name="Train Data for Deceased Cases"))
fig.add_trace(go.Scatter(x=valid.index, y=valid["Total Deceased"], mode='lines+markers',name="Validation Data for Deceased Cases",))
fig.add_trace(go.Scatter(x=valid.index, y=y_pred["Holt"],mode='lines+markers',name="Prediction of Deceased Cases",))
fig.update_layout(title="Deceased Cases Holt's Linear Model Prediction",xaxis_title="Date",yaxis_title="Deceased Cases",legend=dict(x=0,y=1,traceorder="normal"))
fig.show()

In [None]:
from datetime import timedelta
newdate=[]
holtnew_pred=[]
for i in range(1,55):
    newdate.append(acctodate.index[-1]+timedelta(days=i))
    holtnew_pred.append(holt.forecast((len(y_train)+i))[-1])


In [None]:
pd.set_option('display.float_format', lambda x: '%.6f' % x)
modelpredictions=pd.DataFrame(zip(newdate,holtnew_pred),columns=["Dates","holtnew_prediction"])
modelpredictions

Unnamed: 0,Dates,holtnew_prediction
0,2021-06-12,588777.241686
1,2021-06-13,592866.76288
2,2021-06-14,596956.284073
3,2021-06-15,601045.805267
4,2021-06-16,605135.326461
5,2021-06-17,609224.847655
6,2021-06-18,613314.368849
7,2021-06-19,617403.890042
8,2021-06-20,621493.411236
9,2021-06-21,625582.93243


Using information obtained from the above models and other data available on the link, predict the tentative date (or interval spanning a week) after which the infection spread (Active cases) is 10% of its peak value in India.

In [None]:
model_train=acctodate.iloc[:int(acctodate.shape[0]*0.95)]
valid=acctodate.iloc[int(acctodate.shape[0]*0.95):]
y_pred=valid.copy()
from statsmodels.tsa.api import Holt,SimpleExpSmoothing,ExponentialSmoothing
holt=Holt(np.asarray(model_train["Daily Confirmed"])).fit(smoothing_level=0.4, smoothing_slope=0.4,optimized=False)
y_pred["Holt"]=holt.forecast(len(valid))
model_scores.append(np.sqrt(mean_squared_error(y_pred["Daily Confirmed"],y_pred["Holt"])))
print("Root Mean Square Error Holt's Linear Model: ",np.sqrt(mean_squared_error(y_pred["Daily Confirmed"],y_pred["Holt"])))

Root Mean Square Error Holt's Linear Model:  138612.75753688233


In [None]:
fig=go.Figure()
fig.add_trace(go.Scatter(x=model_train.index, y=model_train["Daily Confirmed"],mode='lines+markers',name="Train Data for Deceased Cases"))
fig.add_trace(go.Scatter(x=valid.index, y=valid["Daily Confirmed"], mode='lines+markers',name="Validation Data for Deceased Cases",))
fig.add_trace(go.Scatter(x=valid.index, y=y_pred["Holt"],mode='lines+markers',name="Prediction of Daily Confirmed Cases",))
fig.update_layout(title="Daily Confirmed Cases Holt's Linear Model Prediction",xaxis_title="Date",yaxis_title="Daily Confirmed Cases",legend=dict(x=0,y=1,traceorder="normal"))
fig.show()

In [None]:
from datetime import timedelta
newdate=[]
holtnew_pred=[]
for i in range(1,55):
    newdate.append(acctodate.index[-1]+timedelta(days=i))
    holtnew_pred.append(holt.forecast((len(y_train)+i))[-1])


In [None]:
pd.set_option('display.float_format', lambda x: '%.6f' % x)
modelpredictions=pd.DataFrame(zip(newdate,holtnew_pred),columns=["Dates","holtnew_prediction"])
modelpredictions

Unnamed: 0,Dates,holtnew_prediction
0,2021-06-12,-1067455.225031
1,2021-06-13,-1085100.32116
2,2021-06-14,-1102745.41729
3,2021-06-15,-1120390.513419
4,2021-06-16,-1138035.609549
5,2021-06-17,-1155680.705678
6,2021-06-18,-1173325.801808
7,2021-06-19,-1190970.897938
8,2021-06-20,-1208615.994067
9,2021-06-21,-1226261.090197


In [None]:
peakvalue=tf['Daily Confirmed'].max()
print(peakvalue)

414280


In [None]:
#10% of highest peak
print(peakvalue*0.1)

41428.0


4. Build an ML model and predict infection rate for which data are available in the age group 35-50 till 31st of July 2021.

In [None]:
model_train=acctodate.iloc[:int(acctodate.shape[0]*0.95)]
valid=acctodate.iloc[int(acctodate.shape[0]*0.95):]
y_pred=valid.copy()
from statsmodels.tsa.api import Holt,SimpleExpSmoothing,ExponentialSmoothing
agegrp35to50=1393409038*0.198
holt=Holt(np.asarray(model_train["Total Confirmed"])*0.3628/agegrp35to50).fit(smoothing_level=0.4, smoothing_slope=0.4,optimized=False)
y_pred["Holt"]=holt.forecast(len(valid))
model_scores.append(np.sqrt(mean_squared_error(y_pred["Total Confirmed"]*0.3628/agegrp35to50,y_pred["Holt"])))
print("Root Mean Square Error Holt's Linear Model: ",np.sqrt(mean_squared_error(y_pred["Daily Confirmed"],y_pred["Holt"])))

Root Mean Square Error Holt's Linear Model:  176512.048004016


In [None]:
fig=go.Figure()
fig.add_trace(go.Scatter(x=model_train.index, y=model_train["Total Confirmed"]*0.3628/agegrp35to50,mode='lines+markers',name="Train Data for Deceased Cases"))
fig.add_trace(go.Scatter(x=valid.index, y=valid["Total Confirmed"]*0.3628/agegrp35to50, mode='lines+markers',name="Validation Data for Deceased Cases",))
fig.add_trace(go.Scatter(x=valid.index, y=y_pred["Holt"],mode='lines+markers',name="Prediction of infection rate in age group 35-50",))
fig.update_layout(title="Infection rate in age group 35-50 Holt's Linear Model Prediction",xaxis_title="Date",yaxis_title="Total Confirmed Cases",legend=dict(x=0,y=1,traceorder="normal"))
fig.show()

In [None]:
from datetime import timedelta
newdate=[]
holtnew_pred=[]
for i in range(1,55):
    newdate.append(acctodate.index[-1]+timedelta(days=i))
    holtnew_pred.append(holt.forecast((len(y_train)+i))[-1])


In [None]:
pd.set_option('display.float_format', lambda x: '%.6f' % x)
modelpredictions=pd.DataFrame(zip(newdate,holtnew_pred),columns=["Dates","holtnew_prediction"])
modelpredictions

Unnamed: 0,Dates,holtnew_prediction
0,2021-06-12,0.063247
1,2021-06-13,0.063642
2,2021-06-14,0.064036
3,2021-06-15,0.064431
4,2021-06-16,0.064826
5,2021-06-17,0.06522
6,2021-06-18,0.065615
7,2021-06-19,0.066009
8,2021-06-20,0.066404
9,2021-06-21,0.066798
