In [75]:
import os
import sys
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

In [76]:
daily_data = pd.read_csv('data/daily.csv')
govt_measures = pd.read_excel('data/acaps_covid19_government_measures_dataset.xlsx', sheet_name='Database')

In [77]:
# Convert date to datetime object 
daily_data['date'] = pd.to_datetime(daily_data['date'], format='%Y%m%d')

In [78]:
daily_data.head()

Unnamed: 0,date,states,positive,negative,pending,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,inIcuCumulative,onVentilatorCurrently,...,lastModified,total,totalTestResults,posNeg,deathIncrease,hospitalizedIncrease,negativeIncrease,positiveIncrease,totalTestResultsIncrease,hash
0,2020-07-24,56,4092928,45745166,3409.0,59670.0,289355.0,10288.0,13182.0,2712.0,...,2020-07-24T00:00:00Z,49841503,49838094,49838094,1178,3026,854645,75193,929838,12df42387588e4187c573031bd050ea724720a82
1,2020-07-23,56,4017735,44890521,3179.0,59885.0,286329.0,10466.0,12933.0,2472.0,...,2020-07-23T00:00:00Z,48911435,48908256,48908256,1039,2500,753372,71027,824399,c7d29139144a768dd70fb305be3246f8c5b25a42
2,2020-07-22,56,3946708,44137149,3051.0,59675.0,283829.0,10457.0,12790.0,2440.0,...,2020-07-22T00:00:00Z,48086908,48083857,48083857,1117,2248,726352,69150,795502,c28c2ce1af7dcb4a6dfc783c222acb7387b19277
3,2020-07-21,56,3877558,43410797,3197.0,59362.0,281581.0,6711.0,12629.0,2414.0,...,2020-07-21T00:00:00Z,47291552,47288355,47288355,1038,2551,686606,63642,750248,b315275d3d2251da5af96422594a53d6d0508aad
4,2020-07-20,56,3813916,42724191,3003.0,58371.0,279030.0,6558.0,12475.0,2403.0,...,2020-07-20T00:00:00Z,46541110,46538107,46538107,362,1647,654044,56971,711015,6223e4e1a7003f68d8bd1f30742ea4d8e283b834


In [79]:
govt_measures.head()

Unnamed: 0,ID,COUNTRY,ISO,ADMIN_LEVEL_NAME,PCODE,REGION,LOG_TYPE,CATEGORY,MEASURE,TARGETED_POP_GROUP,COMMENTS,NON_COMPLIANCE,DATE_IMPLEMENTED,SOURCE,SOURCE_TYPE,LINK,ENTRY_DATE,Alternative source
0,1,Afghanistan,AFG,,,Asia,Introduction / extension of measures,Public health measures,Health screenings in airports and border cross...,No,,,2020-02-12,Ministry of Health,Government,https://moph.gov.af/en/moph-held-emergency-mee...,2020-03-14,
1,2,Afghanistan,AFG,Kabul,,Asia,Introduction / extension of measures,Public health measures,Isolation and quarantine policies,No,,,2020-02-12,Ministry of Health,Government,https://moph.gov.af/en/moph-held-emergency-mee...,2020-03-14,
2,3,Afghanistan,AFG,,,Asia,Introduction / extension of measures,Public health measures,Awareness campaigns,No,,,2020-02-12,Ministry of Health,Government,https://moph.gov.af/en/moph-held-emergency-mee...,2020-03-14,
3,4,Afghanistan,AFG,,,Asia,Introduction / extension of measures,Governance and socio-economic measures,Emergency administrative structures activated ...,No,,,2020-02-12,Ministry of Health,Government,https://moph.gov.af/en/moph-held-emergency-mee...,2020-03-14,
4,5,Afghanistan,AFG,,,Asia,Introduction / extension of measures,Social distancing,Limit public gatherings,No,Nevruz festival cancelled,,2020-03-12,AA,Media,https://www.aa.com.tr/en/asia-pacific/coronavi...,2020-03-14,


In [80]:
# Filter only for United States
govt_measures = govt_measures[govt_measures['COUNTRY'] == 'United States of America']

In [81]:
daily_data.merge(govt_measures, left_on='date', right_on='DATE_IMPLEMENTED', how='left')

Unnamed: 0,date,states,positive,negative,pending,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,inIcuCumulative,onVentilatorCurrently,...,MEASURE,TARGETED_POP_GROUP,COMMENTS,NON_COMPLIANCE,DATE_IMPLEMENTED,SOURCE,SOURCE_TYPE,LINK,ENTRY_DATE,Alternative source
0,2020-07-24,56,4092928,45745166,3409.0,59670.0,289355.0,10288.0,13182.0,2712.0,...,,,,,NaT,,,,NaT,
1,2020-07-23,56,4017735,44890521,3179.0,59885.0,286329.0,10466.0,12933.0,2472.0,...,,,,,NaT,,,,NaT,
2,2020-07-22,56,3946708,44137149,3051.0,59675.0,283829.0,10457.0,12790.0,2440.0,...,,,,,NaT,,,,NaT,
3,2020-07-21,56,3877558,43410797,3197.0,59362.0,281581.0,6711.0,12629.0,2414.0,...,,,,,NaT,,,,NaT,
4,2020-07-20,56,3813916,42724191,3003.0,58371.0,279030.0,6558.0,12475.0,2403.0,...,,,,,NaT,,,,NaT,
5,2020-07-19,56,3756945,42070147,3052.0,57885.0,277383.0,6384.0,12393.0,2362.0,...,,,,,NaT,,,,NaT,
6,2020-07-18,56,3692061,41336523,3032.0,57645.0,276439.0,6396.0,12342.0,2343.0,...,,,,,NaT,,,,NaT,
7,2020-07-17,56,3626881,40640071,3002.0,57761.0,274436.0,6453.0,12243.0,2353.0,...,,,,,NaT,,,,NaT,
8,2020-07-16,56,3549648,39879912,2929.0,57442.0,271758.0,6359.0,12091.0,2317.0,...,,,,,NaT,,,,NaT,
9,2020-07-15,56,3478695,39123301,2947.0,56144.0,269543.0,6317.0,12002.0,2317.0,...,,,,,NaT,,,,NaT,


In [82]:
daily_data.replace(to_replace = np.nan, value = 0, inplace=True)

In [83]:
daily_data.drop(columns=['dateChecked','lastModified','hash'], inplace=True)

In [84]:
#daily_data['date'] = pd.to_datetime(daily_data.date , format = '%d/%m/%Y %H.%M.%S')
data = daily_data.drop(['date'], axis=1)
data.index = daily_data.date

In [85]:
# x=data.drop(columns=['dateChecked','lastModified','hash'],axis=1)
# x.dropna(inplace=True)
x = data.copy()
cols = x.columns
x = x.loc[:, (x != x.iloc[0]).any()]

In [86]:
train = x[:int(0.8*(len(x)))]
valid = x[int(0.8*(len(x))):]

In [87]:
x.dtypes

states                        int64
positive                      int64
negative                      int64
pending                     float64
hospitalizedCurrently       float64
hospitalizedCumulative      float64
inIcuCurrently              float64
inIcuCumulative             float64
onVentilatorCurrently       float64
onVentilatorCumulative      float64
recovered                   float64
death                       float64
hospitalized                float64
total                         int64
totalTestResults              int64
posNeg                        int64
deathIncrease                 int64
hospitalizedIncrease          int64
negativeIncrease              int64
positiveIncrease              int64
totalTestResultsIncrease      int64
dtype: object

In [88]:
from statsmodels.tsa.vector_ar.var_model import VAR

model = VAR(endog=train)
model_fit = model.fit()



In [89]:
prediction = model_fit.forecast(model_fit.y, steps=len(valid))

In [92]:
pred = pd.DataFrame(index=range(0,len(prediction)),columns=[cols])
for j in range(0,13):
    for i in range(0, len(prediction)):
       pred.iloc[i][j] = prediction[i][j]

#check rmse
#for i in cols:
print('rmse value for Positive cases is : ', math.sqrt(mean_squared_error(pred['positive'], valid['positive'])))

rmse value for Positive cases is :  20319.394612544194


In [None]:
model = VAR(endog=x)
model_fit = model.fit()
yhat = model_fit.forecast(model_fit.y, steps=30)
print(yhat)

In [None]:
forecast = []
for i in yhat:
    forecast.append(i[0])

In [None]:
pd.Series(forecast).plot()