In [209]:
import pandas as pd
import numpy as np
import itertools
import os
import warnings  
warnings.filterwarnings('ignore')

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
pio.renderers.default = "notebook"


from sklearn.linear_model import LinearRegression  
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score, recall_score, accuracy_score, r2_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import MinMaxScaler





In [129]:
# Local paths under ../data for the two tabular sources. They are not read in
# this cell (the data is fetched from the URLs below); kept because later cells
# may refer to them.
stat_filename = '../data/coronavirus-disease-covid-19-statistics-and-research.csv'

measures_filename = '../data/acaps_covid19_government_measures.xlsx'

# Columns retained from the OWID statistics file, in the order they should
# appear in df_stats.
cols = [
    'iso_code', 'continent', 'location', 'date',
    'total_cases', 'new_cases', 'total_deaths',
    'new_deaths', 'total_cases_per_million',
    'new_cases_per_million', 'total_deaths_per_million',
    'new_deaths_per_million', 'new_tests',
    'total_tests', 'total_tests_per_thousand',
    'new_tests_per_thousand', 'new_tests_smoothed',
    'new_tests_smoothed_per_thousand',
    'tests_units', 'stringency_index',
    'population', 'population_density',
    'median_age', 'aged_65_older', 'aged_70_older',
    'gdp_per_capita', 'extreme_poverty',
    'cardiovasc_death_rate', 'diabetes_prevalence',
    'female_smokers', 'male_smokers',
    'handwashing_facilities',
    'hospital_beds_per_thousand', 'life_expectancy',
]

# Country-level statistics; indexing with `cols` both filters and orders the
# columns (and raises KeyError if the upstream file drops one of them).
df_stats = pd.read_csv('https://covid.ourworldindata.org/data/owid-covid-data.csv', delimiter=',')
df_stats = df_stats[cols]

# Government measures, read from the 'Dataset' sheet of the ACAPS workbook.
df_measures = pd.read_excel('https://www.acaps.org/sites/acaps/files/resources/files/acaps_covid19_government_measures_dataset_0.xlsx', header=0, sheet_name='Dataset')

# The three JHU CSSE global time series share one directory; only the file name differs.
jhu_time_series_url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/'
df_confirmed = pd.read_csv(jhu_time_series_url + 'time_series_covid19_confirmed_global.csv')
df_deaths = pd.read_csv(jhu_time_series_url + 'time_series_covid19_deaths_global.csv')
df_recoveries = pd.read_csv(jhu_time_series_url + 'time_series_covid19_recovered_global.csv')

print("Shape for Statistics DataFrame:", df_stats.shape)
print("Shape for Measures DataFrame:", df_measures.shape)

# These previously printed confirmed_df / deaths_df / recoveries_df, names that
# are never assigned in this notebook, so the cell raised NameError on a fresh
# kernel. Report the frames that were actually loaded above.
print("Shape for Confirmed DataFrame:", df_confirmed.shape)
print("Shape for Deaths DataFrame:", df_deaths.shape)
print("Shape for Recovories DataFrame:", df_recoveries.shape)


Shape for Statistics DataFrame: (166098, 34)
Shape for Measures DataFrame: (23923, 18)
Shape for Confirmed DataFrame: (284, 777)
Shape for Deaths DataFrame: (284, 777)
Shape for Recovories DataFrame: (269, 777)


In [130]:
# Peek at 10 randomly chosen rows of the confirmed-cases table.
# No random_state is passed, so a different set of rows is shown on every run.
df_confirmed.sample(10)

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,2/23/22,2/24/22,2/25/22,2/26/22,2/27/22,2/28/22,3/1/22,3/2/22,3/3/22,3/4/22
264,Channel Islands,United Kingdom,49.3723,-2.3644,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
232,,Slovakia,48.669,19.699,0,0,0,0,0,0,...,2052260,2071372,2083963,2102420,2112272,2119358,2134795,2150666,2164751,2178234
89,Unknown,China,,,0,0,0,0,0,0,...,0,0,0,0,58,0,0,0,0,0
238,,Spain,40.463667,-3.74922,0,0,0,0,0,0,...,10914105,10949997,10977524,10977524,10977524,10977524,11036085,11054888,11078028,11100428
166,,Latvia,56.8796,24.6032,0,0,0,0,0,0,...,612920,624008,633960,644077,649504,653120,662644,673218,682118,689794
70,Henan,China,37.8957,114.9042,5,5,9,32,83,128,...,2664,2664,2664,2664,2664,2666,2667,2672,2677,2680
209,,Panama,8.538,-80.7821,0,0,0,0,0,0,...,752907,753694,754362,754905,755262,755498,755853,756085,756539,757061
28,,Bolivia,-16.2902,-63.5887,0,0,0,0,0,0,...,891851,891851,892537,893048,893223,893395,893512,893775,894200,894200
33,,Bulgaria,42.7339,25.4858,0,0,0,0,0,0,...,1080571,1083425,1086328,1087796,1088520,1091279,1093920,1096194,1097298,1099423
120,French Guiana,France,3.9339,-53.1258,0,0,0,0,0,0,...,77575,77607,77625,77625,77625,77652,77652,77652,77733,77765


In [190]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of the confirmed-cases table: Lat, Long and every date column.
df_confirmed.describe()

Unnamed: 0,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,...,2/23/22,2/24/22,2/25/22,2/26/22,2/27/22,2/28/22,3/1/22,3/2/22,3/3/22,3/4/22
count,282.0,282.0,284.0,284.0,284.0,284.0,284.0,284.0,284.0,284.0,...,284.0,284.0,284.0,284.0,284.0,284.0,284.0,284.0,284.0,284.0
mean,20.106368,21.958718,1.961268,2.31338,3.323944,5.059859,7.464789,10.31338,19.647887,21.721831,...,1513551.0,1519732.0,1525337.0,1530016.0,1533766.0,1538708.0,1544120.0,1549935.0,1556581.0,1562567.0
std,25.841453,75.893366,26.402603,26.499491,32.992032,45.919425,64.175242,86.46777,212.163754,213.470672,...,6095036.0,6105564.0,6115582.0,6122978.0,6127430.0,6136422.0,6143999.0,6152129.0,6162227.0,6171151.0
min,-71.9499,-178.1165,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.643279,-22.03655,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8011.0,8012.5,8015.75,8020.25,8195.75,8240.0,8241.5,8241.5,8243.0,8248.75
50%,21.607878,20.921188,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,66346.5,66965.0,68733.0,68737.0,68742.0,68758.5,68779.0,70036.5,70060.0,70076.0
75%,40.950592,84.992575,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,744624.8,745329.2,745994.2,746532.8,747096.0,747644.8,750341.2,758979.5,763860.5,764927.0
max,71.7069,178.065,444.0,444.0,549.0,761.0,1058.0,1423.0,3554.0,3554.0,...,78738940.0,78806290.0,78881160.0,78929800.0,78947870.0,79044330.0,79091360.0,79143720.0,79196010.0,79250510.0


In [131]:
# Peek at 10 randomly chosen rows of the deaths table.
# No random_state is passed, so a different set of rows is shown on every run.
df_deaths.sample(10)

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,2/23/22,2/24/22,2/25/22,2/26/22,2/27/22,2/28/22,3/1/22,3/2/22,3/3/22,3/4/22
279,,West Bank and Gaza,31.9522,35.2332,0,0,0,0,0,0,...,5439,5458,5458,5458,5458,5501,5513,5532,5542,5557
265,Falkland Islands (Malvinas),United Kingdom,-51.7963,-59.5236,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
226,,Saudi Arabia,23.885942,45.079162,0,0,0,0,0,0,...,8990,8991,8993,8994,8996,8998,9001,9002,9004,9005
149,,Indonesia,-0.7893,113.9213,0,0,0,0,0,0,...,147025,147342,147586,147844,148073,148335,148660,149036,149268,149596
9,Australian Capital Territory,Australia,-35.4735,149.0124,0,0,0,0,0,0,...,33,33,33,33,34,34,34,34,34,34
54,Saskatchewan,Canada,52.9399,-106.4509,0,0,0,0,0,0,...,1054,1054,1091,1091,1091,1091,1091,1091,1091,1135
276,,Vanuatu,-15.3767,166.9592,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
84,Shanghai,China,31.202,121.4491,0,0,0,0,1,1,...,7,7,7,7,7,7,7,7,7,7
45,New Brunswick,Canada,46.5653,-66.4619,0,0,0,0,0,0,...,300,300,301,301,301,304,306,306,308,309
202,,Niger,17.607789,8.081666,0,0,0,0,0,0,...,306,306,306,306,307,307,307,307,307,307


In [132]:
# Peek at 10 randomly chosen rows of the recoveries table.
# No random_state is passed, so a different set of rows is shown on every run.
df_recoveries.sample(10)

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,2/23/22,2/24/22,2/25/22,2/26/22,2/27/22,2/28/22,3/1/22,3/2/22,3/3/22,3/4/22
125,,Guinea,9.9456,-9.6966,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
152,,Lebanon,33.8547,35.8623,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27,,Bhutan,27.5142,90.4336,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
115,Wallis and Futuna,France,-14.2938,-178.1165,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,,Argentina,-38.4161,-63.6167,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22,,Barbados,13.1939,-59.5432,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
100,,Estonia,58.5953,25.0136,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
187,,Niger,17.607789,8.081666,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
77,Zhejiang,China,29.1832,120.0934,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
264,,West Bank and Gaza,31.9522,35.2332,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [186]:
def _sum_by_country(frame):
    """Collapse province-level rows into one row per 'Country/Region'.

    numeric_only=True is passed explicitly so the text column 'Province/State'
    is excluded regardless of the pandas version's default. The result layout
    is therefore always ['Country/Region', 'Lat', 'Long', <date columns...>].

    Note: 'Lat' and 'Long' are summed along with the counts (as in the
    original cell), so for countries split into several provinces they are
    not usable as coordinates.
    """
    return frame.groupby(by='Country/Region', as_index=False).sum(numeric_only=True)


def _dates_by_country(grouped, date_columns):
    """Return only the date columns of a grouped frame, indexed by country name."""
    return grouped.set_index('Country/Region')[date_columns]


df_confirmed_group = _sum_by_country(df_confirmed)
df_deaths_group = _sum_by_country(df_deaths)
df_recoveries_group = _sum_by_country(df_recoveries)

# Pick the date columns by name instead of by position (previously iloc[:, 3:]),
# so an extra or missing leading column cannot leak into the arithmetic.
country_id_cols = ['Country/Region', 'Lat', 'Long']
country_date_cols = [col for col in df_confirmed_group.columns if col not in country_id_cols]

# Active = confirmed - (recovered + deaths), aligned on the country label.
# The raw frames have different row counts, so aligning on the positional
# integer index (as before) would silently mix up countries if the grouped
# frames ever differ; with label alignment a country missing from one source
# shows up as NaN instead.
active_by_country = _dates_by_country(df_confirmed_group, country_date_cols) - (
    _dates_by_country(df_recoveries_group, country_date_cols)
    + _dates_by_country(df_deaths_group, country_date_cols)
)

# Same output layout as before: Country/Region, Lat, Long (taken from the
# deaths frame), followed by one active-cases column per date.
country_coords = df_deaths_group.set_index('Country/Region')[['Lat', 'Long']]
df_active_group = (
    pd.concat([country_coords, active_by_country], axis=1)
    .rename_axis('Country/Region')
    .reset_index()
)

In [202]:
# Worldwide totals per day: one row per date column of the JHU tables.
# The first four columns of df_confirmed are identifiers
# (Province/State, Country/Region, Lat, Long); everything after is a date.
date_labels = df_confirmed.columns[4:]

stats = pd.DataFrame({
    'Date': date_labels,
    # Column-wise sums over all rows give the global figure for each date.
    'Confirmed': df_confirmed[date_labels].sum().to_numpy(),
    'Deaths': df_deaths[date_labels].sum().to_numpy(),
    'Recovered': df_recoveries[date_labels].sum().to_numpy(),
})

# Active cases are whatever is confirmed but neither recovered nor dead.
stats['Active'] = stats['Confirmed'] - (stats['Recovered'] + stats['Deaths'])

# Keep the original column order: Date, Confirmed, Deaths, Recovered, Active.
stats = stats[['Date', 'Confirmed', 'Deaths', 'Recovered', 'Active']]
stats.head()

Unnamed: 0,Date,Confirmed,Deaths,Recovered,Active
0,1/22/20,557,17,30,510
1,1/23/20,657,18,32,607
2,1/24/20,944,26,39,879
3,1/25/20,1437,42,42,1353
4,1/26/20,2120,56,56,2008


In [215]:
# Treemap of the most recent worldwide totals.
#
# The series in `stats` are cumulative (each row is the running total up to
# that date), so the latest figures are the LAST row. Summing the columns, as
# this cell did before, adds up every daily running total and produces a
# number with no meaning.
#
# NOTE(review): the recoveries sample displayed earlier in this notebook is all
# zeros for the most recent dates; if that holds for every row, 'Recovered'
# will be 0 here and 'Active' will equal Confirmed - Deaths. Confirm whether
# that is acceptable.
latest_totals = stats.iloc[-1]
treemap_labels = ['Confirmed', 'Active', 'Recovered', 'Deaths']

latest_stats_fig = go.Figure()
latest_stats_fig.add_trace(go.Treemap(
    labels=treemap_labels,
    parents=['', 'Confirmed', 'Confirmed', 'Confirmed'],
    values=[int(latest_totals[label]) for label in treemap_labels],
    # "total": the parent's value already includes its children
    # (Active + Recovered + Deaths == Confirmed by construction).
    branchvalues="total",
    # Colours and outline live in one marker dict instead of being split
    # between marker_colors= and marker=.
    marker={
        "colors": ['#073b4c', '#ef476f', '#06d6a0', '#118ab2'],
        "line": {"width": 2},
    },
    textinfo="label+value",
    outsidetextfont={"size": 30, "color": "darkblue"},
    pathbar={"visible": True},
))
latest_stats_fig.update_layout(height=300)
latest_stats_fig.show()