In [1]:
!pip install PyGithub



In [1]:
from github import Github
import os
import pickle
import base64
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import seaborn as sns
import plotly.express as px
from ipywidgets import interact
import datetime as dt
sns.set()

In [2]:
with open("github.txt") as myfile:
    firstNlines=myfile.readlines()[0:2]
myfile.close()
g = Github(firstNlines[0].strip(), firstNlines[1])

In [3]:
repo=g.get_repo('CSSEGISandData/COVID-19')
contents = repo.get_contents("")

In [4]:
def get_sha_for_tag(repository, tag):
    """
    Returns a commit PyGithub object for the specified repository and tag.
    """
    branches = repository.get_branches()
    matched_branches = [match for match in branches if match.name == tag]
    if matched_branches:
        return matched_branches[0].commit.sha

    tags = repository.get_tags()
    matched_tags = [match for match in tags if match.name == tag]
    if not matched_tags:
        raise ValueError('No Tag or Branch exists with that name')
    return matched_tags[0].commit.sha

In [5]:
def download_directory(repository, sha, server_path, local_path='data_csse/'):
    """
    Download all contents at server_path with commit tag sha in
    the repository.
    """
    contents = repository.get_contents(server_path, ref=sha)
    if not os.path.exists(local_path):
        os.makedirs(local_path)
    for content in contents:
        #print("Processing %s" % content.path)
        if content.type == 'dir':
            download_directory(repository, sha, content.path)
        else:
            try:
                path = content.path
                file_content = repository.get_contents(path, ref=sha)
                file_data = base64.b64decode(file_content.content).decode('ascii')
                file_out = open(local_path+content.name, "w")
                file_out.write(local_path+file_data)
                file_out.close()
            except:
                pass

In [6]:
sha = get_sha_for_tag(repo, 'master')

In [7]:
download_directory(repo, sha, 'csse_covid_19_data/csse_covid_19_daily_reports')

In [8]:
df = pd.read_csv('data_csse/02-01-2020.csv')

In [9]:
from os import listdir
from os.path import isfile, join
mypath = 'data_csse/'
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]

In [10]:
def is_non_zero_file(fpath):
    return os.path.isfile(fpath) and os.path.getsize(fpath) > 0

In [11]:
nonzero_files = []
for file in onlyfiles:
    if is_non_zero_file(mypath + file) and file[-3:]=='csv':
        nonzero_files.append(mypath + file)

In [12]:
nonzero_files

['data_csse/02-01-2020.csv',
 'data_csse/02-02-2020.csv',
 'data_csse/02-03-2020.csv',
 'data_csse/02-04-2020.csv',
 'data_csse/02-05-2020.csv',
 'data_csse/02-06-2020.csv',
 'data_csse/02-07-2020.csv',
 'data_csse/02-08-2020.csv',
 'data_csse/02-09-2020.csv',
 'data_csse/02-10-2020.csv',
 'data_csse/02-11-2020.csv',
 'data_csse/02-12-2020.csv',
 'data_csse/02-13-2020.csv',
 'data_csse/02-14-2020.csv',
 'data_csse/02-15-2020.csv',
 'data_csse/02-16-2020.csv',
 'data_csse/02-17-2020.csv',
 'data_csse/02-18-2020.csv',
 'data_csse/02-19-2020.csv',
 'data_csse/02-20-2020.csv',
 'data_csse/02-21-2020.csv',
 'data_csse/02-22-2020.csv',
 'data_csse/02-23-2020.csv',
 'data_csse/02-24-2020.csv',
 'data_csse/02-25-2020.csv',
 'data_csse/02-26-2020.csv',
 'data_csse/02-27-2020.csv',
 'data_csse/02-28-2020.csv',
 'data_csse/02-29-2020.csv',
 'data_csse/03-01-2020.csv',
 'data_csse/03-02-2020.csv',
 'data_csse/03-03-2020.csv',
 'data_csse/03-04-2020.csv',
 'data_csse/03-05-2020.csv',
 'data_csse/03

In [17]:
df_list = []
for file in nonzero_files:
    df_1 = pd.read_csv(file)
    df_1['filename'] = file[-15:]
    df_list.append(df_1)  

In [18]:
df = pd.concat(df_list, ignore_index=True)


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





In [19]:
df['Country/Region'] = df['Country/Region'].str.replace('Mainland China', 'China')

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5167 entries, 0 to 5166
Data columns (total 9 columns):
Confirmed         5167 non-null int64
Country/Region    5167 non-null object
Deaths            5167 non-null int64
Last Update       5167 non-null object
Latitude          2842 non-null float64
Longitude         2842 non-null float64
Province/State    3072 non-null object
Recovered         5167 non-null int64
filename          5167 non-null object
dtypes: float64(2), int64(3), object(4)
memory usage: 363.4+ KB


In [21]:
df['Date'] = pd.to_datetime(df['Last Update'])

In [22]:
df['Diamond Princess'] = df['Province/State'].str.contains('Diamond Princess')

In [23]:
df.loc[df['Province/State']=='From Diamond Princess']

Unnamed: 0,Confirmed,Country/Region,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered,filename,Date,Diamond Princess
1499,4,Australia,0,2020-02-21T23:03:13,,,From Diamond Princess,0,\02-21-2020.csv,2020-02-21 23:03:13,True
1518,1,Israel,0,2020-02-21T15:33:03,,,From Diamond Princess,0,\02-21-2020.csv,2020-02-21 15:33:03,True
1580,7,Australia,0,2020-02-22T17:03:05,,,From Diamond Princess,0,\02-22-2020.csv,2020-02-22 17:03:05,True
1602,1,Israel,0,2020-02-22T20:53:02,,,From Diamond Princess,0,\02-22-2020.csv,2020-02-22 20:53:02,True
1664,7,Australia,0,2020-02-22T17:03:05,,,From Diamond Princess,0,\02-23-2020.csv,2020-02-22 17:03:05,True
1686,1,Israel,0,2020-02-22T20:53:02,,,From Diamond Princess,0,\02-23-2020.csv,2020-02-22 20:53:02,True
1749,7,Australia,0,2020-02-22T17:03:05,,,From Diamond Princess,0,\02-24-2020.csv,2020-02-22 17:03:05,True
1773,1,Israel,0,2020-02-22T20:53:02,,,From Diamond Princess,0,\02-24-2020.csv,2020-02-22 20:53:02,True
1841,7,Australia,0,2020-02-22T17:03:05,,,From Diamond Princess,0,\02-25-2020.csv,2020-02-22 17:03:05,True
1867,1,Israel,0,2020-02-22T20:53:02,,,From Diamond Princess,0,\02-25-2020.csv,2020-02-22 20:53:02,True


In [24]:
df.drop(columns=['filename'], inplace=True)

In [25]:
df = df.drop_duplicates()

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3120 entries, 0 to 5163
Data columns (total 10 columns):
Confirmed           3120 non-null int64
Country/Region      3120 non-null object
Deaths              3120 non-null int64
Last Update         3120 non-null object
Latitude            1817 non-null float64
Longitude           1817 non-null float64
Province/State      1935 non-null object
Recovered           3120 non-null int64
Date                3120 non-null datetime64[ns]
Diamond Princess    1935 non-null object
dtypes: datetime64[ns](1), float64(2), int64(3), object(4)
memory usage: 268.1+ KB


In [27]:
df['Province/State'] = df['Province/State'].str.replace(' \(From Diamond Princess\)', '').replace('None', 'From Diamond Princess')

In [28]:
df.head()

Unnamed: 0,Confirmed,Country/Region,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered,Date,Diamond Princess
0,7153,China,249,2/1/2020 11:53,,,Hubei,168,2020-02-01 11:53:00,False
1,599,China,0,2/1/2020 10:53,,,Zhejiang,21,2020-02-01 10:53:00,False
2,535,China,0,2/1/2020 14:23,,,Guangdong,14,2020-02-01 14:23:00,False
3,422,China,2,2/1/2020 1:52,,,Henan,3,2020-02-01 01:52:00,False
4,389,China,0,2/1/2020 11:03,,,Hunan,8,2020-02-01 11:03:00,False


In [29]:
df['DateTime'] = df['Date']

In [30]:
df['Date'] = df['DateTime'].apply(lambda x: x.date())

In [31]:
df['Date'] = df['Date'].apply(lambda x: pd.to_datetime(x))

In [32]:
df['Date'] = df['Date'].apply(lambda x: str(x))

In [33]:
# faulty data, 1/31 has only 38 confirmed cases in China. We'll start data using 2/1.
jan_31 = df[df['Date']=='2020-01-31 00:00:00'].index
df.drop(jan_31, inplace=True)

In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3098 entries, 0 to 5163
Data columns (total 11 columns):
Confirmed           3098 non-null int64
Country/Region      3098 non-null object
Deaths              3098 non-null int64
Last Update         3098 non-null object
Latitude            1817 non-null float64
Longitude           1817 non-null float64
Province/State      1931 non-null object
Recovered           3098 non-null int64
Date                3098 non-null object
Diamond Princess    1931 non-null object
DateTime            3098 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(2), int64(3), object(5)
memory usage: 290.4+ KB


In [35]:
df.head()

Unnamed: 0,Confirmed,Country/Region,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered,Date,Diamond Princess,DateTime
0,7153,China,249,2/1/2020 11:53,,,Hubei,168,2020-02-01 00:00:00,False,2020-02-01 11:53:00
1,599,China,0,2/1/2020 10:53,,,Zhejiang,21,2020-02-01 00:00:00,False,2020-02-01 10:53:00
2,535,China,0,2/1/2020 14:23,,,Guangdong,14,2020-02-01 00:00:00,False,2020-02-01 14:23:00
3,422,China,2,2/1/2020 1:52,,,Henan,3,2020-02-01 00:00:00,False,2020-02-01 01:52:00
4,389,China,0,2/1/2020 11:03,,,Hunan,8,2020-02-01 00:00:00,False,2020-02-01 11:03:00


In [36]:
df['Active'] = df['Confirmed']-df['Deaths']-df['Recovered']

In [37]:
df.loc[df['Country/Region']=='South Africa']

Unnamed: 0,Confirmed,Country/Region,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered,Date,Diamond Princess,DateTime,Active
3048,1,South Africa,0,2020-03-05T13:53:03,-30.5595,22.9375,,0,2020-03-05 00:00:00,,2020-03-05 13:53:03,1
3236,1,South Africa,0,2020-03-06T15:43:02,-30.5595,22.9375,,0,2020-03-06 00:00:00,,2020-03-06 15:43:02,1
3641,3,South Africa,0,2020-03-08T10:23:04,-30.5595,22.9375,,0,2020-03-08 00:00:00,,2020-03-08 10:23:04,3
4140,7,South Africa,0,2020-03-10T05:13:07,-30.5595,22.9375,,0,2020-03-10 00:00:00,,2020-03-10 05:13:07,7
4333,13,South Africa,0,2020-03-11T10:33:03,-30.5595,22.9375,,0,2020-03-11 00:00:00,,2020-03-11 10:33:03,13
4552,17,South Africa,0,2020-03-12T09:13:24,-30.5595,22.9375,,0,2020-03-12 00:00:00,,2020-03-12 09:13:24,17
4764,38,South Africa,0,2020-03-14T16:14:56,-30.5595,22.9375,,0,2020-03-14 00:00:00,,2020-03-14 16:14:56,38
5005,51,South Africa,0,2020-03-15T18:20:18,-30.5595,22.9375,,0,2020-03-15 00:00:00,,2020-03-15 18:20:18,51


In [38]:
df['Log Confirmed'] = np.log10(df['Confirmed']+1)

In [39]:
grouped_df = df.groupby(['Date', 'Country/Region']).agg({'Confirmed': 'sum', 'Recovered': 'sum',
                                                    'Active': 'sum', 'Deaths': 'sum'}).reset_index()

In [40]:
grouped_df['Log Confirmed'] = np.log10(grouped_df['Confirmed']+1)

In [41]:
grouped_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1355 entries, 0 to 1354
Data columns (total 7 columns):
Date              1355 non-null object
Country/Region    1355 non-null object
Confirmed         1355 non-null int64
Recovered         1355 non-null int64
Active            1355 non-null int64
Deaths            1355 non-null int64
Log Confirmed     1355 non-null float64
dtypes: float64(1), int64(4), object(2)
memory usage: 74.2+ KB


1. Check for each country if it has entries for every date
2. If country is missing entry for that date, use most previous entry

In [42]:
country_list = grouped_df['Country/Region'].unique()

In [43]:
country_list

array(['Australia', 'Canada', 'China', 'France', 'Germany', 'Hong Kong',
       'Japan', 'Philippines', 'Singapore', 'South Korea', 'Spain',
       'Sweden', 'UK', 'US', 'Vietnam', 'India', 'Macau',
       'United Arab Emirates', 'Belgium', 'Malaysia', 'Taiwan',
       'Thailand', 'Italy', 'Others', 'Sri Lanka', 'Cambodia', 'Finland',
       'Nepal', 'Russia', 'Egypt', 'Iran', 'Israel', 'Lebanon', 'Iraq',
       'Afghanistan', 'Bahrain', 'Kuwait', 'Oman', 'Algeria', 'Austria',
       'Croatia', 'Switzerland', 'Brazil', 'Georgia', 'Greece',
       'North Macedonia', 'Norway', 'Pakistan', 'Romania', 'Denmark',
       'Estonia', 'Netherlands', 'San Marino', ' Azerbaijan', 'Belarus',
       'Iceland', 'Lithuania', 'Mexico', 'New Zealand', 'Nigeria',
       'North Ireland', 'Ireland', 'Luxembourg', 'Monaco', 'Qatar',
       'Armenia', 'Azerbaijan', 'Czech Republic', 'Dominican Republic',
       'Ecuador', 'Andorra', 'Indonesia', 'Latvia', 'Morocco', 'Portugal',
       'Saudi Arabia', 'Seneg

In [44]:
total_dates = list(set(grouped_df['Date'].unique()))

In [45]:
temp_dict = {}
for country in country_list:
    date_list = list(grouped_df[grouped_df['Country/Region']==country]['Date'])
    missing_dates = list(set(total_dates)-set(date_list))
    temp_dict[country]=missing_dates
    
#     df_temp = pd.DataFrame({'Country/Region':country, 'Date':missing_dates})
#     grouped_df_2.append(df_temp, ignore_index=True)
    

In [46]:
for key in temp_dict:
    print(key)

Australia
Canada
China
France
Germany
Hong Kong
Japan
Philippines
Singapore
South Korea
Spain
Sweden
UK
US
Vietnam
India
Macau
United Arab Emirates
Belgium
Malaysia
Taiwan
Thailand
Italy
Others
Sri Lanka
Cambodia
Finland
Nepal
Russia
Egypt
Iran
Israel
Lebanon
Iraq
Afghanistan
Bahrain
Kuwait
Oman
Algeria
Austria
Croatia
Switzerland
Brazil
Georgia
Greece
North Macedonia
Norway
Pakistan
Romania
Denmark
Estonia
Netherlands
San Marino
 Azerbaijan
Belarus
Iceland
Lithuania
Mexico
New Zealand
Nigeria
North Ireland
Ireland
Luxembourg
Monaco
Qatar
Armenia
Azerbaijan
Czech Republic
Dominican Republic
Ecuador
Andorra
Indonesia
Latvia
Morocco
Portugal
Saudi Arabia
Senegal
Argentina
Chile
Jordan
Ukraine
Faroe Islands
Gibraltar
Hungary
Liechtenstein
Poland
Saint Barthelemy
Tunisia
Bosnia and Herzegovina
Palestine
Slovenia
South Africa
Bhutan
Cameroon
Colombia
Costa Rica
Peru
Serbia
Slovakia
Togo
Vatican City
French Guiana
Malta
Martinique
Bangladesh
Bulgaria
Maldives
Moldova
Paraguay
Republic of Ire

In [47]:
missing_dates_df = pd.DataFrame.from_dict(temp_dict, orient='index')

In [48]:
missing_dates_df = missing_dates_df.reset_index()

In [49]:
missing_dates_df.head()

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,33,34,35,36,37,38,39,40,41,42
0,Australia,2020-02-28 00:00:00,2020-02-03 00:00:00,2020-02-08 00:00:00,2020-02-25 00:00:00,2020-02-12 00:00:00,2020-02-10 00:00:00,2020-02-23 00:00:00,2020-02-16 00:00:00,2020-02-15 00:00:00,...,,,,,,,,,,
1,Canada,2020-02-18 00:00:00,2020-02-06 00:00:00,2020-02-02 00:00:00,2020-02-11 00:00:00,2020-02-09 00:00:00,2020-02-03 00:00:00,2020-02-19 00:00:00,2020-02-22 00:00:00,2020-02-08 00:00:00,...,,,,,,,,,,
2,China,,,,,,,,,,...,,,,,,,,,,
3,France,2020-02-02 00:00:00,2020-02-09 00:00:00,2020-02-03 00:00:00,2020-02-21 00:00:00,2020-02-10 00:00:00,2020-02-23 00:00:00,2020-02-04 00:00:00,2020-02-07 00:00:00,2020-02-22 00:00:00,...,,,,,,,,,,
4,Germany,2020-02-06 00:00:00,2020-03-13 00:00:00,2020-02-04 00:00:00,2020-02-19 00:00:00,2020-02-17 00:00:00,2020-02-24 00:00:00,2020-02-22 00:00:00,2020-02-05 00:00:00,2020-02-08 00:00:00,...,,,,,,,,,,


In [50]:
missing_dates_df = pd.melt(missing_dates_df, id_vars='index')

In [51]:
missing_dates_df.drop(columns=['variable'], inplace=True)

In [52]:
missing_dates_df

Unnamed: 0,index,value
0,Australia,2020-02-28 00:00:00
1,Canada,2020-02-18 00:00:00
2,China,
3,France,2020-02-02 00:00:00
4,Germany,2020-02-06 00:00:00
...,...,...
7477,Central African Republic,2020-03-11 00:00:00
7478,Congo (Brazzaville),2020-03-11 00:00:00
7479,Equatorial Guinea,2020-03-11 00:00:00
7480,Kosovo,2020-03-11 00:00:00


In [53]:
missing_dates_df = missing_dates_df.rename(columns={'index': 'Country/Region', 'value': 'Date'})

In [54]:
missing_dates_df.loc[missing_dates_df['Country/Region']=='China']

Unnamed: 0,Country/Region,Date
2,China,
176,China,
350,China,
524,China,
698,China,
872,China,
1046,China,
1220,China,
1394,China,
1568,China,


In [55]:
grouped_df = pd.concat([grouped_df, missing_dates_df], ignore_index=True)


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





In [56]:
grouped_df = grouped_df.sort_values(by=['Country/Region', 'Date'])

In [57]:
mask = grouped_df['Date']=='2020-02-01 00:00:00'
grouped_df = grouped_df.mask(mask, grouped_df.fillna(0))

In [58]:
grouped_df.head()

Unnamed: 0,Active,Confirmed,Country/Region,Date,Deaths,Log Confirmed,Recovered
4714,0.0,0.0,Azerbaijan,2020-02-01 00:00:00,0.0,0.0,0.0
1408,,,Azerbaijan,2020-02-02 00:00:00,,,
1930,,,Azerbaijan,2020-02-03 00:00:00,,,
4192,,,Azerbaijan,2020-02-04 00:00:00,,,
6976,,,Azerbaijan,2020-02-05 00:00:00,,,


In [59]:
grouped_df = grouped_df.ffill()

In [60]:
grouped_df = grouped_df.drop_duplicates()

In [61]:
grouped_df.loc[grouped_df['Country/Region']=='China'].describe()

Unnamed: 0,Active,Confirmed,Deaths,Log Confirmed,Recovered
count,44.0,44.0,44.0,44.0,44.0
mean,34379.75,62081.681818,2022.681818,4.726193,25679.25
std,16057.363982,23329.955951,1023.902538,0.322321,22051.323779
min,11.0,1427.0,9.0,3.154728,273.0
25%,21607.0,43743.0,1086.0,4.640823,4444.75
50%,34082.5,75234.5,2337.5,4.87642,20627.0
75%,49976.5,78433.5,2949.0,4.894507,47502.0
max,58041.0,80921.0,3188.0,4.908067,63089.0


In [62]:
grouped_df.head()

Unnamed: 0,Active,Confirmed,Country/Region,Date,Deaths,Log Confirmed,Recovered
4714,0.0,0.0,Azerbaijan,2020-02-01 00:00:00,0.0,0.0,0.0
1408,0.0,0.0,Azerbaijan,2020-02-02 00:00:00,0.0,0.0,0.0
1930,0.0,0.0,Azerbaijan,2020-02-03 00:00:00,0.0,0.0,0.0
4192,0.0,0.0,Azerbaijan,2020-02-04 00:00:00,0.0,0.0,0.0
6976,0.0,0.0,Azerbaijan,2020-02-05 00:00:00,0.0,0.0,0.0


In [63]:
grouped_df.sort_values(['Country/Region', 'Date'], inplace=True)

In [64]:
grouped_df.loc[grouped_df['Date']=='2020-03-12 00:00:00'].describe()

Unnamed: 0,Active,Confirmed,Deaths,Log Confirmed,Recovered
count,174.0,174.0,174.0,174.0,174.0
mean,436.511494,860.316092,29.310345,1.188557,394.494253
std,1780.650458,6103.059954,248.102128,1.062961,4545.854892
min,0.0,0.0,0.0,0.0,0.0
25%,1.0,1.0,0.0,0.30103,0.0
50%,8.0,10.0,0.0,1.041393,0.0
75%,51.0,60.5,1.0,1.788832,3.0
max,14812.0,77817.0,3135.0,4.89108,59870.0


In [65]:
grouped_df.loc[grouped_df['Date']=='2020-03-12 00:00:00'].describe()['Active']['max']

14812.0

In [66]:
grouped_df['Date'] = pd.to_datetime(grouped_df['Date'])

In [67]:
grouped_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7656 entries, 4714 to 8617
Data columns (total 7 columns):
Active            7656 non-null float64
Confirmed         7656 non-null float64
Country/Region    7656 non-null object
Date              7656 non-null datetime64[ns]
Deaths            7656 non-null float64
Log Confirmed     7656 non-null float64
Recovered         7656 non-null float64
dtypes: datetime64[ns](1), float64(5), object(1)
memory usage: 478.5+ KB


In [68]:
def assign_bins(df):
    first_quartile = _df.loc[df['Date']==date].describe()['Active']['25%']
    second_quartile = df.loc[df['Date']==date].describe()['Active']['50%']
    third_quartile = df.loc[df['Date']==date].describe()['Active']['75%']
    fourth_quartile = df.loc[grouped_df['Date']==date].describe()['Active']['max']
for date in total_dates:
    first_quartile = grouped_df.loc[grouped_df['Date']==date].describe()['Active']['25%']
    second_quartile = grouped_df.loc[grouped_df['Date']==date].describe()['Active']['50%']
    third_quartile = grouped_df.loc[grouped_df['Date']==date].describe()['Active']['75%']
    fourth_quartile = grouped_df.loc[grouped_df['Date']==date].describe()['Active']['max']
    if 0<cases<first_quartile:
        bins=1
    elif first_quartile<cases<second_quartile:
        bins=2
    elif second_quartile<cases<third_quartile:
        bins=3
    elif third_quartile<cases<=fourth_quartile:
        bins=4

NameError: name 'cases' is not defined

In [None]:
for date in total_dates:
    max_confirmed = grouped_df.loc[grouped_df['Date']==date]['Confirmed'].max()
    print(date, max_confirmed)

In [None]:
def bins_percentage(max_confirmed):

In [None]:
@interact(Country = grouped_df['Country/Region'].sort_values().unique(), Confirmed=True, Active=True, Recovered=True, Deaths=True)
def viz(Country, Confirmed, Active, Recovered, Deaths):
    fig = px.line(title='Cases Over Time')
    if Confirmed:
        country_df = pd.DataFrame(grouped_df.loc[grouped_df['Country/Region']==Country].groupby('Date')['Confirmed'].agg('sum'))
        fig.add_scatter(x = country_df.index, y = country_df['Confirmed'], mode='lines', name='Confirmed Cases')
    if Active:
        country_df = pd.DataFrame(grouped_df.loc[grouped_df['Country/Region']==Country].groupby('Date')['Active'].agg('sum'))
        fig.add_scatter(x = country_df.index, y = country_df['Active'], mode='lines', name='Active Cases')
    if Recovered:
        country_df = pd.DataFrame(grouped_df.loc[grouped_df['Country/Region']==Country].groupby('Date')['Recovered'].agg('sum'))
        fig.add_scatter(x = country_df.index, y = country_df['Recovered'], mode='lines', name='Recovered Cases')
    if Deaths:
        country_df = pd.DataFrame(grouped_df.loc[grouped_df['Country/Region']==Country].groupby('Date')['Deaths'].agg('sum'))
        fig.add_scatter(x = country_df.index, y = country_df['Deaths'], mode='lines', name='Deceased Cases')
    fig.update_xaxes(title='Date', range=[dt.date(2020, 2, 1), dt.datetime.now()])
    fig.show()

In [None]:
fig = px.scatter_geo(grouped_df, locations='Country/Region', locationmode='country names',
                     hover_data=['Confirmed'], size='Log Confirmed', animation_frame='Date')
fig.show()

In [None]:
pickle_out = open(r'C:\Users\brian\projects\COVID-19\csse_daily_reports\COVID_Hopkins_df.pickle', 'wb')
pickle.dump(grouped_df, pickle_out)
pickle_out.close()