In [10]:
from github import Github
import os
import pickle
import base64
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import seaborn as sns
import plotly.express as px
from ipywidgets import interact
import datetime as dt
sns.set()

In [11]:
with open("github.txt") as myfile:
    firstNlines=myfile.readlines()[0:2]
myfile.close()
g = Github(firstNlines[0].strip(), firstNlines[1])

In [12]:
repo=g.get_repo('CSSEGISandData/COVID-19')
contents = repo.get_contents("")

In [13]:
def get_sha_for_tag(repository, tag):
    """
    Returns a commit PyGithub object for the specified repository and tag.
    """
    branches = repository.get_branches()
    matched_branches = [match for match in branches if match.name == tag]
    if matched_branches:
        return matched_branches[0].commit.sha

    tags = repository.get_tags()
    matched_tags = [match for match in tags if match.name == tag]
    if not matched_tags:
        raise ValueError('No Tag or Branch exists with that name')
    return matched_tags[0].commit.sha

In [14]:
def download_directory(repository, sha, server_path, local_path='data_csse/'):
    """
    Download all contents at server_path with commit tag sha in
    the repository.
    """
    contents = repository.get_contents(server_path, ref=sha)
    if not os.path.exists(local_path):
        os.makedirs(local_path)
    for content in contents:
        #print("Processing %s" % content.path)
        if content.type == 'dir':
            download_directory(repository, sha, content.path)
        else:
            try:
                path = content.path
                file_content = repository.get_contents(path, ref=sha)
                file_data = base64.b64decode(file_content.content).decode('ascii')
                file_out = open(local_path+content.name, "w")
                file_out.write(local_path+file_data)
                file_out.close()
            except:
                pass

In [15]:
sha = get_sha_for_tag(repo, 'master')

In [16]:
download_directory(repo, sha, 'csse_covid_19_data/csse_covid_19_time_series/')

In [17]:
df = pd.read_csv('data_csse/02-01-2020.csv')

FileNotFoundError: [Errno 2] File data_csse/02-01-2020.csv does not exist: 'data_csse/02-01-2020.csv'

In [None]:
from os import listdir
from os.path import isfile, join
mypath = 'data_csse/'
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]

In [None]:
onlyfiles

In [None]:
def is_non_zero_file(fpath):
    return os.path.isfile(fpath) and os.path.getsize(fpath) > 0

In [None]:
timeseries_files = []
for file in onlyfiles:
    if is_non_zero_file(mypath + file) and file[:14]=='time_series_19':
        timeseries_files.append(mypath + file)

In [None]:
timeseries_files

In [21]:
def preprocess_df(df, name):
    df.drop(columns=['data_csse/Province/State', 'Lat', 'Long'], inplace=True)
    df = df.groupby(['Country/Region']).agg('sum')
    df = df.transpose().reset_index()
    country_list = list(df.columns)[1:]
    df = pd.melt(df, id_vars='index', value_vars=country_list)
    df = df.rename(columns={'index':'Date', 'value':name})
    return df

In [22]:
confirmed_df = pd.read_csv('data_csse/time_series_19-covid-Confirmed.csv')
deaths_df = pd.read_csv('data_csse/time_series_19-covid-Deaths.csv')
recovered_df = pd.read_csv('data_csse/time_series_19-covid-Recovered.csv')

In [23]:
confirmed_df = preprocess_df(confirmed_df, 'Confirmed')
deaths_df = preprocess_df(deaths_df, 'Deaths')
recovered_df = preprocess_df(recovered_df, 'Recovered')

In [24]:
confirmed_df

Unnamed: 0,Date,Country/Region,Confirmed
0,1/22/20,Afghanistan,0.0
1,1/23/20,Afghanistan,0.0
2,1/24/20,Afghanistan,0.0
3,1/25/20,Afghanistan,0.0
4,1/26/20,Afghanistan,0.0
...,...,...,...
11341,3/19/20,Zimbabwe,0.0
11342,3/20/20,Zimbabwe,1.0
11343,3/21/20,Zimbabwe,3.0
11344,3/22/20,Zimbabwe,3.0


In [25]:
confirmed_and_deaths = pd.merge(confirmed_df, deaths_df, how='inner', on=['Date', 'Country/Region'])


In [26]:
grouped_df = pd.merge(confirmed_and_deaths, recovered_df, how='inner', on=['Date', 'Country/Region'])

In [27]:
grouped_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11346 entries, 0 to 11345
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Date            11346 non-null  object 
 1   Country/Region  11346 non-null  object 
 2   Confirmed       11346 non-null  float64
 3   Deaths          11346 non-null  float64
 4   Recovered       11346 non-null  float64
dtypes: float64(3), object(2)
memory usage: 531.8+ KB


In [28]:
grouped_df['Active'] = grouped_df['Confirmed']-grouped_df['Deaths']-grouped_df['Recovered']

In [29]:
grouped_df['Date'] = grouped_df['Date'].apply(lambda x: pd.to_datetime(x))

In [30]:
pop_df = pd.read_csv('pop_df.csv')

In [31]:
pop_df['Country/Region'] = pop_df['Country/Region'].apply(lambda x: x.replace('Mainland China', 'China'))

In [32]:
pop_df.drop(columns=['Unnamed: 0'], inplace=True)

In [33]:
len(grouped_df['Country/Region'].unique())

183

In [34]:
test_df = pd.merge(grouped_df, pop_df, how='outer', on='Country/Region')
test_df['Country/Region'].unique()

array(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia',
       'Austria', 'Azerbaijan', 'Bahamas, The', 'Bahrain', 'Bangladesh',
       'Barbados', 'Belarus', 'Belgium', 'Benin', 'Bhutan', 'Bolivia',
       'Bosnia and Herzegovina', 'Brazil', 'Brunei', 'Bulgaria',
       'Burkina Faso', 'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada',
       'Cape Verde', 'Central African Republic', 'Chad', 'Chile', 'China',
       'Colombia', 'Congo (Brazzaville)', 'Congo (Kinshasa)',
       'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cruise Ship', 'Cuba',
       'Cyprus', 'Czechia', 'Denmark', 'Djibouti', 'Dominica',
       'Dominican Republic', 'East Timor', 'Ecuador', 'Egypt',
       'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia',
       'Eswatini', 'Ethiopia', 'Fiji', 'Finland', 'France',
       'French Guiana', 'Gabon', 'Gambia, The', 'Georgia', 'Germany',
       'Ghana', 'Greece', 'Greenland', 'Grenada', 'Guadelo

In [35]:
grouped_df['Country/Region'] = grouped_df['Country/Region'].str.replace('Mainland China', 'China')
grouped_df['Country/Region'] = grouped_df['Country/Region'].str.replace('Hong Kong SAR', 'Hong Kong')
grouped_df['Country/Region'] = grouped_df['Country/Region'].replace(['Korea, South', 'Republic of Korea'], 'South Korea')
grouped_df['Country/Region'] = grouped_df['Country/Region'].str.replace('United Kingdom', 'UK')
grouped_df['Country/Region'] = grouped_df['Country/Region'].replace(['Taiwan*', 'Taipei and environs'], 'Taiwan')
grouped_df['Country/Region'] = grouped_df['Country/Region'].str.replace('Iran (Islamic Republic of)', 'Iran')
grouped_df['Country/Region'] = grouped_df['Country/Region'].str.replace('Viet Nam', 'Vietnam')
grouped_df['Country/Region'] = grouped_df['Country/Region'].str.replace('Macao SAR', 'Macau')
grouped_df['Country/Region'] = grouped_df['Country/Region'].str.replace('Republic of Ireland', 'Ireland')
grouped_df['Country/Region'] = grouped_df['Country/Region'].str.replace('Czechia', 'Czech Republic')
grouped_df['Country/Region'] = grouped_df['Country/Region'].str.replace('occupied Palestinian territory', 'Palestine')
grouped_df['Country/Region'] = grouped_df['Country/Region'].str.replace('Russian Federation', 'Russia')
grouped_df['Country/Region'] = grouped_df['Country/Region'].str.replace(' Azerbaijan', 'Azerbaijan')
grouped_df['Country/Region'] = grouped_df['Country/Region'].str.replace('Holy See', 'Vatican City')
grouped_df['Country/Region'] = grouped_df['Country/Region'].str.replace('Republic of Moldova', 'Moldova')
grouped_df['Country/Region'] = grouped_df['Country/Region'].str.replace('Saint Martin', 'St. Martin')

In [36]:
set(grouped_df['Country/Region']) - set(pop_df['Country/Region'])

{'Angola',
 'Antigua and Barbuda',
 'Bahamas, The',
 'Barbados',
 'Benin',
 'Cabo Verde',
 'Cape Verde',
 'Central African Republic',
 'Chad',
 'Congo (Brazzaville)',
 'Djibouti',
 'Dominica',
 'East Timor',
 'El Salvador',
 'Equatorial Guinea',
 'Eritrea',
 'Eswatini',
 'Ethiopia',
 'Fiji',
 'Gabon',
 'Gambia, The',
 'Ghana',
 'Greenland',
 'Grenada',
 'Guadeloupe',
 'Guam',
 'Guatemala',
 'Guernsey',
 'Guinea',
 'Haiti',
 'Jersey',
 'Kazakhstan',
 'Kenya',
 'Kosovo',
 'Kyrgyzstan',
 'Liberia',
 'Madagascar',
 'Mauritania',
 'Mauritius',
 'Mayotte',
 'Montenegro',
 'Mozambique',
 'Namibia',
 'Nicaragua',
 'Niger',
 'Papua New Guinea',
 'Puerto Rico',
 'Republic of the Congo',
 'Rwanda',
 'Saint Lucia',
 'Saint Vincent and the Grenadines',
 'Seychelles',
 'Somalia',
 'Sudan',
 'Suriname',
 'Syria',
 'Tanzania',
 'The Bahamas',
 'The Gambia',
 'Timor-Leste',
 'Trinidad and Tobago',
 'Uganda',
 'Uruguay',
 'Uzbekistan',
 'Venezuela',
 'Zambia',
 'Zimbabwe'}

In [37]:
pop_df['Country/Region'].unique()

array(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Argentina',
       'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahrain',
       'Bangladesh', 'Belarus', 'Belgium', 'Bhutan', 'Bolivia',
       'Bosnia and Herzegovina', 'Brazil', 'Brunei', 'Bulgaria',
       'Burkina Faso', 'Cambodia', 'Cameroon', 'Canada',
       'Channel Islands', 'Chile', 'Colombia', 'Congo (Kinshasa)',
       'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cruise Ship', 'Cuba',
       'Cyprus', 'Czech Republic', 'Denmark', 'Dominican Republic',
       'Ecuador', 'Egypt', 'Estonia', 'Faroe Islands', 'Finland',
       'France', 'French Guiana', 'Georgia', 'Germany', 'Gibraltar',
       'Greece', 'Guyana', 'Honduras', 'Hong Kong', 'Hungary', 'Iceland',
       'India', 'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Israel', 'Italy',
       'Jamaica', 'Japan', 'Jordan', 'Kuwait', 'Latvia', 'Lebanon',
       'Liechtenstein', 'Lithuania', 'Luxembourg', 'Macau', 'China',
       'Malaysia', 'Maldives', 'Malta', 'Martinique

In [38]:
pop_df['Country/Region'].unique()

array(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Argentina',
       'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahrain',
       'Bangladesh', 'Belarus', 'Belgium', 'Bhutan', 'Bolivia',
       'Bosnia and Herzegovina', 'Brazil', 'Brunei', 'Bulgaria',
       'Burkina Faso', 'Cambodia', 'Cameroon', 'Canada',
       'Channel Islands', 'Chile', 'Colombia', 'Congo (Kinshasa)',
       'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cruise Ship', 'Cuba',
       'Cyprus', 'Czech Republic', 'Denmark', 'Dominican Republic',
       'Ecuador', 'Egypt', 'Estonia', 'Faroe Islands', 'Finland',
       'France', 'French Guiana', 'Georgia', 'Germany', 'Gibraltar',
       'Greece', 'Guyana', 'Honduras', 'Hong Kong', 'Hungary', 'Iceland',
       'India', 'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Israel', 'Italy',
       'Jamaica', 'Japan', 'Jordan', 'Kuwait', 'Latvia', 'Lebanon',
       'Liechtenstein', 'Lithuania', 'Luxembourg', 'Macau', 'China',
       'Malaysia', 'Maldives', 'Malta', 'Martinique

In [39]:
grouped_df['Country/Region'].unique()

array(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia',
       'Austria', 'Azerbaijan', 'Bahamas, The', 'Bahrain', 'Bangladesh',
       'Barbados', 'Belarus', 'Belgium', 'Benin', 'Bhutan', 'Bolivia',
       'Bosnia and Herzegovina', 'Brazil', 'Brunei', 'Bulgaria',
       'Burkina Faso', 'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada',
       'Cape Verde', 'Central African Republic', 'Chad', 'Chile', 'China',
       'Colombia', 'Congo (Brazzaville)', 'Congo (Kinshasa)',
       'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cruise Ship', 'Cuba',
       'Cyprus', 'Czech Republic', 'Denmark', 'Djibouti', 'Dominica',
       'Dominican Republic', 'East Timor', 'Ecuador', 'Egypt',
       'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia',
       'Eswatini', 'Ethiopia', 'Fiji', 'Finland', 'France',
       'French Guiana', 'Gabon', 'Gambia, The', 'Georgia', 'Germany',
       'Ghana', 'Greece', 'Greenland', 'Grenada', '

In [40]:
grouped_df = pd.merge(grouped_df, pop_df, on='Country/Region')

In [41]:
grouped_df['Confirmed Cases Per 1K'] = grouped_df['Confirmed']/grouped_df['PopTotal']

In [42]:
grouped_df.loc[grouped_df['Country/Region']=='China']

Unnamed: 0,Date,Country/Region,Confirmed,Deaths,Recovered,Active,PopTotal,Confirmed Cases Per 1K
1488,2020-01-22,China,548.0,17.0,28.0,503.0,1439323.774,0.000381
1489,2020-01-23,China,643.0,18.0,30.0,595.0,1439323.774,0.000447
1490,2020-01-24,China,920.0,26.0,36.0,858.0,1439323.774,0.000639
1491,2020-01-25,China,1406.0,42.0,39.0,1325.0,1439323.774,0.000977
1492,2020-01-26,China,2075.0,56.0,49.0,1970.0,1439323.774,0.001442
...,...,...,...,...,...,...,...,...
1545,2020-03-19,China,81156.0,3249.0,70535.0,7372.0,1439323.774,0.056385
1546,2020-03-20,China,81250.0,3253.0,71266.0,6731.0,1439323.774,0.056450
1547,2020-03-21,China,81305.0,3259.0,71857.0,6189.0,1439323.774,0.056488
1548,2020-03-22,China,81397.0,3265.0,72362.0,5770.0,1439323.774,0.056552


In [43]:
grouped_df.loc[grouped_df['Country/Region']=='China']

Unnamed: 0,Date,Country/Region,Confirmed,Deaths,Recovered,Active,PopTotal,Confirmed Cases Per 1K
1488,2020-01-22,China,548.0,17.0,28.0,503.0,1439323.774,0.000381
1489,2020-01-23,China,643.0,18.0,30.0,595.0,1439323.774,0.000447
1490,2020-01-24,China,920.0,26.0,36.0,858.0,1439323.774,0.000639
1491,2020-01-25,China,1406.0,42.0,39.0,1325.0,1439323.774,0.000977
1492,2020-01-26,China,2075.0,56.0,49.0,1970.0,1439323.774,0.001442
...,...,...,...,...,...,...,...,...
1545,2020-03-19,China,81156.0,3249.0,70535.0,7372.0,1439323.774,0.056385
1546,2020-03-20,China,81250.0,3253.0,71266.0,6731.0,1439323.774,0.056450
1547,2020-03-21,China,81305.0,3259.0,71857.0,6189.0,1439323.774,0.056488
1548,2020-03-22,China,81397.0,3265.0,72362.0,5770.0,1439323.774,0.056552


In [44]:
@interact(Country = grouped_df['Country/Region'].sort_values().unique(), Confirmed=True, Active=True, Recovered=True, Deaths=True)
def viz(Country, Confirmed, Active, Recovered, Deaths):
    fig = px.line(title='Cases Over Time')
    if Confirmed:
        country_df = pd.DataFrame(grouped_df.loc[grouped_df['Country/Region']==Country].groupby('Date')['Confirmed'].agg('sum'))
        fig.add_scatter(x = country_df.index, y = country_df['Confirmed'], mode='lines', name='Confirmed Cases')
    if Active:
        country_df = pd.DataFrame(grouped_df.loc[grouped_df['Country/Region']==Country].groupby('Date')['Active'].agg('sum'))
        fig.add_scatter(x = country_df.index, y = country_df['Active'], mode='lines', name='Active Cases')
    if Recovered:
        country_df = pd.DataFrame(grouped_df.loc[grouped_df['Country/Region']==Country].groupby('Date')['Recovered'].agg('sum'))
        fig.add_scatter(x = country_df.index, y = country_df['Recovered'], mode='lines', name='Recovered Cases')
    if Deaths:
        country_df = pd.DataFrame(grouped_df.loc[grouped_df['Country/Region']==Country].groupby('Date')['Deaths'].agg('sum'))
        fig.add_scatter(x = country_df.index, y = country_df['Deaths'], mode='lines', name='Deceased Cases')
    fig.update_xaxes(title='Date', range=[dt.date(2020, 1, 22), dt.datetime.now()])
    fig.show()

interactive(children=(Dropdown(description='Country', options=('Afghanistan', 'Albania', 'Algeria', 'Andorra',…

In [45]:
grouped_df['Date'] = grouped_df['Date'].apply(lambda x: str(x))

In [46]:
fig = px.scatter_geo(grouped_df, locations='Country/Region', locationmode='country names',
                     hover_data=['Confirmed'], size='Confirmed Cases Per 1K', animation_frame='Date')
fig.show()


invalid value encountered in greater_equal


invalid value encountered in less_equal



ValueError: 
    Invalid element(s) received for the 'size' property of scattergeo.marker
        Invalid elements include: [nan]

    The 'size' property is a number and may be specified as:
      - An int or float in the interval [0, inf]
      - A tuple, list, or one-dimensional numpy array of the above

In [47]:
pickle_out = open('COVID_Hopkins_df.pickle', 'wb')
pickle.dump(grouped_df, pickle_out)
pickle_out.close()

In [48]:
grouped_df.head()

Unnamed: 0,Date,Country/Region,Confirmed,Deaths,Recovered,Active,PopTotal,Confirmed Cases Per 1K
0,2020-01-22 00:00:00,Afghanistan,0.0,0.0,0.0,0.0,38928.341,0.0
1,2020-01-23 00:00:00,Afghanistan,0.0,0.0,0.0,0.0,38928.341,0.0
2,2020-01-24 00:00:00,Afghanistan,0.0,0.0,0.0,0.0,38928.341,0.0
3,2020-01-25 00:00:00,Afghanistan,0.0,0.0,0.0,0.0,38928.341,0.0
4,2020-01-26 00:00:00,Afghanistan,0.0,0.0,0.0,0.0,38928.341,0.0
