In [1]:
!pip install PyGithub



In [2]:
from github import Github
import os
import pickle
import base64
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import seaborn as sns
import plotly.express as px
from ipywidgets import interact
import datetime as dt
sns.set()

In [3]:
# Read the GitHub credentials from a local file so they never appear in the
# notebook: line 1 is the login, line 2 the password/token.
with open("github.txt") as myfile:
    # Strip BOTH lines: a trailing newline on the second line would otherwise
    # be sent as part of the password and make authentication fail.
    firstNlines = [line.strip() for line in myfile.readlines()[0:2]]
if len(firstNlines) < 2:
    raise ValueError("github.txt must contain the login on line 1 and the password/token on line 2")
g = Github(firstNlines[0], firstNlines[1])

In [4]:
# Handle to the CSSEGISandData/COVID-19 repository and a listing of its root.
# NOTE(review): `contents` is not referenced again in the visible cells.
repo=g.get_repo('CSSEGISandData/COVID-19')
contents = repo.get_contents("")

In [5]:
def get_sha_for_tag(repository, tag):
    """
    Return the commit SHA that the branch or tag called ``tag`` points to.

    Branches are searched first; tags are only queried when no branch has
    that name, so a branch wins if both exist.

    Parameters
    ----------
    repository : object exposing ``get_branches()`` and ``get_tags()``
        Each returned item must have ``.name`` and ``.commit.sha``.
    tag : str
        Branch or tag name to resolve.

    Returns
    -------
    str
        The SHA of the matching commit.

    Raises
    ------
    ValueError
        If neither a branch nor a tag has that name.
    """
    # next() stops at the first hit instead of materialising (and paging
    # through) every branch/tag of the repository.
    branch = next((ref for ref in repository.get_branches() if ref.name == tag), None)
    if branch is not None:
        return branch.commit.sha

    matched_tag = next((ref for ref in repository.get_tags() if ref.name == tag), None)
    if matched_tag is None:
        raise ValueError('No Tag or Branch exists with that name')
    return matched_tag.commit.sha

In [6]:
def download_directory(repository, sha, server_path, local_path='data_csse/'):
    """
    Download every file below ``server_path`` at commit ``sha`` into ``local_path``.

    Sub-directories are walked recursively, but all files are written
    directly into ``local_path`` (the remote tree is flattened, so files
    sharing a name overwrite each other).

    Parameters
    ----------
    repository : object exposing ``get_contents(path, ref=...)``
    sha : str
        Commit reference passed as ``ref`` to every ``get_contents`` call.
    server_path : str
        Directory inside the repository to download.
    local_path : str
        Destination directory; created if missing.

    A file that cannot be fetched, decoded or written is reported on stdout
    and skipped, so one bad file does not abort the whole download.
    """
    contents = repository.get_contents(server_path, ref=sha)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(local_path, exist_ok=True)
    for content in contents:
        if content.type == 'dir':
            # Forward local_path; otherwise nested files silently end up in
            # the default directory instead of the one the caller asked for.
            download_directory(repository, sha, content.path, local_path)
            continue
        try:
            file_content = repository.get_contents(content.path, ref=sha)
            # UTF-8 is a superset of ASCII, so files with accented names in
            # them are no longer dropped.
            file_data = base64.b64decode(file_content.content).decode('utf-8')
            target = os.path.join(local_path, content.name)
            with open(target, "w", encoding="utf-8") as file_out:
                # NOTE(review): local_path is prepended to the file *contents*.
                # That looks unintended, but code further down this notebook
                # drops a column named 'data_csse/Province/State', i.e. it
                # expects this prefix in the CSV header. Kept for
                # compatibility; remove it together with that workaround.
                file_out.write(local_path + file_data)
        except Exception as exc:
            # Best effort, but no longer silent (and no longer swallowing
            # KeyboardInterrupt/SystemExit like a bare ``except:`` did).
            print("Skipping %s: %s" % (content.path, exc))

In [7]:
# Resolve the 'master' branch (or tag) to a commit SHA for the download below.
sha = get_sha_for_tag(repo, 'master')

In [8]:
# Fetch the time-series folder at that commit into the default local
# directory ('data_csse/').
download_directory(repo, sha, 'csse_covid_19_data/csse_covid_19_time_series/')

In [9]:
# NOTE(review): `df` is not referenced again in the visible cells (the `df`
# inside preprocess_df is that function's own parameter) — candidate for removal.
df = pd.read_csv('data_csse/02-01-2020.csv')

In [10]:
from os import listdir
from os.path import isfile, join

# Names of the regular files (sub-directories excluded) found directly
# inside the download folder.
mypath = 'data_csse/'
onlyfiles = list(filter(lambda entry: isfile(join(mypath, entry)), listdir(mypath)))

In [11]:
# Inspect what is present in the download folder.
onlyfiles

['.gitignore',
 '02-01-2020.csv',
 '02-02-2020.csv',
 '02-03-2020.csv',
 '02-04-2020.csv',
 '02-05-2020.csv',
 '02-06-2020.csv',
 '02-07-2020.csv',
 '02-08-2020.csv',
 '02-09-2020.csv',
 '02-10-2020.csv',
 '02-11-2020.csv',
 '02-12-2020.csv',
 '02-13-2020.csv',
 '02-14-2020.csv',
 '02-15-2020.csv',
 '02-16-2020.csv',
 '02-17-2020.csv',
 '02-18-2020.csv',
 '02-19-2020.csv',
 '02-20-2020.csv',
 '02-21-2020.csv',
 '02-22-2020.csv',
 '02-23-2020.csv',
 '02-24-2020.csv',
 '02-25-2020.csv',
 '02-26-2020.csv',
 '02-27-2020.csv',
 '02-28-2020.csv',
 '02-29-2020.csv',
 '03-01-2020.csv',
 '03-02-2020.csv',
 '03-03-2020.csv',
 '03-04-2020.csv',
 '03-05-2020.csv',
 '03-06-2020.csv',
 '03-07-2020.csv',
 '03-08-2020.csv',
 '03-09-2020.csv',
 '03-10-2020.csv',
 '03-11-2020.csv',
 '03-12-2020.csv',
 '03-14-2020.csv',
 '03-15-2020.csv',
 '03-16-2020.csv',
 '03-17-2020.csv',
 '03-18-2020.csv',
 '03-19-2020.csv',
 'README.md',
 'time_series_19-covid-Confirmed.csv',
 'time_series_19-covid-Deaths.csv',
 't

In [12]:
def is_non_zero_file(fpath):
    """Return True only if ``fpath`` is an existing regular file holding at least one byte."""
    if not os.path.isfile(fpath):
        return False
    size_in_bytes = os.path.getsize(fpath)
    return size_in_bytes > 0

In [13]:
# Paths of the non-empty files whose name starts with 'time_series_19'.
timeseries_files = [
    mypath + fname
    for fname in onlyfiles
    if is_non_zero_file(mypath + fname) and fname.startswith('time_series_19')
]

In [14]:
# Confirm which time-series files were picked up.
timeseries_files

['data_csse/time_series_19-covid-Confirmed.csv',
 'data_csse/time_series_19-covid-Deaths.csv',
 'data_csse/time_series_19-covid-Recovered.csv']

In [15]:
def preprocess_df(df, name):
    """
    Reshape a wide time-series table into one row per (date, country).

    The province and coordinate columns are discarded, the remaining
    per-date columns are summed per 'Country/Region', and the result is
    melted into long form.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table with a 'Country/Region' column and one column per date.
        It is NOT modified.
    name : str
        Name given to the value column of the result (e.g. 'Confirmed').

    Returns
    -------
    pandas.DataFrame
        Columns 'Date', 'Country/Region' and ``name``.
    """
    # The province header is accepted both as written by download_directory
    # (which prefixes the file contents with 'data_csse/') and in its plain
    # form; errors='ignore' skips whichever spelling is absent. drop()
    # without inplace leaves the caller's frame untouched.
    per_country = (
        df.drop(
            columns=['data_csse/Province/State', 'Province/State', 'Lat', 'Long'],
            errors='ignore',
        )
        .groupby(['Country/Region'])
        .agg('sum')
    )
    # After the transpose the dates are the index; reset_index() turns them
    # into a column called 'index', followed by one column per country.
    by_date = per_country.transpose().reset_index()
    country_list = list(by_date.columns)[1:]
    long_df = pd.melt(by_date, id_vars='index', value_vars=country_list)
    return long_df.rename(columns={'index': 'Date', 'value': name})

In [16]:
# Load the three wide time-series tables written by the download step.
confirmed_df = pd.read_csv('data_csse/time_series_19-covid-Confirmed.csv')
deaths_df = pd.read_csv('data_csse/time_series_19-covid-Deaths.csv')
recovered_df = pd.read_csv('data_csse/time_series_19-covid-Recovered.csv')

In [17]:
# Rebind each name from its wide raw table to the long (Date, Country/Region,
# value) form; the raw frames are no longer reachable under these names.
confirmed_df = preprocess_df(confirmed_df, 'Confirmed')
deaths_df = preprocess_df(deaths_df, 'Deaths')
recovered_df = preprocess_df(recovered_df, 'Recovered')

In [18]:
# Inspect the reshaped confirmed-cases table.
confirmed_df

Unnamed: 0,Date,Country/Region,Confirmed
0,1/22/20,Afghanistan,0
1,1/23/20,Afghanistan,0
2,1/24/20,Afghanistan,0
3,1/25/20,Afghanistan,0
4,1/26/20,Afghanistan,0
...,...,...,...
8985,3/15/20,Zambia,0
8986,3/16/20,Zambia,0
8987,3/17/20,Zambia,0
8988,3/18/20,Zambia,2


In [19]:
# Align confirmed and death counts on (Date, Country/Region); the inner join
# keeps only keys present in both tables.
confirmed_and_deaths = confirmed_df.merge(
    deaths_df,
    how='inner',
    on=['Date', 'Country/Region'],
)


In [20]:
# Add the recovered counts with the same (Date, Country/Region) inner join.
grouped_df = confirmed_and_deaths.merge(
    recovered_df,
    how='inner',
    on=['Date', 'Country/Region'],
)

In [21]:
# Check row count, dtypes and null counts of the merged table.
grouped_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8990 entries, 0 to 8989
Data columns (total 5 columns):
Date              8990 non-null object
Country/Region    8990 non-null object
Confirmed         8990 non-null int64
Deaths            8990 non-null int64
Recovered         8990 non-null int64
dtypes: int64(3), object(2)
memory usage: 421.4+ KB


In [22]:
# Active cases = confirmed minus deaths minus recoveries.
grouped_df['Active'] = (
    grouped_df['Confirmed']
    .sub(grouped_df['Deaths'])
    .sub(grouped_df['Recovered'])
)

In [23]:
# Parse the date labels in one vectorised call instead of invoking
# pd.to_datetime once per row through apply().
grouped_df['Date'] = pd.to_datetime(grouped_df['Date'])

In [24]:
# Population table joined onto the case data further down (it provides the
# 'PopTotal' column used there).
pop_df = pd.read_csv('pop_df.csv')

In [25]:
# Use the same country label as the case data. The vectorised .str accessor
# (literal match, regex=False) also tolerates missing names, which the
# previous per-row x.replace(...) did not.
pop_df['Country/Region'] = pop_df['Country/Region'].str.replace('Mainland China', 'China', regex=False)

In [26]:
# Drop the leftover index column from the CSV. Rebinding (instead of
# inplace=True) plus errors='ignore' keeps this cell safe to re-run.
pop_df = pop_df.drop(columns=['Unnamed: 0'], errors='ignore')

In [27]:
# Attach population figures. merge() defaults to an inner join, so countries
# whose name has no match in pop_df are dropped from grouped_df here.
grouped_df = grouped_df.merge(pop_df, on='Country/Region')

In [28]:
# NOTE(review): dividing by PopTotal yields "per 1K" only if PopTotal is
# expressed in thousands of people — confirm against the population source.
grouped_df['Confirmed Cases Per 1K'] = grouped_df['Confirmed'].div(grouped_df['PopTotal'])

In [36]:
# NOTE(review): this cell's execution count (36) is out of sequence with its
# neighbours (28 and 29), so its saved output comes from a later kernel state
# than a top-to-bottom run would produce at this point.
grouped_df.loc[grouped_df['Country/Region']=='China']

Unnamed: 0,Date,Country/Region,Confirmed,Deaths,Recovered,Active,PopTotal,Confirmed Cases Per 1K
1392,2020-01-22 00:00:00,China,548,17,28,503,1439323.774,0.000381
1393,2020-01-23 00:00:00,China,643,18,30,595,1439323.774,0.000447
1394,2020-01-24 00:00:00,China,920,26,36,858,1439323.774,0.000639
1395,2020-01-25 00:00:00,China,1406,42,39,1325,1439323.774,0.000977
1396,2020-01-26 00:00:00,China,2075,56,49,1970,1439323.774,0.001442
1397,2020-01-27 00:00:00,China,2877,82,58,2737,1439323.774,0.001999
1398,2020-01-28 00:00:00,China,5509,131,101,5277,1439323.774,0.003827
1399,2020-01-29 00:00:00,China,6087,133,120,5834,1439323.774,0.004229
1400,2020-01-30 00:00:00,China,8141,171,135,7835,1439323.774,0.005656
1401,2020-01-31 00:00:00,China,9802,213,214,9375,1439323.774,0.00681


In [29]:
# Countries that remain after the population merge.
grouped_df['Country/Region'].unique()

array(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Argentina',
       'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahrain',
       'Bangladesh', 'Belarus', 'Belgium', 'Bhutan', 'Bolivia',
       'Bosnia and Herzegovina', 'Brazil', 'Brunei', 'Bulgaria',
       'Burkina Faso', 'Cambodia', 'Cameroon', 'Canada', 'Chile', 'China',
       'Colombia', 'Congo (Kinshasa)', 'Costa Rica', "Cote d'Ivoire",
       'Croatia', 'Cruise Ship', 'Cuba', 'Cyprus', 'Denmark',
       'Dominican Republic', 'Ecuador', 'Egypt', 'Estonia', 'Finland',
       'France', 'Georgia', 'Germany', 'Greece', 'Guyana', 'Honduras',
       'Hungary', 'Iceland', 'India', 'Indonesia', 'Iran', 'Iraq',
       'Ireland', 'Israel', 'Italy', 'Jamaica', 'Japan', 'Jordan',
       'Kuwait', 'Latvia', 'Lebanon', 'Liechtenstein', 'Lithuania',
       'Luxembourg', 'Malaysia', 'Maldives', 'Malta', 'Martinique',
       'Mexico', 'Moldova', 'Monaco', 'Mongolia', 'Morocco', 'Nepal',
       'Netherlands', 'New Zealand', 'Nigeria',

In [30]:
# Spot-check the merged rows for a single country.
grouped_df.loc[grouped_df['Country/Region']=='China']

Unnamed: 0,Date,Country/Region,Confirmed,Deaths,Recovered,Active,PopTotal,Confirmed Cases Per 1K
1392,2020-01-22,China,548,17,28,503,1439323.774,0.000381
1393,2020-01-23,China,643,18,30,595,1439323.774,0.000447
1394,2020-01-24,China,920,26,36,858,1439323.774,0.000639
1395,2020-01-25,China,1406,42,39,1325,1439323.774,0.000977
1396,2020-01-26,China,2075,56,49,1970,1439323.774,0.001442
1397,2020-01-27,China,2877,82,58,2737,1439323.774,0.001999
1398,2020-01-28,China,5509,131,101,5277,1439323.774,0.003827
1399,2020-01-29,China,6087,133,120,5834,1439323.774,0.004229
1400,2020-01-30,China,8141,171,135,7835,1439323.774,0.005656
1401,2020-01-31,China,9802,213,214,9375,1439323.774,0.00681


In [31]:
@interact(Country = grouped_df['Country/Region'].sort_values().unique(), Confirmed=True, Active=True, Recovered=True, Deaths=True)
def viz(Country, Confirmed, Active, Recovered, Deaths):
    """
    Plot the selected case series over time for one country.

    Parameters
    ----------
    Country : str
        Value of 'Country/Region' to plot.
    Confirmed, Active, Recovered, Deaths : bool
        Whether to draw the corresponding line.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    # Filter and aggregate once instead of repeating the same
    # filter + groupby for every series.
    country_rows = grouped_df.loc[grouped_df['Country/Region'] == Country]
    daily_totals = country_rows.groupby('Date')[['Confirmed', 'Active', 'Recovered', 'Deaths']].agg('sum')

    # (column, enabled, legend label) in the order the traces are added.
    series = [
        ('Confirmed', Confirmed, 'Confirmed Cases'),
        ('Active', Active, 'Active Cases'),
        ('Recovered', Recovered, 'Recovered Cases'),
        ('Deaths', Deaths, 'Deceased Cases'),
    ]

    fig = px.line(title='Cases Over Time')
    for column, enabled, label in series:
        if enabled:
            fig.add_scatter(x=daily_totals.index, y=daily_totals[column], mode='lines', name=label)
    fig.update_xaxes(title='Date', range=[dt.date(2020, 1, 22), dt.datetime.now()])
    fig.show()

interactive(children=(Dropdown(description='Country', options=('Afghanistan', 'Albania', 'Algeria', 'Andorra',…

In [32]:
# Overwrite 'Date' with its string form; every cell run after this one
# (including the pickle dump) sees strings, not timestamps.
grouped_df['Date'] = grouped_df['Date'].map(str)

In [33]:
# Animated world map: one frame per 'Date' value, marker size driven by the
# confirmed count, countries located by name.
fig = px.scatter_geo(
    grouped_df,
    locations='Country/Region',
    locationmode='country names',
    hover_data=['Confirmed'],
    size='Confirmed',
    animation_frame='Date',
)
fig.show()

In [34]:
# Persist the final table. The context manager closes the file even if
# pickle.dump raises, which the explicit open()/close() pair did not.
with open('COVID_Hopkins_df.pickle', 'wb') as pickle_out:
    pickle.dump(grouped_df, pickle_out)