<h2><center>
    Loading In Data
    </center></h2>

In [133]:
from github import Github
import os
import pickle
import base64
import pandas as pd
import datetime as dt
from os import listdir
from os.path import isfile, join
import numpy as np
mypath = 'data_csse/'

In [134]:
# Read the two GitHub credential lines from github.txt and open the JHU CSSE
# repository. Both lines are stripped so a trailing newline never ends up
# inside the secret; the `with` block already closes the file.
# NOTE(review): presumably line 1 is the login (or token) and line 2 the
# password, matching the positional arguments of Github() - confirm.
with open("github.txt") as myfile:
    firstNlines = [line.strip() for line in myfile.readlines()[0:2]]
if len(firstNlines) < 2:
    raise ValueError('github.txt must contain two lines of credentials')
g = Github(firstNlines[0], firstNlines[1])
repo = g.get_repo('CSSEGISandData/COVID-19')
# Listing of the repository root.
contents = repo.get_contents("")

In [135]:
def get_sha_for_tag(repository, tag):
    """Return the commit SHA that the branch or tag called ``tag`` points at.

    Branches are searched first; tags are only consulted when no branch has
    that name.

    Raises:
        ValueError: if neither a branch nor a tag is named ``tag``.
    """
    def has_requested_name(ref):
        return ref.name == tag

    branch_hits = list(filter(has_requested_name, repository.get_branches()))
    if len(branch_hits) > 0:
        return branch_hits[0].commit.sha

    tag_hits = list(filter(has_requested_name, repository.get_tags()))
    if len(tag_hits) > 0:
        return tag_hits[0].commit.sha
    raise ValueError('No Tag or Branch exists with that name')

In [136]:
def download_directory(repository, sha, server_path, local_path=mypath):
    """Download the files under ``server_path`` at revision ``sha`` into ``local_path``.

    Sub-directories are walked recursively and their files land in the same
    ``local_path`` (the remote folder structure is flattened).

    Args:
        repository: object exposing ``get_contents(path, ref=...)``.
        sha: revision identifier forwarded as ``ref``.
        server_path: path inside the repository to download.
        local_path: destination directory; created when missing.

    A file that cannot be fetched, decoded or written is skipped and reported
    with a printed message instead of being silently ignored.
    """
    contents = repository.get_contents(server_path, ref=sha)
    if not isinstance(contents, (list, tuple)):
        # A path that names a single file yields one entry, not a listing.
        contents = [contents]
    os.makedirs(local_path, exist_ok=True)
    for content in contents:
        if content.type == 'dir':
            # Forward local_path so a non-default destination is honoured
            # for nested directories as well.
            download_directory(repository, sha, content.path, local_path)
            continue
        try:
            file_content = repository.get_contents(content.path, ref=sha)
            # UTF-8 is a superset of ASCII, so files that decoded before
            # still produce the same text; non-ASCII files are no longer dropped.
            file_data = base64.b64decode(file_content.content).decode('utf-8')
            target = os.path.join(local_path, content.name)
            with open(target, "w", encoding='utf-8') as file_out:
                # NOTE(review): local_path is prepended to the file *contents*,
                # which turns the first CSV header into e.g.
                # 'data_csse/Province/State'. preprocess_df drops a column by
                # exactly that name, so this is kept as-is here; removing the
                # prefix needs a coordinated change in preprocess_df.
                file_out.write(local_path + file_data)
        except Exception as exc:
            # Best-effort download: keep going, but make the failure visible.
            print('Skipped {}: {!r}'.format(content.path, exc))

In [137]:
def is_non_zero_file(fpath):
    """Return True only when ``fpath`` is an existing regular file with size > 0."""
    if not os.path.isfile(fpath):
        return False
    return os.path.getsize(fpath) > 0

In [138]:
def preprocess_df(df, name):
    """Reshape a wide per-province time-series table into long per-country rows.

    The province and coordinate columns are dropped, the remaining (date)
    columns are summed per 'Country/Region', and the result is melted so each
    row holds one (Date, Country/Region, value) triple.

    Args:
        df: wide frame with a 'Country/Region' column plus one column per date.
            It is not modified.
        name: label given to the value column of the result.

    Returns:
        DataFrame with columns 'Date', 'Country/Region' and ``name``.
    """
    # The province header is accepted both in its plain form and with the
    # 'data_csse/' prefix that the downloader writes at the start of each file.
    # drop() without inplace leaves the caller's frame untouched.
    trimmed = df.drop(
        columns=['data_csse/Province/State', 'Province/State', 'Lat', 'Long'],
        errors='ignore',
    )
    per_country = trimmed.groupby(['Country/Region']).agg('sum')
    # After the transpose the dates form the index; reset_index exposes them
    # as a column called 'index'.
    by_date = per_country.transpose().reset_index()
    country_list = list(by_date.columns)[1:]
    long_df = pd.melt(by_date, id_vars='index', value_vars=country_list)
    return long_df.rename(columns={'index': 'Date', 'value': name})

In [139]:
# Resolve the commit that 'master' currently points at, then download the
# time-series folder at that exact revision into the default local directory.
sha = get_sha_for_tag(repository=repo, tag='master')
download_directory(
    repository=repo,
    sha=sha,
    server_path='csse_covid_19_data/csse_covid_19_time_series/',
)

In [140]:
#onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]

# timeseries_files = []
# for file in onlyfiles:
#     if is_non_zero_file(mypath + file) and file[-10:]=='global.csv':
#         timeseries_files.append(mypath + file)

In [141]:
# Load each downloaded global time series and reshape it straight away into
# long (Date, Country/Region, <metric>) form.
confirmed_df = preprocess_df(
    pd.read_csv('data_csse/time_series_covid19_confirmed_global.csv'),
    'Confirmed',
)
deaths_df = preprocess_df(
    pd.read_csv('data_csse/time_series_covid19_deaths_global.csv'),
    'Deaths',
)
recovered_df = preprocess_df(
    pd.read_csv('data_csse/time_series_covid19_recovered_global.csv'),
    'Recovered',
)

<h2><center>
    Preprocessing Data
    </center></h2>

In [142]:
# Combine the three long tables on (Date, Country/Region). Inner joins keep
# only the date/country pairs present in all three metrics.
confirmed_and_deaths = pd.merge(confirmed_df, deaths_df, how='inner', on=['Date', 'Country/Region'])
grouped_df = pd.merge(confirmed_and_deaths, recovered_df, how='inner', on=['Date', 'Country/Region'])
# Active = confirmed cases that are neither deaths nor recoveries.
grouped_df['Active'] = grouped_df['Confirmed'] - grouped_df['Deaths'] - grouped_df['Recovered']
# Parse the whole column in one vectorized call instead of one
# pd.to_datetime call per row.
grouped_df['Datetime'] = pd.to_datetime(grouped_df['Date'])

In [143]:
# Population lookup table: rename 'Mainland China' to 'China' inside the
# country names and discard the 'Unnamed: 0' column.
pop_df = pd.read_csv('pop_df4.csv')
pop_df['Country/Region'] = pop_df['Country/Region'].map(
    lambda country: country.replace('Mainland China', 'China')
)
pop_df = pop_df.drop(columns=['Unnamed: 0'])

In [144]:
# Country-name normalization table. Each entry is a pair
# (spelling(s) to replace, replacement name); the first element is either a
# single string or a list of alternative spellings.
country_list=[('Mainland China', 'China'),
('Hong Kong SAR', 'Hong Kong'),
(['Korea, South', 'Republic of Korea'], 'South Korea'),
('United Kingdom', 'UK'),
(['Taiwan*', 'Taipei and environs'], 'Taiwan'),
('Iran (Islamic Republic of)', 'Iran'),
('Viet Nam', 'Vietnam'),
('Macao SAR', 'Macau'),
('Republic of Ireland', 'Ireland'),
('Czechia', 'Czech Republic'),
('occupied Palestinian territory', 'Palestine'),
('Russian Federation', 'Russia'),
# NOTE(review): the key below has a leading space - presumably it matches a
# raw spelling in the source data; confirm before "fixing" it.
(' Azerbaijan', 'Azerbaijan'),
('Holy See', 'Vatican City'),
('Republic of Moldova', 'Moldova'),
('Saint Martin', 'St. Martin')]

In [145]:
# Apply every (spelling(s), replacement) pair to the country column, one
# pair at a time and in list order.
for name in country_list:
    variants, canonical = name
    grouped_df['Country/Region'] = grouped_df['Country/Region'].replace(
        to_replace=variants, value=canonical
    )

In [146]:
# Attach the population figures once; the guard keeps a re-run of this cell
# from merging a second time.
if 'PopTotal' not in grouped_df.columns:
    grouped_df = pd.merge(grouped_df, pop_df, on='Country/Region')
# NOTE(review): PopTotal looks like it is expressed in thousands (the sample
# output shows ~14863 for Zimbabwe), which is what makes *1000/PopTotal a
# per-million rate - confirm against the population source.
grouped_df['Confirmed Cases Per 1M'] = grouped_df['Confirmed'] * 1000 / grouped_df['PopTotal']
# Rename only rows whose country is exactly 'US'. A substring replacement
# (.str.replace) would also rewrite any other name that contains "US".
grouped_df['Country/Region'] = grouped_df['Country/Region'].replace('US', 'United States')

In [147]:
# Change in the cumulative count versus the row seven positions earlier.
# When that earlier row belongs to a different country (or does not exist)
# the cumulative count itself is used.
# NOTE(review): this relies on each country's rows being consecutive and in
# date order - presumably guaranteed by the upstream reshaping; verify.
country_week_ago = grouped_df['Country/Region'].shift(7)
confirmed_week_ago = grouped_df['Confirmed'].shift(7)
grouped_df['New Weekly Cases'] = np.where(
    grouped_df['Country/Region'].eq(country_week_ago),
    grouped_df['Confirmed'].sub(confirmed_week_ago),
    grouped_df['Confirmed'],
)

<h2><center>
    Export Dataframe
    </center></h2>

In [148]:
# Save the final frame as a pickle in the current working directory.
with open('COVID_Hopkins_df.pickle', mode='wb') as pickle_out:
    pickle.dump(grouped_df, file=pickle_out)

In [149]:
# Save a second copy of the same frame into the sibling covid19app directory.
with open('../covid19app/COVID_Hopkins_df.pickle', mode='wb') as pickle_out:
    pickle.dump(grouped_df, file=pickle_out)

In [150]:
# Show the last five rows of the exported frame as a quick sanity check.
grouped_df.tail(n=5)

Unnamed: 0,Date,Country/Region,Confirmed,Deaths,Recovered,Active,Datetime,PopTotal,Confirmed Cases Per 1M,New Weekly Cases
12319,4/4/20,Zimbabwe,9,1,0,8,2020-04-04,14862.927,0.605533,2.0
12320,4/5/20,Zimbabwe,9,1,0,8,2020-04-05,14862.927,0.605533,2.0
12321,4/6/20,Zimbabwe,10,1,0,9,2020-04-06,14862.927,0.672815,3.0
12322,4/7/20,Zimbabwe,11,2,0,9,2020-04-07,14862.927,0.740096,3.0
12323,4/8/20,Zimbabwe,11,3,0,8,2020-04-08,14862.927,0.740096,3.0
