# Outline:
* [Configuration](#config)
* [Functions](#functions)
* [Loading data](#loading-data)

# Configuration <a class='anchor' id='config'></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# GitHub "tree" pages for the directories holding the per-day report files
# (these are directory URLs, not direct links to a CSV).
url_csse_covid_19_daily_reports = (
    "https://github.com/CSSEGISandData/COVID-19/tree/master/"
    "csse_covid_19_data/csse_covid_19_daily_reports"
)
url_csse_covid_19_daily_reports_us = (
    "https://github.com/CSSEGISandData/COVID-19/tree/master/"
    "csse_covid_19_data/csse_covid_19_daily_reports_us"
)

# Global time series: direct links to the raw CSV files.
url_time_series_covid19_confirmed_global = (
    "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/"
    "csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
)
url_time_series_covid19_deaths_global = (
    "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/"
    "csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv"
)
# Uses the same raw.githubusercontent.com host as the sibling URLs instead of
# the github.com/.../raw/... form, which only redirects to it.
url_time_series_covid19_recovered_global = (
    "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/"
    "csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv"
)

# US time series: direct links to the raw CSV files.
url_time_series_covid19_confirmed_US = (
    "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/"
    "csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv"
)
url_time_series_covid19_deaths_US = (
    "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/"
    "csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv"
)

# Functions <a class='anchor' id='functions'></a>

In [2]:
def format_columns(df):
    """Normalize column labels and order the date columns newest-first.

    Labels 'Country/Region', 'Province/State' and 'Long_' are first renamed to
    'Country_Region', 'Province_State' and 'Long'. Every label that
    ``pd.to_datetime`` can parse is then replaced by the corresponding
    ``datetime.date``; every other label is lower-cased.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are strings.

    Returns
    -------
    pandas.DataFrame
        A new frame with the non-date columns first (in their original order)
        followed by the date columns sorted from most recent to oldest.
    """
    df = df.rename(columns={'Country/Region': 'Country_Region',
                            'Province/State': 'Province_State',
                            'Long_': 'Long'})
    dates = {}
    names = {}
    for col in df.columns:
        try:
            dates[col] = pd.to_datetime(col).date()
        except (ValueError, TypeError, OverflowError):
            # Not parseable as a date: treat it as an ordinary column name.
            # Only parse failures are caught, so KeyboardInterrupt and other
            # unrelated errors still propagate.
            names[col] = str.lower(col)
    df = df.rename(columns=names).rename(columns=dates)
    cols_order = list(names.values()) + sorted(dates.values(), reverse=True)
    return df[cols_order]

def clean_up(df):
    """Return a copy of ``df`` in which zero populations are marked missing.

    The input frame is never modified. When a 'population' column exists,
    every entry equal to 0 is replaced by NaN in the copy; frames without
    that column are returned as an unchanged copy.
    """
    cleaned = df.copy()
    if 'population' not in cleaned.columns:
        return cleaned
    zero_population = cleaned['population'].eq(0)
    cleaned.loc[zero_population, 'population'] = np.nan
    return cleaned

def sparse_columns(df, sparsity_level=0.5):
    """Return the labels of columns whose NA count exceeds a share of the rows.

    A column is reported when its number of NA entries is strictly greater
    than ``sparsity_level`` times the number of rows in ``df``.
    """
    na_per_column = df.isna().sum()
    max_allowed_na = df.shape[0] * sparsity_level
    too_sparse = na_per_column > max_allowed_na
    return df.columns[too_sparse].values

# Loading data <a class='anchor' id='loading-data'></a>

### Fetch time series

In [3]:
# Download each time-series CSV, then normalize its column labels and clean it.
(df_global_confirmed,
 df_global_deaths,
 df_global_recovered,
 df_us_confirmed,
 df_us_deaths) = (
    clean_up(format_columns(pd.read_csv(url)))
    for url in (
        url_time_series_covid19_confirmed_global,
        url_time_series_covid19_deaths_global,
        url_time_series_covid19_recovered_global,
        url_time_series_covid19_confirmed_US,
        url_time_series_covid19_deaths_US,
    )
)

# Sanity check: the global frames should all carry the same column labels.
global_columns_match = (
    df_global_confirmed.columns.symmetric_difference(df_global_deaths.columns).empty
    and df_global_confirmed.columns.symmetric_difference(df_global_recovered.columns).empty
)
print('columns matched' if global_columns_match else 'columns did not match')

# Same check for the US frames, reporting the offending labels on a mismatch.
us_column_diff = df_us_confirmed.columns.symmetric_difference(df_us_deaths.columns)
print('columns matched' if us_column_diff.empty
      else f'columns did not match: ({us_column_diff})')

print(df_global_confirmed.shape, df_global_deaths.shape, df_global_recovered.shape)
print(df_us_confirmed.shape, df_us_deaths.shape)

columns matched
columns did not match: (Index(['population'], dtype='object'))
(266, 137) (266, 137) (253, 137)
(3261, 144) (3261, 145)


Columns with a large share of missing values:

In [4]:
sparsity = 0.6
# One report line per frame, in the same order as before. sparse_columns uses
# a strict ">" comparison, so the message says "more than" rather than "at least".
for frame in (df_global_confirmed, df_global_deaths, df_global_recovered,
              df_us_confirmed, df_us_deaths):
    print(f'columns with more than {sparsity * 100:.0f}% NA values:', sparse_columns(frame, sparsity))

columns with at lease 60% NA values: ['province_state']
columns with at lease 60% NA values: ['province_state']
columns with at lease 60% NA values: ['province_state']
columns with at lease 60% NA values: []
columns with at lease 60% NA values: []


Appending dataframes to make a single one:

In [5]:
# Every frame below receives a 'class' column; refuse to overwrite an existing one.
assert all('class' not in frame.columns
           for frame in (df_global_confirmed, df_global_deaths, df_global_recovered,
                         df_us_confirmed, df_us_deaths)), 'assignment to an existing column'

# Tag each frame with the kind of count it holds before stacking them.
df_global_confirmed['class'] = 'confirmed'
df_global_deaths['class'] = 'deaths'
df_global_recovered['class'] = 'recovered'
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# rows are stacked in the same order and the original row labels are kept.
df_global = (pd.concat([df_global_confirmed, df_global_deaths, df_global_recovered])
             .set_index('country_region'))

df_us_confirmed['class'] = 'confirmed'
df_us_deaths['class'] = 'deaths'
# sort=False keeps the columns in order of first appearance; the two US frames
# do not have identical columns, so the result is their union.
df_us = pd.concat([df_us_confirmed, df_us_deaths], sort=False)
df_us['country_region'] = 'US'
df_us = df_us.set_index('country_region')

# Single frame indexed by country_region: US rows first, then the global rows.
df = pd.concat([df_us, df_global], sort=False)

### Fetch daily reports