In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from pprint import pprint 
import requests
from datetime import datetime, timedelta

from functools import reduce
import time

# Verbose switch: flip to True to see every informational message.
debug = False

# Folder (relative to this notebook) where the exported JSON is written.
baseFolder = '../data/Johns Hopkins/csse_covid_19_daily_reports/'

--------------------------------

## Daily Reports - example of how to get the most recent CSV

--------------------------------

In [2]:
baseUrl = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/"

# How many of the most recent calendar days to probe, newest first.
# 2 means "today, then yesterday".
daysToTry = 2

# Step 1: Get the latest file.
# Daily reports are named MM-DD-YYYY.csv. A HEAD request tells us whether a
# given day's file exists without downloading it.
today = datetime.today()
todaysDate = today.strftime("%m-%d-%Y")
todaysFile = todaysDate + ".csv"

finalFile = ""  # stays empty until one candidate URL answers 200
shortFile = ""
for daysBack in range(daysToTry):
    candidateDate = today - timedelta(days=daysBack)
    shortFile = candidateDate.strftime("%m-%d-%Y") + ".csv"
    dataUrl = baseUrl + shortFile

    if debug:
        print(f'Trying "{shortFile}"...')

    response = requests.head(dataUrl, timeout=5)
    status_code = response.status_code
    reason = response.reason

    if status_code == 200:
        finalFile = dataUrl
        if debug:
            print(f'... Successfully found "{shortFile}"')
        break

    if debug:
        print(f'.... error - {status_code} {reason}. Looking for an older file')

# Fail loudly here instead of announcing a file that was never found and then
# crashing inside read_csv with an empty path.
if not finalFile:
    raise FileNotFoundError(
        f'No daily report found under {baseUrl} for the last {daysToTry} day(s) '
        f'(last tried "{shortFile}", HTTP {status_code} {reason})'
    )

if debug:
    print('---------------------------------------')
print(f'File found: {shortFile}')
if debug:
    print('---------------------------------------')

# Step 2: Load the report.
df = pd.read_csv(finalFile)

# Keep only the three columns we want, selected by position (3, 4, 7).
# This matches dropping positions 0,1,2,5,6,8,9,10,11 from the 12-column
# layout, but keeps working if extra columns are appended to the CSV later.
# NOTE(review): positions are assumed to be country, last-update timestamp and
# confirmed count - confirm against the file header if the layout changes.
df = df.iloc[:, [3, 4, 7]].copy()

# Rename to make easier
df.columns = ["Country", "Date", "ConfirmedCases"]

# Convert from string "2020-03-26 23:48:35" to just the date only
df['Date'] = pd.to_datetime(df['Date'], errors='raise').dt.date

File found: 03-31-2020.csv


In [3]:
# Get the most recent date present in the report:
maxdate = df["Date"].max()
print(f'Data as of {maxdate}')

# Delete rows older than this - China, in particular, has "extra" rows.
# len() counts rows; DataFrame.size would count rows * columns, which is not
# what the debug messages below claim to report.
start_rows = len(df)
df = df[df['Date'] >= maxdate]
before_groupby_rows = len(df)

if debug:
    print(f'   - original rows: {start_rows}')
    print(f'   - after removing "old" rows: {before_groupby_rows}')
    # 3/31 - some "weird" China rows were seen; a bare expression inside an
    # `if` block is never displayed by the notebook, so print it explicitly.
    print(df[df["Country"] == 'China'])

Data as of 2020-03-31


In [4]:
# Ultimate goal: Get data in 3 column format: Date, Country, ConfirmedCases

# Sum the confirmed cases per (Country, Date).
# as_index=False keeps the group keys as ordinary columns, so the result is
# already a flat DataFrame with columns Country, Date, ConfirmedCases - no
# reset_index or manual column flattening needed. The plain .sum() also avoids
# handing the np.sum callable to agg(), which newer pandas warns about.
dfAggs = (
    df.groupby(['Country', 'Date'], as_index=False)['ConfirmedCases']
      .sum()
)

countries_to_view = ['US', 'Canada', 'Brazil', 'Spain', 'Mexico', 'India', 'China', 'Iran']

# Create a mask selecting only the countries of interest:
mask = dfAggs['Country'].isin(countries_to_view)

# No explicit sort is needed: groupby (sort=True by default) already returns
# the rows ordered by Country, then Date. The earlier bare sort_values() call
# discarded its result and therefore had no effect.
print(dfAggs[mask])

    Country        Date  ConfirmedCases
23   Brazil  2020-03-31            5717
32   Canada  2020-03-31            8527
36    China  2020-03-31           74654
78    India  2020-03-31            1397
80     Iran  2020-03-31           44605
110  Mexico  2020-03-31            1094
153   Spain  2020-03-31           95923
168      US  2020-03-31          188172


In [5]:
# Preview the JSON for the first few selected rows.
# orient='records' emits one object per row and leaves the index out.
previewRows = dfAggs[mask].head()
previewJson = previewRows.to_json(orient='records', date_format='iso', date_unit='s')
print(previewJson)

[{"Country":"Brazil","Date":"2020-03-31T00:00:00Z","ConfirmedCases":5717},{"Country":"Canada","Date":"2020-03-31T00:00:00Z","ConfirmedCases":8527},{"Country":"China","Date":"2020-03-31T00:00:00Z","ConfirmedCases":74654},{"Country":"India","Date":"2020-03-31T00:00:00Z","ConfirmedCases":1397},{"Country":"Iran","Date":"2020-03-31T00:00:00Z","ConfirmedCases":44605}]


In [6]:
# Write the selected countries to a JSON file named after the data date.
jsonFile = f'confirmedCases_{maxdate}.json'
finalFile = baseFolder + jsonFile

selectedRows = dfAggs[mask]
selectedRows.to_json(finalFile, orient='records', date_format='iso', date_unit='s')
print(f'Final file written to {finalFile}')

# Handy for eyeballing the result: https://jsonformatter.curiousconcept.com/

Final file written to ../data/Johns Hopkins/csse_covid_19_daily_reports/confirmedCases_2020-03-31.json


# Done - now go use the JSON file in D3