In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from pprint import pprint 

from functools import reduce
import time

# Timestamp of this notebook run, formatted as YYYYMMDD-HHMMSS (local time).
TIMESTAMP_FORMAT = "%Y%m%d-%H%M%S"
timestr = time.strftime(TIMESTAMP_FORMAT, time.localtime())
print(f'Today $timestr value: "{timestr}"')

Today $timestr value: "20200325-064725"


In [2]:
# Ultimate goal: get the data in a 3-column long format:
#   Date, Country, TotalConfirmedCasesThusFar

# Step 1: Load the raw JHU CSSE time series. It is in wide format: JHU adds
# a new column for each reporting date.
# NOTE(review): this upstream file name may have been superseded by JHU -
# confirm the URL still resolves before relying on it for fresh data.
df_raw = pd.read_csv("https://github.com/CSSEGISandData/COVID-19/raw/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv")

# Step 2: Drop the columns that are not needed downstream (province/state and
# the coordinates). They are dropped by name rather than by position so that
# a change in the upstream column layout raises a KeyError instead of silently
# removing the wrong columns. The raw frame is kept intact in `df_raw`.
df = df_raw.drop(columns=["Province/State", "Lat", "Long"])

# Step 3: Convert the per-date columns to individual rows using melt().
# After the drop, the only identifier column is the country; every remaining
# column is a reporting date.
key_columns = ["Country/Region"]
date_columns = [column for column in df.columns if column not in key_columns]

df_clean = df.melt(
    id_vars=key_columns,
    value_vars=date_columns,
    var_name="Date",
    value_name="Confirmed",
)

In [39]:
# Aggregate to country level: sum the confirmed counts of all
# provinces/cities that share the same country and date.
# as_index=False keeps the grouping keys as ordinary columns, so the result is
# already a flat DataFrame (no MultiIndex columns, no reset_index needed).
# The built-in .sum() is used instead of passing np.sum to .agg(), which
# recent pandas versions flag as deprecated.
gbAggs = (
    df_clean
    .groupby(["Country/Region", "Date"], as_index=False)["Confirmed"]
    .sum()
)

# Rename the columns to the names used by the rest of the notebook.
gbAggs.columns = ["Country", "Date", "ConfirmedCases"]

# The melted date labels are strings; convert them to real timestamps so that
# later sorting is chronological rather than lexicographic.
gbAggs["Date"] = pd.to_datetime(gbAggs["Date"])

# Store the counts in the smallest integer dtype that can hold them.
gbAggs["ConfirmedCases"] = pd.to_numeric(gbAggs["ConfirmedCases"], downcast="integer")

Unnamed: 0,Country,Date,ConfirmedCases
0,Afghanistan,2020-01-22,0
1,Afghanistan,2020-01-23,0
2,Afghanistan,2020-01-24,0
3,Afghanistan,2020-01-25,0
4,Afghanistan,2020-01-26,0


In [41]:
# Keep only the rows in which a country has reached at least 100 confirmed
# cases, then order the result by country and date.
has_enough_cases = gbAggs["ConfirmedCases"].ge(100)
dfFinal = gbAggs.loc[has_enough_cases].sort_values(by=["Country", "Date"])

Unnamed: 0,Country,Date,ConfirmedCases
176,Algeria,2020-03-21,139
177,Algeria,2020-03-22,201
178,Algeria,2020-03-23,201
239,Andorra,2020-03-22,113
240,Andorra,2020-03-23,113


# Target dataframe column format
1. 2000-01-01, 'Coca-Cola', '72537'
2. 2000-01-01, 'Microsoft', '70196'

# Target JSON data format:
1. {date: 2000-01-01, name: "Coca-Cola", category: "Beverages", value: 72537}
2. {date: 2000-01-01, name: "Microsoft", category: "Technology", value: 70196}

In [44]:
# orient='records' creates one JSON object per row, with no index value.
# date_format='iso' writes dates as ISO 8601 strings; without it, to_json
# falls back to epoch milliseconds for this orient.
print(dfFinal.head().to_json(orient='records', date_format='iso'))

# Export to file using the same options as the preview above, so the file
# contains ISO date strings rather than epoch-millisecond integers.
jsonFile = 'temp.json'
dfFinal.to_json(jsonFile, orient='records', date_format='iso')
# To validate / pretty-print the result: https://jsonformatter.curiousconcept.com/

[{"Country":"Algeria","Date":"2020-03-21T00:00:00.000Z","ConfirmedCases":139},{"Country":"Algeria","Date":"2020-03-22T00:00:00.000Z","ConfirmedCases":201},{"Country":"Algeria","Date":"2020-03-23T00:00:00.000Z","ConfirmedCases":201},{"Country":"Andorra","Date":"2020-03-22T00:00:00.000Z","ConfirmedCases":113},{"Country":"Andorra","Date":"2020-03-23T00:00:00.000Z","ConfirmedCases":113}]


# Done - now go use the JSON file in D3