In [1]:
import json
import os
import time
from datetime import datetime, timedelta
from functools import reduce
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests

# Diagnostic switch: cells that check this flag print their intermediate results when it is True.
debug = True

# Folder the exported JSON file is written into (relative to the notebook's working directory).
baseFolder = '../data/Johns Hopkins/csse_covid_19_time_series/'

# Local wall-clock timestamp of this run, formatted as YYYYMMDD-HHMMSS.
timestr = time.strftime("%Y%m%d-%H%M%S", time.localtime())
if debug:
    print(f'Today $timestr value: "{timestr}"')

Today $timestr value: "20200401-043219"


--------------------------------

## Time Series

--------------------------------

In [2]:
# Ultimate goal: Get data in 3 column format: Date, Country, TotalConfirmedCasesThusFar

# Step 1: Get the raw data - JHU adds a new column for each date
raw_confirmed = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
df_raw = pd.read_csv(raw_confirmed)

# Step 2: Drop Province/State, Lat, and Long.
# Dropping by name (rather than by position) raises a KeyError if the upstream
# layout ever changes, instead of silently discarding the wrong columns.
df = df_raw.drop(columns=['Province/State', 'Lat', 'Long'])

# Step 3: Convert dates to individual rows using melt().
# After the drop, the first column identifies the country and every remaining
# column is one reporting date.
key_columns = df.columns.to_list()[:1]
date_columns = df.columns.to_list()[1:]

df_clean = pd.melt(
    df,
    id_vars=key_columns,
    value_vars=date_columns,
    var_name='Date',
    value_name='Confirmed',
)

# Step 4: Rename to make easier - melt() already named 'Date' and 'Confirmed',
# so only the key column still carries its upstream name.
df_clean = df_clean.rename(columns={key_columns[0]: 'Country'})

if debug:
    print(df_clean.head())

       Country     Date  Confirmed
0  Afghanistan  1/22/20          0
1      Albania  1/22/20          0
2      Algeria  1/22/20          0
3      Andorra  1/22/20          0
4       Angola  1/22/20          0


In [3]:
# Step 5: Collapse to one row per (Country, Date) by summing the confirmed counts
#         of all rows that share the same country and date.
# as_index=False keeps 'Country' and 'Date' as ordinary columns, so the result is
# already a flat three-column frame (no MultiIndex to reset or rename), and
# calling .sum() directly avoids passing the np.sum callable to agg().
dfAggs = df_clean.groupby(['Country', 'Date'], as_index=False)['Confirmed'].sum()

# The date labels look like "1/22/20" (month/day/two-digit year); stating the
# format explicitly avoids per-value format guessing.
dfAggs['Date'] = pd.to_datetime(dfAggs['Date'], format='%m/%d/%y')

# Store the totals in the smallest integer dtype that can hold them.
dfAggs['Confirmed'] = pd.to_numeric(dfAggs['Confirmed'], downcast='integer')

if debug:
    print(dfAggs.head())


       Country       Date  Confirmed
0  Afghanistan 2020-01-22          0
1  Afghanistan 2020-01-23          0
2  Afghanistan 2020-01-24          0
3  Afghanistan 2020-01-25          0
4  Afghanistan 2020-01-26          0


In [4]:
# Normalize "Day 1" as the first day on which a country reported at least 100 confirmed cases.
# Step 1: Keep only the rows whose confirmed total has reached 100, ordered by
#         country and then chronologically within each country.
reached_threshold = dfAggs["Confirmed"].ge(100)
dfFiltered = dfAggs.loc[reached_threshold].sort_values(by=["Country", "Date"])

if debug:
    us_rows = dfFiltered.loc[dfFiltered["Country"].eq('US')]
    print(us_rows.head())

      Country       Date  Confirmed
11821      US 2020-03-03        118
11824      US 2020-03-04        149
11825      US 2020-03-05        217
11826      US 2020-03-06        262
11827      US 2020-03-07        402


In [5]:
# Step 2: Number each country's rows 1, 2, 3, ... in their current order, so every
#         row records which "day" it is for that country ("Day 1" = first row kept).
# cumcount() is zero-based within each group, hence the +1 shift.
rows_seen_before = dfFiltered.groupby('Country').cumcount()
dfFiltered['DayNum'] = rows_seen_before.add(1)

if debug:
    print(dfFiltered.loc[dfFiltered["Country"].eq('US')])

      Country       Date  Confirmed  DayNum
11821      US 2020-03-03        118       1
11824      US 2020-03-04        149       2
11825      US 2020-03-05        217       3
11826      US 2020-03-06        262       4
11827      US 2020-03-07        402       5
11828      US 2020-03-08        518       6
11829      US 2020-03-09        583       7
11800      US 2020-03-10        959       8
11801      US 2020-03-11       1281       9
11802      US 2020-03-12       1663      10
11803      US 2020-03-13       2179      11
11804      US 2020-03-14       2727      12
11805      US 2020-03-15       3499      13
11806      US 2020-03-16       4632      14
11807      US 2020-03-17       6421      15
11808      US 2020-03-18       7783      16
11809      US 2020-03-19      13677      17
11811      US 2020-03-20      19100      18
11812      US 2020-03-21      25489      19
11813      US 2020-03-22      33276      20
11814      US 2020-03-23      43847      21
11815      US 2020-03-24      53

### Target dataframe column format
1. 2000-01-01, 'Coca-Cola', '72537'
2. 2000-01-01, 'Microsoft', '70196'

### Target json data format: 
1. {date: 2000-01-01, name: "Coca-Cola", category: "Beverages", value: 72537}
2. {date: 2000-01-01, name: "Microsoft", category: "Technology", value: 70196}

In [6]:
# Countries to include in the printed preview and in the exported JSON.
countries_to_view = ['US', 'Canada', 'Brazil', 'Spain', 'Mexico', 'India', 'China', 'Iran']

# Boolean row selector aligned with dfFiltered's index; the export step reuses it.
mask = dfFiltered['Country'].isin(countries_to_view)

# No re-sort here: dfFiltered was already ordered by Country and Date when it was
# built. (A bare `dfFiltered.sort_values(['Country'])` returns a new frame and
# leaves dfFiltered untouched, so calling it without using the result does nothing.)
print(dfFiltered[mask])

# orient 'records' creates one element per row w no index value
print(dfFiltered[mask].head().to_json(orient='records', date_format='iso', date_unit='s'))

      Country       Date  Confirmed  DayNum
1653   Brazil 2020-03-13        151       1
1654   Brazil 2020-03-14        151       2
1655   Brazil 2020-03-15        162       3
1656   Brazil 2020-03-16        200       4
1657   Brazil 2020-03-17        321       5
1658   Brazil 2020-03-18        372       6
1659   Brazil 2020-03-19        621       7
1661   Brazil 2020-03-20        793       8
1662   Brazil 2020-03-21       1021       9
1663   Brazil 2020-03-22       1546      10
1664   Brazil 2020-03-23       1924      11
1665   Brazil 2020-03-24       2247      12
1666   Brazil 2020-03-25       2554      13
1667   Brazil 2020-03-26       2985      14
1668   Brazil 2020-03-27       3417      15
1669   Brazil 2020-03-28       3904      16
1670   Brazil 2020-03-29       4256      17
1672   Brazil 2020-03-30       4579      18
1673   Brazil 2020-03-31       5717      19
2281   Canada 2020-03-11        108       1
2282   Canada 2020-03-12        117       2
2283   Canada 2020-03-13        

In [8]:
# Get the most recent date present in the filtered data; it is used to stamp the
# exported file's name.
maxdate = dfFiltered["Date"].max().strftime("%Y%m%d-%H%M%S")
print(f'Data as of {maxdate}')

# Export to file.
# Create the target folder first so the export does not fail when the folder
# does not exist yet (exist_ok=True makes this a no-op when it is already there).
os.makedirs(baseFolder, exist_ok=True)
jsonFile = f'timeseries_{maxdate}.json'
finalFile = os.path.join(baseFolder, jsonFile)

# One JSON object per selected row, with ISO-formatted dates at second precision.
dfFiltered[mask].to_json(finalFile, orient='records', date_format='iso', date_unit='s')
print(f'Final file written to {finalFile}')

Data as of 20200331-000000
Final file written to ../data/Johns Hopkins/csse_covid_19_time_series/timeseries_20200331-000000.json


# Done - now go use the json file in d3