### John Hopkins GitHub Repo Data Wrangling

In [32]:
import json
import os
import sys
import pandas as pd
import numpy as np
import boto3
import uuid
from slugify import slugify

In [33]:
DATA_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__name__))), 'COVID-19')

In [34]:
# the confirmed cases time series files in here have Lat and Long for each location
confirmed_series_file = os.path.join(
    DATA_DIR,
    'csse_covid_19_data',
    'csse_covid_19_time_series',
    'time_series_19-covid-Confirmed.csv'
)
confirmed_series_file

'/Users/adammcquistan/Code/python/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv'

In [35]:
# show first 10 rows of time_series_19-covid-Confirmed.csv file
#!awk -F, '{print $1,$2,$3,$4} NR==10{exit}' OFS=', ' \
# COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv

awk: can't open file COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv
 source line number 1


In [36]:
cols=['Province/State', 'Country/Region', 'Lat', 'Long']
locations_df = pd.read_csv(confirmed_series_file, usecols=cols)
locations_df.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long
0,,Thailand,15.0,101.0
1,,Japan,36.0,138.0
2,,Singapore,1.2833,103.8333
3,,Nepal,28.1667,84.25
4,,Malaysia,2.5,112.5


In [37]:
s3_resource = boto3.resource('s3')
bucket_name = 'thecodinginterface-covid'
s3_bucket = s3_resource.Bucket(name=bucket_name)

In [38]:
def slugify_location(country_region, province_state):
    if province_state:
        return slugify(f"{country_region}-{province_state}")
    return slugify(country_region)

def cloud_resource_url(filename, bucket_name):
    return f"https://{bucket_name}.s3.amazonaws.com/{filename}.json"

def upload_file_to_s3(s3_bucket, file_path, file_name):
    s3_bucket.upload_file(
        Filename=file_path,
        Key=file_name,
        ExtraArgs={'ACL':'public-read'}
    )
    return cloud_resource_url(file_name, s3_bucket.name)

In [39]:
# rename columns to be snake_cased making it more ammenable to serialization
locations_df = locations_df.rename(columns={
    'Province/State': 'province_state',
    'Country/Region': 'country_region',
    'Lat': 'lat',
    'Long': 'long'
})

# make sure text columns are well cleaned and stripped of whitespace
locations_df.province_state = locations_df.province_state.str.strip()
locations_df.country_region = locations_df.country_region.str.strip()

# Fill NaNs with empty strings in the Province/State columns because this data will
# be serialized into JSON which does not support NaN
locations_df.province_state = locations_df.province_state.fillna('')

# create columns "filename" and "cloud_resource"
lookup_keys = zip(locations_df.country_region, locations_df.province_state)
locations_df['location_id'] = [slugify_location(country_region, province_state)
                            for country_region, province_state in lookup_keys]

locations_df['cloud_resource'] = [cloud_resource_url(filename, bucket_name)
                                  for filename in locations_df['location_id'].values]

locations_df.head()

Unnamed: 0,province_state,country_region,lat,long,location_id,cloud_resource
0,,Thailand,15.0,101.0,thailand,https://thecodinginterface-covid.s3.amazonaws....
1,,Japan,36.0,138.0,japan,https://thecodinginterface-covid.s3.amazonaws....
2,,Singapore,1.2833,103.8333,singapore,https://thecodinginterface-covid.s3.amazonaws....
3,,Nepal,28.1667,84.25,nepal,https://thecodinginterface-covid.s3.amazonaws....
4,,Malaysia,2.5,112.5,malaysia,https://thecodinginterface-covid.s3.amazonaws....


In [40]:
locations_df = locations_df.set_index('location_id')
locations_df[locations_df.country_region == 'US'].sort_values('province_state')

Unnamed: 0_level_0,province_state,country_region,lat,long,cloud_resource
location_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
us-adams-in,"Adams, IN",US,39.8522,-77.2865,https://thecodinginterface-covid.s3.amazonaws....
us-alabama,Alabama,US,32.3182,-86.9023,https://thecodinginterface-covid.s3.amazonaws....
us-alachua-fl,"Alachua, FL",US,29.7938,-82.4944,https://thecodinginterface-covid.s3.amazonaws....
us-alameda-county-ca,"Alameda County, CA",US,37.6017,-121.7195,https://thecodinginterface-covid.s3.amazonaws....
us-alaska,Alaska,US,61.3707,-152.4044,https://thecodinginterface-covid.s3.amazonaws....
...,...,...,...,...,...
us-williamson-county-tn,"Williamson County, TN",US,35.9179,-86.8622,https://thecodinginterface-covid.s3.amazonaws....
us-wisconsin,Wisconsin,US,44.2685,-89.6165,https://thecodinginterface-covid.s3.amazonaws....
us-worcester-ma,"Worcester, MA",US,42.4097,-71.8571,https://thecodinginterface-covid.s3.amazonaws....
us-wyoming,Wyoming,US,42.7560,-107.3025,https://thecodinginterface-covid.s3.amazonaws....


In [41]:
# I'll do more with this locations_df DataFrame later after
# constructing country specific case data sets

In [42]:
# build list of daily csv files
confirmed_series_dir = os.path.join(
    DATA_DIR,
    'csse_covid_19_data',
    'csse_covid_19_daily_reports'
)
daily_csv_files = [file_name
                   for file_name in os.listdir(confirmed_series_dir) 
                   if file_name.endswith('csv')]
daily_csv_files[:5]

['02-26-2020.csv',
 '02-27-2020.csv',
 '02-18-2020.csv',
 '02-19-2020.csv',
 '03-24-2020.csv']

In [43]:
# take a peek at the structure of a file that will be worked with
os.path.join(confirmed_series_dir, daily_csv_files[0])

'/Users/adammcquistan/Code/python/COVID-19/csse_covid_19_data/csse_covid_19_daily_reports/02-26-2020.csv'

In [44]:
#!head ./COVID-19/csse_covid_19_data/csse_covid_19_daily_reports/02-26-2020.csv

head: ./COVID-19/csse_covid_19_data/csse_covid_19_daily_reports/02-26-2020.csv: No such file or directory


In [45]:
def calc_differential(x):
    x0 = np.array([0] + x[:-1].tolist())
    dx = x.values - x0
    return dx

In [46]:
# read the daily files into DataFrame objects then concatenate them together
daily_dfs = []
colunns_of_interest = [
    'province_state',
    'country_region',
    'total_confirmed',
    'total_deaths',
    'total_recovered',
    'date'
]

for file_name in daily_csv_files:
    file_path = os.path.join(confirmed_series_dir, file_name)
    day_df = pd.read_csv(file_path)

    # Province_State and Country_Region replaced column names Province/State
    # and Country/Region for new daily files starting 03-24-2020
    day_df = day_df.rename(columns={
        'Province/State': 'province_state',
        'Province_State': 'province_state', 
        'Country/Region': 'country_region',
        'Country_Region': 'country_region',
        'Confirmed': 'total_confirmed',
        'Deaths':'total_deaths',
        'Recovered': 'total_recovered'
    })
    
    date_str, ext = os.path.splitext(file_name)
    num_rows = day_df.shape[0]
    day_df['date'] = [pd.to_datetime(date_str)] * num_rows
    
    missing_columns = sum([(col not in day_df.columns) for col in colunns_of_interest])
    if missing_columns:
        import pdb; pdb.set_trace()
        sys.exit(0)
        
    # increased granularity by neighborhood was added in Admin2 column 03-24-2020
    # but only want granularity down to province_region so collapse down and aggregate
    day_df = day_df[colunns_of_interest]
    day_df = day_df.groupby(['country_region', 'province_state', 'date']).sum()
    day_df = day_df.reset_index()
    
    daily_dfs.append(day_df[colunns_of_interest])
    
daily_df = pd.concat(daily_dfs)

# Fill NaNs with empty strings because this data will
# be serialized into JSON which does not support NaN
daily_df.province_state = daily_df.province_state.fillna('')
daily_df.total_confirmed = daily_df.total_confirmed.fillna(0)
daily_df.total_deaths = daily_df.total_deaths.fillna(0)
daily_df.total_recovered = daily_df.total_recovered.fillna(0)

# make sure text columns are well cleaned and stripped of whitespace
daily_df.province_state = daily_df.province_state.str.strip()
daily_df.country_region = daily_df.country_region.str.strip()
    
locations = zip(daily_df.country_region.values, daily_df.province_state.values)
daily_df['location_id'] = [slugify_location(country_region, province_state)
                           for country_region, province_state in locations]

# sort by country_region, province_state, date
daily_df = daily_df.sort_values(['country_region', 'province_state', 'date'])
daily_df.head()

Unnamed: 0,province_state,country_region,total_confirmed,total_deaths,total_recovered,date,location_id
0,Australian Capital Territory,Australia,1.0,0.0,0.0,2020-03-13,australia-australian-capital-territory
0,Australian Capital Territory,Australia,1.0,0.0,0.0,2020-03-14,australia-australian-capital-territory
0,Australian Capital Territory,Australia,1.0,0.0,0.0,2020-03-15,australia-australian-capital-territory
0,Australian Capital Territory,Australia,2.0,0.0,0.0,2020-03-16,australia-australian-capital-territory
0,Australian Capital Territory,Australia,2.0,0.0,0.0,2020-03-17,australia-australian-capital-territory


In [47]:
# get totals per location
max_date = daily_df.date.max()
rows_of_interest = daily_df.date == max_date
columns_of_interest = [
    'location_id',
    'total_confirmed',
    'total_deaths',
    'total_recovered'
]
location_totals_df = daily_df.loc[rows_of_interest, columns_of_interest]
location_totals_df = location_totals_df.groupby('location_id').sum()
location_totals_df['death_rate'] = location_totals_df.total_deaths / location_totals_df.total_confirmed * 100
location_totals_df['recovery_rate'] = location_totals_df.total_recovered / location_totals_df.total_confirmed * 100
location_totals_df.sort_values('total_confirmed', ascending=False).head(10)

Unnamed: 0_level_0,total_confirmed,total_deaths,total_recovered,death_rate,recovery_rate
location_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
china-hubei,67801.0,3160.0,60324.0,4.660698,88.972139
us-new-york,25681.0,210.0,0.0,0.817725,0.0
us-new-jersey,3675.0,44.0,0.0,1.197279,0.0
us-california,2538.0,50.0,0.0,1.970055,0.0
us-washington,2328.0,116.0,0.0,4.982818,0.0
us-michigan,1793.0,24.0,0.0,1.338539,0.0
us-illinois,1537.0,16.0,0.0,1.040989,0.0
china-guangdong,1428.0,8.0,1333.0,0.560224,93.347339
us-florida,1412.0,18.0,0.0,1.274788,0.0
us-louisiana,1388.0,46.0,0.0,3.314121,0.0


In [48]:
# get totals per country / region
country_totals_df = locations_df.join(location_totals_df)
columns_of_interest = [
    'country_region',
    'total_confirmed',
    'total_deaths',
    'total_recovered',
]
country_totals_df = country_totals_df.reset_index()
country_totals_df = country_totals_df[columns_of_interest].groupby('country_region').sum()
country_totals_df['death_rate'] = country_totals_df.total_deaths / country_totals_df.total_confirmed * 100
country_totals_df['recovery_rate'] = country_totals_df.total_recovered / country_totals_df.total_confirmed * 100
country_totals_df.sort_values('country_region').head(25)

Unnamed: 0_level_0,total_confirmed,total_deaths,total_recovered,death_rate,recovery_rate
country_region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Afghanistan,0.0,0.0,0.0,,
Albania,0.0,0.0,0.0,,
Algeria,0.0,0.0,0.0,,
Andorra,0.0,0.0,0.0,,
Angola,0.0,0.0,0.0,,
Antigua and Barbuda,0.0,0.0,0.0,,
Argentina,0.0,0.0,0.0,,
Armenia,0.0,0.0,0.0,,
Australia,2044.0,8.0,119.0,0.391389,5.821918
Austria,0.0,0.0,0.0,,


In [49]:
world_population_df = pd.read_csv('world_population.csv')
world_population_df = world_population_df.set_index('country_region')
world_population_df.head()

Unnamed: 0_level_0,population,population_density,land_area,world_share
country_region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
China,1439323776,153,9388211,18.47
India,1380004385,464,2973190,17.7
US,331002651,36,9147420,4.25
Indonesia,273523615,151,1811570,3.51
Pakistan,220892340,287,770880,2.83


In [50]:
# add population data to country totals
country_totals_df = country_totals_df.join(world_population_df)
country_totals_df.head(20)

Unnamed: 0_level_0,total_confirmed,total_deaths,total_recovered,death_rate,recovery_rate,population,population_density,land_area,world_share
country_region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Afghanistan,0.0,0.0,0.0,,,38928346.0,60.0,652860.0,0.5
Albania,0.0,0.0,0.0,,,2877797.0,105.0,27400.0,0.04
Algeria,0.0,0.0,0.0,,,43851044.0,18.0,2381740.0,0.56
Andorra,0.0,0.0,0.0,,,77265.0,164.0,470.0,0.0
Angola,0.0,0.0,0.0,,,32866272.0,26.0,1246700.0,0.42
Antigua and Barbuda,0.0,0.0,0.0,,,97929.0,223.0,440.0,0.0
Argentina,0.0,0.0,0.0,,,45195774.0,17.0,2736690.0,0.58
Armenia,0.0,0.0,0.0,,,2963243.0,104.0,28470.0,0.04
Australia,2044.0,8.0,119.0,0.391389,5.821918,25499884.0,3.0,7682300.0,0.33
Austria,0.0,0.0,0.0,,,9006398.0,109.0,82409.0,0.12


In [51]:
daily_df.head()

Unnamed: 0,province_state,country_region,total_confirmed,total_deaths,total_recovered,date,location_id
0,Australian Capital Territory,Australia,1.0,0.0,0.0,2020-03-13,australia-australian-capital-territory
0,Australian Capital Territory,Australia,1.0,0.0,0.0,2020-03-14,australia-australian-capital-territory
0,Australian Capital Territory,Australia,1.0,0.0,0.0,2020-03-15,australia-australian-capital-territory
0,Australian Capital Territory,Australia,2.0,0.0,0.0,2020-03-16,australia-australian-capital-territory
0,Australian Capital Territory,Australia,2.0,0.0,0.0,2020-03-17,australia-australian-capital-territory


In [52]:
# group by location and serialize each location dataset to a json file
# [ 
#   {
#     date: str,
#     province_state: str,
#     confirmed: int,
#     deaths: int,
#     recovered: int
#   }, ...
# ]

location_case_data = 'location_case_data'
if not os.path.exists(location_case_data):
    os.mkdir(location_case_data)

location_groups = daily_df.groupby(['location_id'])
for location_id, location_data in location_groups:
    location_data.loc[:,'daily_confirmed'] = calc_differential(location_data.total_confirmed)
    location_data.loc[:,'daily_deaths'] = calc_differential(location_data.total_deaths)
    location_data.loc[:,'daily_recovered'] = calc_differential(location_data.total_recovered)
    
    location_days = []

    for idx, row in location_data.iterrows():
        data = row.to_dict()
        # dates don't serialize well in Python so, convert to strings
        data['date'] = data['date'].strftime('%Y-%m-%d')
        location_days.append(data)

    filename = f"{location_id}.json"
    file_path = os.path.join(location_case_data, filename)

    with open(file_path, 'w') as fo:
        json.dump(location_days, fo, indent=4)

    s3_url = upload_file_to_s3(s3_bucket, file_path, filename)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [53]:
locations_df.head(25)
locations_df = locations_df.reset_index()
locations_df.head()

Unnamed: 0,location_id,province_state,country_region,lat,long,cloud_resource
0,thailand,,Thailand,15.0,101.0,https://thecodinginterface-covid.s3.amazonaws....
1,japan,,Japan,36.0,138.0,https://thecodinginterface-covid.s3.amazonaws....
2,singapore,,Singapore,1.2833,103.8333,https://thecodinginterface-covid.s3.amazonaws....
3,nepal,,Nepal,28.1667,84.25,https://thecodinginterface-covid.s3.amazonaws....
4,malaysia,,Malaysia,2.5,112.5,https://thecodinginterface-covid.s3.amazonaws....


In [54]:
# create a list of dicts in the form:
# [ 
#   {
#     country_region: str,
#     province_state: str,
#     lat: float,
#     long: float,
#     filename: str,
#     cloud_resource: str
#   },
#    ...
# ]
locations = []
location_groups = locations_df.groupby(['location_id'])
for k, location_data in location_groups:
    for i, row in location_data.iterrows():
        data = row.to_dict()
        locations.append(data)
        if i < 5:
            print(data)

{'location_id': 'japan', 'province_state': '', 'country_region': 'Japan', 'lat': 36.0, 'long': 138.0, 'cloud_resource': 'https://thecodinginterface-covid.s3.amazonaws.com/japan.json'}
{'location_id': 'malaysia', 'province_state': '', 'country_region': 'Malaysia', 'lat': 2.5, 'long': 112.5, 'cloud_resource': 'https://thecodinginterface-covid.s3.amazonaws.com/malaysia.json'}
{'location_id': 'nepal', 'province_state': '', 'country_region': 'Nepal', 'lat': 28.1667, 'long': 84.25, 'cloud_resource': 'https://thecodinginterface-covid.s3.amazonaws.com/nepal.json'}
{'location_id': 'singapore', 'province_state': '', 'country_region': 'Singapore', 'lat': 1.2833, 'long': 103.8333, 'cloud_resource': 'https://thecodinginterface-covid.s3.amazonaws.com/singapore.json'}
{'location_id': 'thailand', 'province_state': '', 'country_region': 'Thailand', 'lat': 15.0, 'long': 101.0, 'cloud_resource': 'https://thecodinginterface-covid.s3.amazonaws.com/thailand.json'}


In [55]:
# serialize locations to JSON file
with open('locations.json', 'w') as fo:
    json.dump(locations, fo, indent=4)

#!head -n 15 locations.json

[
    {
        "location_id": "afghanistan",
        "province_state": "",
        "country_region": "Afghanistan",
        "lat": 33.0,
        "long": 65.0,
        "cloud_resource": "https://thecodinginterface-covid.s3.amazonaws.com/afghanistan.json"
    },
    {
        "location_id": "albania",
        "province_state": "",
        "country_region": "Albania",
        "lat": 41.1533,
        "long": 20.1683,


## Country / Region Dashboard

Give user ability to select (aka drill down) into country, region, state, province

Show confirmed, deaths, recovered

Show time series of total confirmed

Show time series of total recovered

Show time series of total deaths

Show time series of daily new confirmed

Show time series of daily new recovered

Show time series of daily new deaths

Would be interesting to give a Gauge chart next to the daily graphs with an indicator of direction of n day movement (ie, over the last three days is new daily cases (deaths, confirmed, recovered) increasing, descreasing, maintaining)


## Location Comparisons

### Barcharts

Death Rates: select locations to include and date in time (includes checkbox to make percent of population)

Confirmed Counts: select locations to include and date in time (includes checkbox to make percent of population)


### Line Charts

Total Confirmed Cases: select locations to include and plot progression of cases since first case (includes checkbox to make percent of population)

New Daily Confirmed Cases: select locations to include and plot progression of new cases since first case in each location (includes checkbox to make percent of population)

Total Deaths: select locations to include and plot progression of deaths since first case (includes checkbox to make percent of population)

New Daily Deaths: select locations to include and plot progression of deaths since first  case in each location (includes checkbox to make percent of population)

In [23]:
locations_df.to_csv('locations.csv')