In [1]:
# Cleaning Raw Data from Johns Hopkins
# By Chris Chiang
import pandas as pd
import datetime as dt
import numpy as np

In [2]:
data_loc = '../raw_data/Johns_Hopkins_day_by_day/'
# Setting up dates
start_date = dt.date(2020, 1, 22)
last_date = dt.date(2020, 3, 27)
# country format changed for the last 6 days of data so need to adjust
days = (last_date - start_date).days

In [3]:
# forloop init
current_date = start_date
combine_df = pd.DataFrame()
# i is the index of combine_df
i = 0

for day in range(int(days)+1):
    # convert date into string and load in csv
    load_string = current_date.strftime("%m-%d-%Y")
    try:
        today_df = pd.read_csv(data_loc + load_string +
                               '.csv').groupby('Country/Region')
    except:
        # deal with data column format change
        today_df = pd.read_csv(data_loc + load_string +
                               '.csv').groupby('Country_Region')
    # total count init
    day_deaths_tot = 0
    day_confirmed_tot = 0
    # Loop through intrested countries
    for needed in ['US', 'Mainland China', 'South Korea', 'Italy', 'China', 'Korea, South', 'Republic of Korea']:
        try:
            needed_df = today_df.get_group(needed)
            day_deaths_tot += needed_df['Deaths'].sum()
            day_confirmed_tot += needed_df['Confirmed'].sum()
            # write rows base on country
            # handle all South Korea formats
            if needed in ['South Korea', 'Korea, South', 'Republic of Korea']:
                row = pd.DataFrame({'Date': current_date, 'Country/Region': 'South Korea',
                                    'Confirmed': needed_df['Confirmed'].sum(), 'Deaths': needed_df['Deaths'].sum()}, index=[i])
            # handle All China formats
            elif needed in ['China', 'Mainland China']:
                row = pd.DataFrame({'Date': current_date, 'Country/Region': 'Mainland China',
                                    'Confirmed': needed_df['Confirmed'].sum(), 'Deaths': needed_df['Deaths'].sum()}, index=[i])
            else:
                # Italy and US rows
                row = pd.DataFrame({'Date': current_date, 'Country/Region': needed,
                                    'Confirmed': needed_df['Confirmed'].sum(), 'Deaths': needed_df['Deaths'].sum()}, index=[i])
            # add row into full df
            combine_df = pd.concat([row, combine_df])
            i += 1
        except KeyError:
            # catch missing data
            # No Italy Data for first few days so store 0
            if needed == 'Italy':
                row = pd.DataFrame(
                    {'Date': current_date, 'Country/Region': needed, 'Confirmed': 0, 'Deaths': 0}, index=[i])
                combine_df = pd.concat([row, combine_df])
                i += 1
            else:
                print(f'{needed} data not found on {current_date}')
        # add total row
    row = pd.DataFrame({'Date': current_date, 'Country/Region': 'Total',
                        'Confirmed': day_confirmed_tot, 'Deaths': day_deaths_tot}, index=[i])
    combine_df = pd.concat([row, combine_df])
    i += 1
    # increment date
    current_date = current_date + dt.timedelta(days=1)
combine_df

China data not found on 2020-01-22
Korea, South data not found on 2020-01-22
Republic of Korea data not found on 2020-01-22
China data not found on 2020-01-23
Korea, South data not found on 2020-01-23
Republic of Korea data not found on 2020-01-23
China data not found on 2020-01-24
Korea, South data not found on 2020-01-24
Republic of Korea data not found on 2020-01-24
China data not found on 2020-01-25
Korea, South data not found on 2020-01-25
Republic of Korea data not found on 2020-01-25
China data not found on 2020-01-26
Korea, South data not found on 2020-01-26
Republic of Korea data not found on 2020-01-26
China data not found on 2020-01-27
Korea, South data not found on 2020-01-27
Republic of Korea data not found on 2020-01-27
China data not found on 2020-01-28
Korea, South data not found on 2020-01-28
Republic of Korea data not found on 2020-01-28
China data not found on 2020-01-29
Korea, South data not found on 2020-01-29
Republic of Korea data not found on 2020-01-29
China da

Unnamed: 0,Date,Country/Region,Confirmed,Deaths
331,2020-03-27,Total,279384.0,14150.0
330,2020-03-27,South Korea,9332.0,139.0
329,2020-03-27,Mainland China,81897.0,3296.0
328,2020-03-27,Italy,86498.0,9134.0
327,2020-03-27,US,101657.0,1581.0
...,...,...,...,...
4,2020-01-22,Total,549.0,17.0
3,2020-01-22,Italy,0.0,0.0
2,2020-01-22,South Korea,1.0,0.0
1,2020-01-22,Mainland China,547.0,17.0


In [4]:
combine_df.to_csv('../cleaned_data/cases.csv', index=False)

In [5]:
# just some testing script below

# testing for missing data rows
start_date = dt.date(2020, 1, 22)
current_date = start_date
for day in range(int(days)+1):
    today_df = combine_df.groupby('Date').get_group(current_date)
    day_sum = 0
    for korea in ['South Korea']:
        day_sum += today_df.groupby('Country/Region').get_group(korea)[
            'Confirmed'].sum()
    if day_sum == 0:
        print(f'{current_date} no korea')
    day_sum = 0
    for korea in ['Italy']:
        day_sum += today_df.groupby('Country/Region').get_group(korea)[
            'Confirmed'].sum()
    if day_sum == 0:
        print(f'{current_date} no Italy')
    day_sum = 0
    for korea in ['US']:
        day_sum += today_df.groupby('Country/Region').get_group(korea)[
            'Confirmed'].sum()
    if day_sum == 0:
        print(f'{current_date} no US')
    day_sum = 0
    for korea in ['Mainland China']:
        day_sum += today_df.groupby('Country/Region').get_group(korea)[
            'Confirmed'].sum()
    if day_sum == 0:
        print(f'{current_date} no China')

    current_date = current_date + dt.timedelta(days=1)
# Only the first few days of Italy are missing

2020-01-22 no Italy
2020-01-23 no Italy
2020-01-24 no Italy
2020-01-25 no Italy
2020-01-26 no Italy
2020-01-27 no Italy
2020-01-28 no Italy
2020-01-29 no Italy
2020-01-30 no Italy
