In [1]:
# Cleaning Raw Data from Johns Hopkins
# By Chris Chiang
import pandas as pd
import datetime as dt
import numpy as np

In [2]:
# Folder that holds the daily report CSV files.
data_loc = '../raw_data/COVID-19/csse_covid_19_data/csse_covid_19_daily_reports/'

# Date window: a fixed first date through yesterday (relative to run time).
start_date = dt.date(year=2020, month=1, day=22)
last_date = dt.date.today() - dt.timedelta(1)
# Whole days separating start_date and last_date, as a plain int.
days = (last_date - start_date) // dt.timedelta(days=1)

In [3]:
# Build `combine_df`: one row per (date, country) plus one 'Total' row per day,
# read from the daily report CSV files under `data_loc`.

# Column order of the combined table.
_OUTPUT_COLUMNS = ['Date', 'Country/Region', 'Confirmed', 'Deaths']

# Ordered renaming rules as (match kind, patterns, canonical name).
# 'contains' matches when any pattern is a substring of the raw label;
# 'exact' matches when the raw label equals one of the patterns.
# The first matching rule wins, so a label that satisfies several rules
# takes the earliest one -- keep the order when editing.
_COUNTRY_RULES = [
    ('contains', ('Korea',), 'South Korea'),
    ('contains', ('Taiwan', 'Taipei'), 'Taiwan'),
    ('contains', ('China',), 'China'),
    ('contains', ('Iran',), 'Iran'),
    # NOTE(review): every label containing 'Congo' collapses to this single
    # name; if the reports distinguish more than one Congo state they are
    # merged here -- confirm that is intended.
    ('contains', ('Congo',), 'Republic of the Congo'),
    ('exact', ('UK', 'North Ireland'), 'United Kingdom'),
    ('contains', ('Hong Kong',), 'Hong Kong'),
    ('contains', ('Russia',), 'Russia'),
    ('contains', ('Macao', 'Macau'), 'Macao'),
    ('exact', ('St. Martin',), 'Saint Martin'),
    # Was spelled 'Bahams', which produced a misspelled country in the output.
    ('contains', ('Bahamas',), 'Bahamas'),
    # NOTE(review): Faroe Islands is folded into United Kingdom together with
    # the Channel and Cayman Islands -- confirm that is intended.
    ('exact', ('Faroe Islands', 'Channel Islands', 'Cayman Islands'), 'United Kingdom'),
    ('exact', ('US',), 'United States of America'),
    ('exact', ('Diamond Princess',), 'Japan'),
    ('exact', ('Republic of Ireland',), 'Ireland'),
    ('contains', ('Azerbaijan',), 'Azerbaijan'),
    ('contains', ('Palestin',), 'Palestine'),
    ('contains', ('Verde',), 'Cape Verde'),
    ('contains', ('Czech',), 'Czech Republic'),
    ('contains', ('Gambia',), 'Gambia'),
    ('exact', ("Cote d'Ivoire",), 'Ivory Coast'),
    ('exact', ('Vietnam',), 'Viet Nam'),
    ('contains', ('Moldova',), 'Moldova'),
]


def _canonical_country(raw_name):
    """Return the canonical country name for a raw report label.

    Parameters
    ----------
    raw_name : str
        Country label exactly as it appears in a daily report.

    Returns
    -------
    str
        The canonical name of the first matching rule in `_COUNTRY_RULES`,
        or `raw_name` unchanged when no rule matches.
    """
    for match_kind, patterns, canonical in _COUNTRY_RULES:
        if match_kind == 'exact':
            if raw_name in patterns:
                return canonical
        elif any(pattern in raw_name for pattern in patterns):
            return canonical
    return raw_name


def _country_column(report_df):
    """Return the name of the country column present in a daily report.

    The reports use either 'Country/Region' or 'Country_Region' as the
    header. Checking the columns explicitly replaces a bare `except:` that
    would also have hidden unrelated errors.

    Raises
    ------
    KeyError
        If neither header is present.
    """
    for candidate in ('Country/Region', 'Country_Region'):
        if candidate in report_df.columns:
            return candidate
    raise KeyError("daily report has neither 'Country/Region' nor 'Country_Region' column")


# Rows are collected as plain dicts and turned into a DataFrame once at the
# end; growing a DataFrame row by row is quadratic, and DataFrame.append no
# longer exists in pandas >= 2.0.
records = []
current_date = start_date

# NOTE(review): the loop runs `days` times starting at start_date, so the last
# report read is for start_date + (days - 1). If `days` is the difference
# between the first and last wanted dates, the last date's report is never
# read -- confirm whether that is intended before changing the range.
for day in range(int(days)):  # `day` is only a counter; current_date tracks the date
    # Daily report files are named MM-DD-YYYY.csv.
    today_df = pd.read_csv(data_loc + current_date.strftime("%m-%d-%Y") + '.csv')
    country_col = _country_column(today_df)

    # Sum per raw label first (groupby sorts by label), then fold labels that
    # share a canonical name into one entry. The dict keeps first-seen order,
    # which fixes the row order within the day.
    raw_sums = today_df.groupby(country_col)[['Confirmed', 'Deaths']].sum()
    day_counts = {}
    for raw_name, confirmed, deaths in raw_sums.itertuples(name=None):
        country_name = _canonical_country(raw_name)
        if country_name in day_counts:
            # Country already seen today under another label: add to its counts.
            day_counts[country_name][0] += confirmed
            day_counts[country_name][1] += deaths
        else:
            day_counts[country_name] = [confirmed, deaths]

    for country_name, (confirmed, deaths) in day_counts.items():
        records.append({'Date': current_date, 'Country/Region': country_name,
                        'Confirmed': confirmed, 'Deaths': deaths})

    # Per-day total over every row of the report, stored as pseudo-country 'Total'.
    records.append({'Date': current_date, 'Country/Region': 'Total',
                    'Confirmed': today_df['Confirmed'].sum(),
                    'Deaths': today_df['Deaths'].sum()})

    current_date = current_date + dt.timedelta(days=1)

# Rows are already in creation order, so the default 0..n-1 index is the final index.
combine_df = pd.DataFrame(records, columns=_OUTPUT_COLUMNS)
combine_df

Unnamed: 0,Date,Country/Region,Confirmed,Deaths
0,2020-01-22,Hong Kong,0.0,0.0
1,2020-01-22,Japan,2.0,0.0
2,2020-01-22,Macao,1.0,0.0
3,2020-01-22,China,547.0,17.0
4,2020-01-22,South Korea,1.0,0.0
...,...,...,...,...
12287,2020-05-07,Western Sahara,6.0,0.0
12288,2020-05-07,Yemen,25.0,5.0
12289,2020-05-07,Zambia,153.0,4.0
12290,2020-05-07,Zimbabwe,34.0,4.0


In [4]:
# Display the dtype of each column of the combined table.
combine_df.dtypes

Date               object
Country/Region     object
Confirmed         float64
Deaths            float64
dtype: object

In [5]:
# Cast both count columns to integer, then display the resulting dtypes.
for count_col in ('Confirmed', 'Deaths'):
    combine_df[count_col] = combine_df[count_col].astype(int)
combine_df.dtypes


Date              object
Country/Region    object
Confirmed          int32
Deaths             int32
dtype: object

In [6]:
# Write the combined table to CSV; index=False keeps the row index out of the file.
combine_df.to_csv('../cleaned_data/In_progress/covid_daily_world.csv', index=False)

In [7]:
# Check country name formatting: list every distinct name in the combined table.
combine_df['Country/Region'].unique()

array(['Hong Kong', 'Japan', 'Macao', 'China', 'South Korea', 'Taiwan',
       'Thailand', 'United States of America', 'Total', 'Australia',
       'Brazil', 'Colombia', 'Malaysia', 'Mexico', 'Philippines',
       'Singapore', 'Viet Nam', 'France', 'Nepal', 'Canada', 'Cambodia',
       'Ivory Coast', 'Sri Lanka', 'Germany', 'Finland',
       'United Arab Emirates', 'India', 'Italy', 'Russia', 'Sweden',
       'United Kingdom', 'Spain', 'Belgium', 'Others', 'Egypt', 'Iran',
       'Israel', 'Lebanon', 'Iraq', 'Afghanistan', 'Bahrain', 'Kuwait',
       'Oman', 'Algeria', 'Austria', 'Croatia', 'Switzerland', 'Georgia',
       'Greece', 'North Macedonia', 'Norway', 'Pakistan', 'Romania',
       'Denmark', 'Estonia', 'Netherlands', 'San Marino', 'Azerbaijan',
       'Belarus', 'Iceland', 'Lithuania', 'New Zealand', 'Nigeria',
       'Ireland', 'Luxembourg', 'Monaco', 'Qatar', 'Armenia',
       'Czech Republic', 'Dominican Republic', 'Ecuador', 'Andorra',
       'Indonesia', 'Latvia', 'Moroc