In [8]:
import os
import sys
import pandas as pd
import numpy as np

from datetime import datetime
import covid_etl as transforms

In [9]:
INPUT_DATA_DIR = os.path.join(os.path.abspath('../../../'),
                        'COVID-19',
                        'csse_covid_19_data',
                        'csse_covid_19_daily_reports_us')
print("Input Dir: " + INPUT_DATA_DIR)

Input Dir: /Users/adammcquistan/code/ambassador/COVID-19/csse_covid_19_data/csse_covid_19_daily_reports_us


In [10]:
TRANSFORMED_DATA_DIR = os.path.join(os.path.abspath('../../../'), 'COVID-19-TRANSFORMED')

if not os.path.exists(TRANSFORMED_DATA_DIR):
    os.makedirs(TRANSFORMED_DATA_DIR)

print("Output Dir: " + TRANSFORMED_DATA_DIR)

Output Dir: /Users/adammcquistan/code/ambassador/COVID-19-TRANSFORMED


In [11]:
# Fix any BOM files (there are some early on ones in Jan 2020, could be more later)

input_files = [f for f in os.listdir(INPUT_DATA_DIR) if f.endswith('.csv')]

for f in input_files:
    input_f = os.path.join(INPUT_DATA_DIR, f)
    output_f = os.path.join(TRANSFORMED_DATA_DIR, 'us_'+f)
    with open(input_f, mode='r', encoding='utf-8-sig') as fin, open(output_f, mode='w', encoding='utf-8') as fout:
        fout.write(fin.read())

In [12]:
# remap headers to consistent format

files = [f for f in os.listdir(TRANSFORMED_DATA_DIR) if f.startswith('us_')]
for f in files:
    fname, fext = os.path.splitext(f)
    date_str = fname.replace('us_', '')
    file_path = os.path.join(TRANSFORMED_DATA_DIR, f)
    with open(file_path) as fp:
        headers = fp.readline().strip()
        df = pd.read_csv(file_path)
        if headers not in transforms.known_headers:
            print("{} has unrecognized headers {}".format(f, headers))
            df.head()
            sys.exit(1)

        print('Transforming {}'.format(f))
        
        transformed_df = transforms.transform_headers(df, date_str)
    transformed_path = os.path.join(TRANSFORMED_DATA_DIR, 'transformed_'+date_str+'.csv')

    if os.path.exists(transformed_path):
        global_df = pd.read_csv(transformed_path)
        for country in transformed_df.country.unique():
            global_df = global_df.loc[global_df.country != country]
        transformed_df = pd.concat([transformed_df, global_df])

    transformed_df.to_csv(transformed_path)

Transforming us_06-08-2020.csv
Transforming us_08-11-2020.csv
Transforming us_08-10-2020.csv
Transforming us_06-09-2020.csv
Transforming us_12-01-2020.csv
Transforming us_09-13-2020.csv
Transforming us_09-12-2020.csv
Transforming us_10-30-2020.csv
Transforming us_10-31-2020.csv
Transforming us_05-05-2020.csv
Transforming us_05-04-2020.csv
Transforming us_05-31-2020.csv
Transforming us_05-30-2020.csv
Transforming us_06-02-2020.csv
Transforming us_06-03-2020.csv
Transforming us_10-04-2020.csv
Transforming us_10-05-2020.csv
Transforming us_09-27-2020.csv
Transforming us_09-26-2020.csv
Transforming us_08-25-2020.csv
Transforming us_08-24-2020.csv
Transforming us_11-06-2020.csv
Transforming us_11-07-2020.csv
Transforming us_09-19-2020.csv
Transforming us_09-18-2020.csv
Transforming us_07-01-2020.csv
Transforming us_08-06-2020.csv
Transforming us_08-07-2020.csv
Transforming us_11-25-2020.csv
Transforming us_11-24-2020.csv
Transforming us_07-23-2020.csv
Transforming us_07-22-2020.csv
Transfor

In [6]:
transformed_df.head()

Unnamed: 0.1,date,city,state,country,latitude,longitude,cases,deaths,recoveries,testing_rate,hospitalization_rate,cases_100K,combined_key,FIPS,UID,Unnamed: 0
0,2020-05-17 00:00:00,,Alabama,US,32.3182,-86.9023,12137,488,,3188.743643,11.825673,240.068445,"alabama, us",1.0,84000001.0,
1,2020-05-17 00:00:00,,Alaska,US,61.3707,-152.4044,388,10,344.0,4736.687422,,53.038432,"alaska, us",2.0,84000002.0,
2,2020-05-17 00:00:00,,American Samoa,US,-14.271,-170.132,0,0,,188.709764,,0.0,"american samoa, us",60.0,16.0,
3,2020-05-17 00:00:00,,Arizona,US,33.7298,-111.4312,13945,680,3450.0,2084.996573,12.183578,191.585962,"arizona, us",4.0,84000004.0,
4,2020-05-17 00:00:00,,Arkansas,US,34.9697,-92.3731,4759,98,3590.0,2822.747932,10.926665,157.697452,"arkansas, us",5.0,84000005.0,


In [7]:
for country in transformed_df.country.unique():
    print(country)

US
Italy
Canada
Spain
United Kingdom
China
Netherlands
Australia
Germany
Denmark
France
Afghanistan
Albania
Algeria
Andorra
Angola
Antigua and Barbuda
Argentina
Armenia
Austria
Azerbaijan
Bahamas
Bahrain
Bangladesh
Barbados
Belarus
Belgium
Belize
Benin
Bhutan
Bolivia
Bosnia and Herzegovina
Botswana
Brazil
Brunei
Bulgaria
Burkina Faso
Burma
Burundi
Cabo Verde
Cambodia
Cameroon
Central African Republic
Chad
Chile
Colombia
Comoros
Congo (Brazzaville)
Congo (Kinshasa)
Costa Rica
Cote d'Ivoire
Croatia
Cuba
Cyprus
Czechia
Diamond Princess
Djibouti
Dominica
Dominican Republic
Ecuador
Egypt
El Salvador
Equatorial Guinea
Eritrea
Estonia
Eswatini
Ethiopia
Fiji
Finland
Gabon
Gambia
Georgia
Ghana
Greece
Grenada
Guatemala
Guinea
Guinea-Bissau
Guyana
Haiti
Holy See
Honduras
Hungary
Iceland
India
Indonesia
Iran
Iraq
Ireland
Israel
Jamaica
Japan
Jordan
Kazakhstan
Kenya
Korea, South
Kosovo
Kuwait
Kyrgyzstan
Laos
Latvia
Lebanon
Lesotho
Liberia
Libya
Liechtenstein
Lithuania
Luxembourg
MS Zaandam
Madagasc