In [7]:
import os
import shutil
import sys
import pandas as pd
import numpy as np

from datetime import datetime
import covid_etl as transforms

In [8]:
INPUT_DATA_DIR = os.path.join(os.path.abspath('../../../'),
                        'COVID-19',
                        'csse_covid_19_data',
                        'csse_covid_19_daily_reports')
print("Input Dir: " + INPUT_DATA_DIR)

Input Dir: /Users/adammcquistan/code/ambassador/COVID-19/csse_covid_19_data/csse_covid_19_daily_reports


In [9]:
TRANSFORMED_DATA_DIR = os.path.join(os.path.abspath('../../../'), 'COVID-19-TRANSFORMED')

if os.path.exists(TRANSFORMED_DATA_DIR):
    shutil.rmtree(TRANSFORMED_DATA_DIR)

os.makedirs(TRANSFORMED_DATA_DIR)

print("Output Dir: " + TRANSFORMED_DATA_DIR)

Output Dir: /Users/adammcquistan/code/ambassador/COVID-19-TRANSFORMED


In [10]:
# Fix any BOM files (there are some early on ones in Jan 2020, could be more later)

input_files = [f for f in os.listdir(INPUT_DATA_DIR) if f.endswith('.csv')]

for f in input_files:
    input_f = os.path.join(INPUT_DATA_DIR, f)
    output_f = os.path.join(TRANSFORMED_DATA_DIR, 'global_'+f)
    with open(input_f, mode='r', encoding='utf-8-sig') as fin, open(output_f, mode='w', encoding='utf-8') as fout:
        fout.write(fin.read())

In [11]:
# remap headers to consistent format

files = [f for f in os.listdir(TRANSFORMED_DATA_DIR) if f.startswith('global_')]

for f in files:
    fname, fext = os.path.splitext(f)
    date_str = fname.replace('global_', '')
    file_path = os.path.join(TRANSFORMED_DATA_DIR, f)
    with open(file_path) as fp:
        headers = fp.readline().strip()
        if headers not in transforms.known_headers:
            print("{} has unrecognized headers {}".format(f, headers))
            sys.exit(1)

        print('Transforming {}'.format(f))
        df = pd.read_csv(file_path)
        transformed_df = transforms.transform_headers(df, date_str)
    transformed_path = os.path.join(TRANSFORMED_DATA_DIR, 'transformed_'+date_str+'.csv')
    transformed_df.to_csv(transformed_path)

Transforming global_04-17-2020.csv
Transforming global_04-16-2020.csv
Transforming global_07-24-2020.csv
Transforming global_07-25-2020.csv
Transforming global_08-01-2020.csv
Transforming global_06-18-2020.csv
Transforming global_06-19-2020.csv
Transforming global_11-22-2020.csv
Transforming global_11-23-2020.csv
Transforming global_03-05-2020.csv
Transforming global_03-04-2020.csv
Transforming global_10-20-2020.csv
Transforming global_10-21-2020.csv
Transforming global_09-03-2020.csv
Transforming global_09-02-2020.csv
Transforming global_04-29-2020.csv
Transforming global_04-28-2020.csv
Transforming global_05-15-2020.csv
Transforming global_05-14-2020.csv
Transforming global_06-26-2020.csv
Transforming global_06-27-2020.csv
Transforming global_02-07-2020.csv
Transforming global_02-06-2020.csv
Transforming global_05-21-2020.csv
Transforming global_05-20-2020.csv
Transforming global_06-12-2020.csv
Transforming global_06-13-2020.csv
Transforming global_11-28-2020.csv
Transforming global_

In [6]:
transformed_df.head()

Unnamed: 0,date,city,state,country,latitude,longitude,cases,deaths,recoveries,testing_rate,hospitalization_rate,cases_100K,combined_key,FIPS,UID
0,2020-05-07,,South Carolina,US,34.223334,-82.461707,34,0,0,,,,"abbeville, south carolina, us",45001.0,
1,2020-05-07,,Louisiana,US,30.295065,-92.414197,142,11,0,,,,"acadia, louisiana, us",22001.0,
2,2020-05-07,,Virginia,US,37.767072,-75.632346,463,7,0,,,,"accomack, virginia, us",51001.0,
3,2020-05-07,,Idaho,US,43.452658,-116.241552,721,19,0,,,,"ada, idaho, us",16001.0,
4,2020-05-07,,Iowa,US,41.330756,-94.471059,3,0,0,,,,"adair, iowa, us",19001.0,
