In [1]:
import os
import sys
import pandas as pd

from datetime import date
import covid_etl as transforms

In [2]:
# Absolute path to the UID/ISO/FIPS lookup table, resolved three directories
# above the notebook's working directory.
LOCATIONS_PATH = os.path.join(*[
    os.path.abspath('../../../'),
    'COVID-19',
    'csse_covid_19_data',
    'UID_ISO_FIPS_LookUp_Table.csv',
])
print("Locations File: ", LOCATIONS_PATH, sep="")

Locations File: /Users/adammcquistan/code/ambassador/COVID-19/csse_covid_19_data/UID_ISO_FIPS_LookUp_Table.csv


In [3]:
# Directory holding the transformed_*.csv inputs; the loadable_*.csv outputs
# produced below are written into the same directory.
TRANSFORMED_DATA_DIR = os.path.join(*[
    os.path.abspath('../../../'),
    'COVID-19-TRANSFORMED',
])

print("Output Dir: ", TRANSFORMED_DATA_DIR, sep="")

Output Dir: /Users/adammcquistan/code/ambassador/COVID-19-TRANSFORMED


In [4]:
# Load the location lookup table (one row per UID) and preview the first rows.
locations_df = pd.read_csv(filepath_or_buffer=LOCATIONS_PATH)
locations_df.head(5)

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key,Population
0,4,AF,AFG,4.0,,,,Afghanistan,33.93911,67.709953,Afghanistan,38928341.0
1,8,AL,ALB,8.0,,,,Albania,41.1533,20.1683,Albania,2877800.0
2,12,DZ,DZA,12.0,,,,Algeria,28.0339,1.6596,Algeria,43851043.0
3,20,AD,AND,20.0,,,,Andorra,42.5063,1.5218,Andorra,77265.0
4,24,AO,AGO,24.0,,,,Angola,-11.2027,17.8739,Angola,32866268.0


In [5]:
# Lower-cased Combined_Key values, used below as the lookup index of locations_df.
lower_combined_key = locations_df['Combined_Key'].str.lower().values

In [6]:
# Re-index the lookup table by the lower-cased combined key so rows can be
# fetched with locations_df.loc[<lower-cased key>].
locations_df = locations_df.set_index(keys=lower_combined_key)

In [7]:
# Build the date dimension table starting at transforms.COVID_DATA_START_DATE.
# NOTE(review): the preview suggests one row per calendar day, indexed by the
# date string, with a sequential date_id -- confirm against covid_etl.make_date_dims.
date_df = transforms.make_date_dims(
    transforms.COVID_DATA_START_DATE
)
date_df.head(5)

Unnamed: 0,date_id,date,year,month,day_of_month,day_of_year,weekday
2020-01-22,1,2020-01-22,2020,1,22,22,2
2020-01-23,2,2020-01-23,2020,1,23,23,3
2020-01-24,3,2020-01-24,2020,1,24,24,4
2020-01-25,4,2020-01-25,2020,1,25,25,5
2020-01-26,5,2020-01-26,2020,1,26,26,6


In [8]:
# For every transformed_<name>.csv in TRANSFORMED_DATA_DIR, attach a date_id and
# a location_id to each row and write the rows whose location could be resolved
# to loadable_<name>.csv in the same directory.

# sorted() makes the processing order deterministic (os.listdir order is arbitrary).
files = sorted(f for f in os.listdir(TRANSFORMED_DATA_DIR)
               if f.startswith('transformed_'))

# The location lookups do not depend on the file being processed, so build them
# once instead of re-scanning locations_df for every row.
# Rows without a UID cannot supply an id; on duplicate FIPS / keys the first
# row wins (the previous int(<Series>) conversion raised on duplicates).
_locations_with_uid = locations_df.dropna(subset=['UID'])
_fips_rows = (_locations_with_uid
              .dropna(subset=['FIPS'])
              .drop_duplicates(subset='FIPS'))
uid_by_fips = dict(zip(_fips_rows.FIPS.astype(int),
                       _fips_rows.UID.astype(int)))
_key_rows = _locations_with_uid.loc[~_locations_with_uid.index.duplicated()]
# locations_df is indexed by the *lower-cased* Combined_Key.
uid_by_combined_key = dict(zip(_key_rows.index, _key_rows.UID.astype(int)))

for f in files:
    file_path = os.path.join(TRANSFORMED_DATA_DIR, f)
    df = pd.read_csv(file_path)

    # Derive the date_id column of df from its date column values. Each
    # distinct date is resolved once; an unknown date still raises KeyError.
    date_id_by_date = {ds: date_df.loc[ds, 'date_id']
                       for ds in df[transforms.DATE_HEADER].unique()}
    df[transforms.DATE_ID_HEADER] = [date_id_by_date[ds]
                                     for ds in df[transforms.DATE_HEADER]]

    # Start from the UID already present in the file ...
    df[transforms.LOCATION_ID_HEADER] = df[transforms.UID_HEADER].values

    # ... then, for rows without a UID, fall back to a FIPS match and finally
    # to a combined-key match against the lookup table.
    missing_uid = df[transforms.UID_HEADER].isnull()
    for idx, fips, combined_key in zip(
            df.index[missing_uid],
            df.loc[missing_uid, transforms.FIPS_HEADER],
            df.loc[missing_uid, transforms.COMBINED_HEADER]):
        if pd.notnull(fips) and int(fips) in uid_by_fips:
            df.loc[idx, transforms.LOCATION_ID_HEADER] = uid_by_fips[int(fips)]
        else:
            # Lower-case the key before the lookup: the lookup index is
            # lower-cased, so a raw mixed-case key would never match.
            lookup_key = str(combined_key).lower()
            if lookup_key in uid_by_combined_key:
                df.loc[idx, transforms.LOCATION_ID_HEADER] = uid_by_combined_key[lookup_key]

    # Rows whose location is still unresolved are dropped from the output.
    # NOTE(review): when a file contained missing UIDs this column is float,
    # so ids are written as e.g. "4.0" -- confirm the loader accepts that.
    loadable_df = df.loc[pd.notnull(df[transforms.LOCATION_ID_HEADER])]
    if not loadable_df.shape[0]:
        # Nothing loadable in this file: report it and abort the run.
        print('File {} will have not data'.format(f))
        print(loadable_df.head())
        sys.exit(1)
    else:
        loadable_path = os.path.join(TRANSFORMED_DATA_DIR,
                                     f.replace('transformed_', 'loadable_'))
        loadable_df[transforms.load_headers].to_csv(loadable_path, index=False)


In [9]:
# Persist the date dimension next to the loadable fact files (index omitted).
date_df.to_csv(
    path_or_buf=os.path.join(TRANSFORMED_DATA_DIR, 'loadable_dates.csv'),
    index=False,
)


In [10]:
# Add a 'city' column by applying transforms.parse_city_from_combined_key to
# each Combined_Key value.
locations_df['city'] = locations_df['Combined_Key'].apply(
    transforms.parse_city_from_combined_key
)

In [12]:
# Columns (after renaming) that make up the loadable locations table.
keep_columns = [
    'location_id',
    'country',
    'state',
    'city',
    transforms.LATITUDE_HEADER,
    transforms.LONGITUDE_HEADER,
    'population'
]

# Rename the lookup-table columns to the load schema's names, then keep only
# the columns listed above.
locations_df = locations_df.rename(
    columns={
        'UID': 'location_id',
        'Province_State': 'state',
        'Country_Region': 'country',
        'Lat': transforms.LATITUDE_HEADER,
        'Long_': transforms.LONGITUDE_HEADER,
        'Population': 'population'
    }
)[keep_columns]

In [13]:
# Persist the trimmed locations table; the lower-cased key index is not written.
locations_df.to_csv(
    path_or_buf=os.path.join(TRANSFORMED_DATA_DIR, 'loadable_locations.csv'),
    index=False,
)