# Preprocess Data

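Join the US county-level CSSE COVID-19 time series (confirmed cases and deaths) with 2019 census county population estimates into a single long-format dataframe: one row per county and date with at least one confirmed case.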

In [36]:
import numpy as np
import pandas as pd
from datetime import datetime, timezone
from dateutil.parser import parse
import matplotlib.pyplot as plt
import math
import os
import pickle
import requests

In [2]:
# Check that the reference directory exists, then list the files it contains.
# os.walk() ignores listing errors by default, so a missing directory would
# otherwise print nothing at all and look like a successful check.
time_series_dir = '../COVID-19/csse_covid_19_data/csse_covid_19_time_series/'
if not os.path.isdir(time_series_dir):
    print('WARNING: reference directory not found:', os.path.abspath(time_series_dir))
for dirname, _, filenames in os.walk(time_series_dir):
    for filename in filenames:
        print(os.path.join(dirname, filename))

../COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv
../COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv
../COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv
../COVID-19/csse_covid_19_data/csse_covid_19_time_series/README.md
../COVID-19/csse_covid_19_data/csse_covid_19_time_series/.gitignore
../COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv
../COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv


In [3]:
# Load the four time-series CSVs (US / global, confirmed / deaths).
# The targets on the left are matched positionally with the paths below.
us_confirmed_df, world_confirmed_df, us_deaths_df, world_deaths_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv',
        '../COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv',
        '../COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv',
        '../COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv',
    )
)

In [22]:
# Identifier / location columns to preview.
location_columns = [
    'UID', 'iso2', 'iso3', 'code3', 'FIPS',
    'Admin2', 'Province_State', 'Country_Region',
    'Lat', 'Long_', 'Combined_Key',
]
# Show those columns for the rows whose FIPS value is at most 36.1.
low_fips_mask = us_confirmed_df.FIPS <= 36.1
us_confirmed_df.loc[low_fips_mask, location_columns]

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key


In [24]:
# Source file: https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/totals/co-est2019-alldata.csv
# Read from the local copy with an explicit latin-1 encoding, then preview the first rows.
county_census_df = pd.read_csv(
    './data/co-est2019-alldata.csv',
    encoding='latin-1',
)
county_census_df.head(n=5)

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2019,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015,RNETMIG2016,RNETMIG2017,RNETMIG2018,RNETMIG2019
0,40,3,6,1,0,Alabama,Alabama,4779736,4780125,4785437,...,1.917501,0.578434,1.186314,1.522549,0.563489,0.626357,0.745172,1.090366,1.773786,2.483744
1,50,3,6,1,1,Alabama,Autauga County,54571,54597,54773,...,4.84731,6.018182,-6.226119,-3.902226,1.970443,-1.712875,4.777171,0.849656,0.540916,4.560062
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183112,...,24.017829,16.64187,17.488579,22.751474,20.184334,17.725964,21.279291,22.398256,24.727215,24.380567
3,50,3,6,1,5,Alabama,Barbour County,27457,27455,27327,...,-5.690302,0.292676,-6.897817,-8.132185,-5.140431,-15.724575,-18.238016,-24.998528,-8.754922,-5.165664
4,50,3,6,1,7,Alabama,Bibb County,22915,22915,22870,...,1.385134,-4.998356,-3.787545,-5.797999,1.331144,1.329817,-0.708717,-3.234669,-6.857092,1.831952


# Reformat data so dates are values instead of columns

In [23]:
def is_date(string, fuzzy=False):
    """
    Return whether the string can be interpreted as a date.

    The value is handed to ``parse`` (imported from ``dateutil.parser``) and
    counts as a date when that call succeeds.

    :param string: str, string to check for date
    :param fuzzy: bool, ignore unknown tokens in string if True
    :return: bool, True if ``parse`` accepts the value, False otherwise
    """
    try:
        parse(string, fuzzy=fuzzy)
    except (ValueError, OverflowError, TypeError):
        # ValueError: not a recognisable date.
        # OverflowError: a numeric component is too large for the platform.
        # TypeError: the value is not a string (e.g. an integer column label).
        return False
    return True
    
def get_county(string):
    """
    Return the first comma-separated field of ``string``.

    An empty string is returned when ``string`` has fewer than three
    comma-separated fields.

    :param string: str, comma-separated key to take the first field from
    """
    parts = string.split(',')
    return parts[0] if len(parts) >= 3 else ''

In [38]:
# The confirmed-cases frame is wide: one column per date, labelled in '%m/%d/%y' form.
date_cols = [x for x in list(us_confirmed_df) if is_date(x)]
dates = [datetime.strptime(x, '%m/%d/%y') for x in date_cols]

column_names = ['FIPS', 'County', 'Province_State', 'Country_Region', 'Date', 'Cases', 'Deaths', 'Population']

# Date columns that also exist in the deaths frame; any other date is reported as 0 deaths.
deaths_date_cols = [x for x in date_cols if x in us_deaths_df]

# Collect output rows in a plain list and build the DataFrame once at the end,
# instead of growing combined_df one row at a time inside the loop.
records = []

for index, row in us_confirmed_df.iterrows():
    fips = row['FIPS']
    if math.isnan(fips):
        # Only names that are already defined for THIS row may be printed here;
        # `county` and `population` have not been computed yet.
        print('skipping county', row['Combined_Key'], fips, 'missing FIPS')
        continue

    # FIPS is <state code> * 1000 + <county code>; the census file keeps the two parts
    # in separate STATE / COUNTY columns.
    state_code, county_code = divmod(int(fips), 1000)
    population = county_census_df[
        (county_census_df.STATE == state_code)
        & (county_census_df.COUNTY == county_code)]['POPESTIMATE2019']
    if len(population) != 1:
        print('skipping county', row['Combined_Key'], fips, population)
        continue
    county = get_county(row['Combined_Key'])
    population = population.to_numpy()[0]

    # Look up this county's deaths row once (first match wins) rather than re-filtering
    # the deaths frame for every date. A FIPS with no deaths row yields 0 deaths
    # instead of an IndexError.
    deaths_rows = us_deaths_df.loc[us_deaths_df.FIPS == fips, deaths_date_cols]
    deaths_by_date = deaths_rows.iloc[0].to_dict() if len(deaths_rows) else {}

    for (date_col, date) in zip(date_cols, dates):
        confirmed = row[date_col]
        if confirmed == 0:
            # Dates without any confirmed case are left out of the long-format result.
            continue
        deaths = deaths_by_date.get(date_col, 0)
        records.append([fips, county, row['Province_State'], row['Country_Region'],
                        date, confirmed, deaths, population])
    if index % 100 == 0:
        print('processed {} out of {}'.format(index, len(us_confirmed_df)))

combined_df = pd.DataFrame(records, columns=column_names)
combined_df.head()
            
    
    

skipping county Weston 60.0 Series([], Name: POPESTIMATE2019, dtype: int64)
skipping county Weston 66.0 Series([], Name: POPESTIMATE2019, dtype: int64)
skipping county Weston 69.0 Series([], Name: POPESTIMATE2019, dtype: int64)
skipping county Weston 72.0 Series([], Name: POPESTIMATE2019, dtype: int64)
skipping county Weston 78.0 Series([], Name: POPESTIMATE2019, dtype: int64)
processed 100 out of 3253
processed 200 out of 3253
processed 300 out of 3253
processed 400 out of 3253
processed 500 out of 3253
processed 600 out of 3253
processed 700 out of 3253
processed 800 out of 3253
processed 900 out of 3253
processed 1000 out of 3253
processed 1100 out of 3253
processed 1200 out of 3253
processed 1300 out of 3253
processed 1400 out of 3253
processed 1500 out of 3253
processed 1600 out of 3253
processed 1700 out of 3253
processed 1800 out of 3253
processed 1900 out of 3253
processed 2000 out of 3253
processed 2100 out of 3253
processed 2200 out of 3253
processed 2300 out of 3253
processe

Unnamed: 0,FIPS,County,Province_State,Country_Region,Date,Cases,Deaths,Population
0,1001.0,Autauga,Alabama,US,2020-03-24,1,0,55869
1,1001.0,Autauga,Alabama,US,2020-03-25,4,0,55869
2,1001.0,Autauga,Alabama,US,2020-03-26,6,0,55869
3,1001.0,Autauga,Alabama,US,2020-03-27,6,0,55869
4,1001.0,Autauga,Alabama,US,2020-03-28,6,0,55869


In [41]:
# Persist the combined frame to disk as a pickle file.
pickle_path = './data/us_combined_df.pkl'
with open(pickle_path, 'wb') as pickle_file:
    pickle.dump(combined_df, pickle_file)