In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

In [2]:
# Week of year, calculated as the number as of EOD Saturday of that week.
# Saturday
month_day = [
    '01-04',
    '01-11',
    '01-18',
    '01-25',
    '02-01',
    '02-08',
    '02-15',
    '02-22',
    '02-29',
    '03-07',
    '03-14',
    '03-21',
    '03-28',
    '04-04',
    '04-11',
    '04-18',
    '04-25',
    '05-02',
    '05-09',
    '05-16'
]
weeks = ['2020-'+md for md in month_day]

In [3]:
# Load the list of zips and fips used in project
zip2fips = pd.read_csv("zip2fits.csv")

### Cases and deaths by FIPS

In [4]:
''' Load NYTimes COVID cases and deaths by county data
 Source: https://github.com/nytimes/covid-19-data
Features:
cases - Number of cases at a given time (it's a running total, not new cases)
deaths - Number of daths at a given time (it's a running total, not new cases)
Could add net new cases/deaths as features.
'''
df = pd.read_csv("covid-19-data/us-counties.csv")
# Remove NaN fips codes
df.dropna(axis=0,subset=['fips'],inplace=True)
# Convert fips float to int, as fips_int column
df['fips_int'] = [int(fip) for fip in df['fips']]
# Sort by FIPS then date
nytimes_covid_cases = df[df['date'].isin(weeks)].sort_values(by=['fips_int','date'])


In [5]:
nytimes_covid_cases['week'] = [datetime.strptime(dat,"%Y-%m-%d").isocalendar()[1] for dat in nytimes_covid_cases['date']]

In [6]:
# Walk FIPS->ZIP
nytimes_covid_cases = nytimes_covid_cases.merge(zip2fips[['FIPS','ZIP']], \
                                                left_on='fips_int', \
                                                right_on='FIPS') \
                                                .drop(['fips_int', \
                                                       'fips'],axis=1)

In [7]:
# Add zeroes for missing, early weeks
a = nytimes_covid_cases['ZIP'].value_counts()
tens = a[a==10]
nines = a[a==9]
for zipp in tens.index:        
    record = nytimes_covid_cases[(nytimes_covid_cases['ZIP']==zipp) & (nytimes_covid_cases['week']==15)].copy()
    record['deaths']=0
    record['cases']=0
    record['week']=9
    nytimes_covid_cases = nytimes_covid_cases.append(record)
for zipp in nines.index:        
    record9 = nytimes_covid_cases[(nytimes_covid_cases['ZIP']==zipp) & (nytimes_covid_cases['week']==15)].copy()
    record9['deaths']=0
    record9['cases']=0
    record9['week']=9
    record10 = record9.copy()
    record10['week']=10
    nytimes_covid_cases = nytimes_covid_cases.append(record9)
    nytimes_covid_cases = nytimes_covid_cases.append(record10)

### 2017 Employment by NAICS by Zipcode

In [21]:
'''
Source: https://www.census.gov/data/datasets/2017/econ/cbp/2017-cbp.html

Features:
- establishment count by naics code for each zipcode (from 2017)
- employee and establishment count for each zipcode (from 2017)
Could add: Number of establishments by biz size (headcount)
'''
# Detail file has businesses by naics code and headcount
emp_df = pd.read_csv("zbp17detail.txt")

# Pivot so that establishments by naics code are the columns and zip codes are the rows.
est_by_zip_naics = emp_df.pivot(index='zip',columns='naics',values='est')

# Remove /// from naics code names
cols = est_by_zip_naics.columns
cols_clean = ['num_biz_'+col.strip('/') for col in cols]
cols_dict = dict(zip(cols,cols_clean))
est_by_zip_naics.rename(columns=cols_dict,inplace=True)

# Replace NaN's with zeros
est_by_zip_naics.fillna(0,inplace=True)

In [22]:
# Total employment count and establishment count by zipcode.  Latest data is 2017.
emp_tot_df = pd.read_csv("zbp17totals.txt")

# Taking midpoints
emp_codes ={"A": 10, 
            "B": 80/2+20,
            "C": (150/2)+100,
            "E": (500-250)/2+250,
            "F": 500/2+500,
            "G": (2500-1000)/2+1000,
            "H": (5000-2500)/2+2500,
            "I": (10000-5000)/2+5000,
            "J": (15000/2)+10000,
            "K": (25000/2)+25000,
            "L": (50000/2)+50000,
            "M": 250000
            }

# Lookup the code employment estimates
for index, row in emp_tot_df[emp_tot_df['empflag'].notna()].iterrows():
    emp_tot_df.loc[index,'emp'] = emp_codes[row['empflag']]

# Just employent and number of establishments by zip code
emp_tot_output = emp_tot_df[['zip','emp','est']]

### Demographics by FIPS Code

In [23]:
''' Source: https://www.census.gov/data/tables/time-series/demo/popest/2010s-counties-total.html 
            (datasets section at bottom of page)
    Features (by FIPS:
    POPESTIMATE2019 - population estimate for 2019 based on 2010 census, projected growth
    Pop_pct_chg_2019 - Projected pct change in population from July 2018 - July 2019            
    Possible other features: birth rate, death rate, migration rate.  Decomposition of pop growth rate
'''

' Source: https://www.census.gov/data/tables/time-series/demo/popest/2010s-counties-total.html \n            (datasets section at bottom of page)\n    Features (by FIPS:\n    POPESTIMATE2019 - population estimate for 2019 based on 2010 census, projected growth\n    Pop_pct_chg_2019 - Projected pct change in population from July 2018 - July 2019            \n    Possible other features: birth rate, death rate, migration rate.  Decomposition of pop growth rate\n'

In [24]:
demog_df = pd.read_csv("co-est2019-alldata.csv",encoding='latin-1')

# Combine state-county for fips code
demog_df['fips'] = demog_df['STATE']*1000+demog_df['COUNTY']
demog_df['Pop_pct_chg_2019'] = demog_df['NPOPCHG_2019']/demog_df['POPESTIMATE2019']
demog_output_df = demog_df[['fips','POPESTIMATE2019','Pop_pct_chg_2019']]

In [25]:
demog_output_df = demog_output_df.merge(zip2fips[['FIPS','ZIP']], \
                                                left_on='fips', \
                                                right_on='FIPS') \
                                                .drop(['fips'],axis=1)

### Weather Data

In [None]:
'''
Sources: https://console.cloud.google.com/bigquery?project=my-project-2353-182914&folder=&organizationId=&j=bq:US:bquxjob_451482fc_1720b3f21bd&page=queryresults
and to map zips to stations: https://get-weather-data.readthedocs.io/en/latest/zip2ws.html
'''

In [9]:

# Load zip lat-long file
latlonmap = pd.read_csv("us-zip-code-latitude-and-longitude.csv",sep=";")

# Load and combine the raw stataion weatherdatasets
w1 = pd.read_csv("2151907.csv")
w2 = pd.read_csv("2151908.csv")
w3 = pd.read_csv("2151909.csv")
drop_cols = ['PSUN','TSUN','WT05','WT09','WT07']
w1 = w1.drop(drop_cols,axis=1)
w2 = w2.drop(drop_cols[:4],axis=1)
weather_stations_df = pd.concat([w1,w2,w3],axis=0)

# Drop stations without temp readings
weather_stations_df.dropna(axis=0,subset=['TMIN','TMAX'],inplace=True)

In [10]:
# Finding the closest station
# Create df of stations and their lat-long for finding closest
stationlatlon = weather_stations_df.groupby('STATION').max()[['LATITUDE','LONGITUDE']]

# Vlookup lat lon for each zip code, for finding closest station
ziplatlon = zip2fips.merge(latlonmap,left_on='ZIP',right_on='Zip')[['ZIP','FIPS','Latitude','Longitude']]    

# Calculate and map the closest station for each zip code
closest_station=[]
for index,row in ziplatlon.iterrows():

    st_dist = []
    for st_index,station in stationlatlon.iterrows():
        deglen = 110.25
        x = station['LATITUDE']-row['Latitude']
        y = (station['LONGITUDE']-row['Longitude'])*np.cos(row['Latitude']*np.pi/180)
        dist = deglen*np.sqrt(x**2+y**2)
        st_dist.append([station.name,dist])
    st_dist_df = pd.DataFrame(st_dist)
    c_s = st_dist_df.loc[st_dist_df[1].idxmin()][0]
    closest_station.append(c_s)
ziplatlon['closest_station']=closest_station

In [11]:
# Combine station weather data and zip list
weather_df = ziplatlon.merge(weather_stations_df,left_on="closest_station",right_on="STATION")

In [12]:
# Map days to their weeks
weather_df['week'] = [datetime.strptime(dat,"%Y-%m-%d").isocalendar()[1] for dat in weather_df['DATE']]
# fill nan's for weather fields
weather_df[['SNOW','PRCP','WT01','WT02','WT03','WT04','WT06','WT08','WT11']]= \
weather_df[['SNOW','PRCP','WT01','WT02','WT03','WT04','WT06','WT08','WT11']].fillna(0)

#Group by week and create agg features: min temp ,max temp, sum of precip, 
weather_shaped = weather_df.groupby(['ZIP','week']).agg({'TMIN': 'min', 
                                'TMAX': 'max',
                                'PRCP':'sum',
                                'SNOW':'sum',
                                'WT01':'sum', 
                                'WT02':'sum',
                                'WT03':'sum', 
                                'WT04':'sum', 
                                'WT06':'sum', 
                                'WT08':'sum', 
                                'WT11':'sum'})

# Reset index so that ZIP and week are columns
weather_shaped.reset_index(inplace=True)

## Combine data!

In [26]:
est_by_zip_naics.head()

naics,num_biz_------,num_biz_11----,num_biz_113,num_biz_1131,num_biz_11311,num_biz_113110,num_biz_1133,num_biz_11331,num_biz_113310,num_biz_114,...,num_biz_813910,num_biz_81392,num_biz_813920,num_biz_81393,num_biz_813930,num_biz_81394,num_biz_813940,num_biz_81399,num_biz_813990,num_biz_99----
zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1001,473.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1002,545.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1003,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0
1004,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1005,94.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
demog_output_df.head()

Unnamed: 0,POPESTIMATE2019,Pop_pct_chg_2019,FIPS,ZIP
0,759297,0.005783,13089,30307
1,759297,0.005783,13089,30315
2,759297,0.005783,13089,30319
3,759297,0.005783,13089,30360
4,759297,0.005783,13089,30306


In [28]:
emp_tot_output.head()

Unnamed: 0,zip,emp,est
0,1001,9891.0,473
1,1002,8482.0,545
2,1003,337.0,21
3,1004,213.0,12
4,1005,1248.0,94


In [31]:
# Combine non-date datasets
def combine_non_date(demog_output_df,emp_tot_output,est_by_zip_naics):
    a = demog_output_df.merge(emp_tot_output,how='left',left_on='ZIP',right_on='zip')
    b = a.merge(est_by_zip_naics,how='left',left_on='ZIP',right_on='zip')
    return b
non_date_features = combine_non_date(demog_output_df,emp_tot_output,est_by_zip_naics)

In [13]:
zip2fips.shape

(603, 3)

In [14]:
print(weather_shaped.shape)
weather_shaped.head()

(7590, 13)


Unnamed: 0,ZIP,week,TMIN,TMAX,PRCP,SNOW,WT01,WT02,WT03,WT04,WT06,WT08,WT11
0,10001,8,20.0,56.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10001,9,25.0,62.0,0.54,0.0,3.0,0.0,0.0,0.0,0.0,1.0,0.0
2,10001,10,35.0,59.0,0.63,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10001,11,40.0,72.0,0.29,0.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0
4,10001,12,33.0,77.0,1.09,0.0,3.0,0.0,0.0,0.0,0.0,2.0,0.0


In [18]:
weather_shaped['ZIP'].value_counts()

10211    13
11416    13
11252    13
11248    13
11244    13
         ..
48222     8
48243     8
70183     4
70141     4
70181     4
Name: ZIP, Length: 595, dtype: int64

In [17]:
nytimes_covid_cases['week'].value_counts()

19    603
15    603
11    603
18    603
14    603
10    603
17    603
13    603
9     603
16    603
12    603
Name: week, dtype: int64

In [15]:
print(nytimes_covid_cases.shape)
nytimes_covid_cases.head()

(6633, 8)


Unnamed: 0,date,county,state,cases,deaths,week,FIPS,ZIP
0,2020-03-14,DeKalb,Georgia,8,0,11,13089,30307
1,2020-03-14,DeKalb,Georgia,8,0,11,13089,30315
2,2020-03-14,DeKalb,Georgia,8,0,11,13089,30319
3,2020-03-14,DeKalb,Georgia,8,0,11,13089,30360
4,2020-03-14,DeKalb,Georgia,8,0,11,13089,30306


In [32]:
non_date_features.head()

Unnamed: 0,POPESTIMATE2019,Pop_pct_chg_2019,FIPS,ZIP,zip,emp,est,num_biz_------,num_biz_11----,num_biz_113,...,num_biz_813910,num_biz_81392,num_biz_813920,num_biz_81393,num_biz_813930,num_biz_81394,num_biz_813940,num_biz_81399,num_biz_813990,num_biz_99----
0,759297,0.005783,13089,30307,30307.0,7201.0,771.0,771.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,759297,0.005783,13089,30315,30315.0,7460.0,400.0,400.0,0.0,0.0,...,0.0,0.0,0.0,6.0,6.0,0.0,0.0,0.0,0.0,0.0
2,759297,0.005783,13089,30319,30319.0,16729.0,1072.0,1072.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
3,759297,0.005783,13089,30360,30360.0,5649.0,447.0,447.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,759297,0.005783,13089,30306,30306.0,6471.0,781.0,781.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
# Combine date datasets
date_features = weather_shaped.merge(nytimes_covid_cases,how='right',on=['ZIP','week'])
date_features.head()

Unnamed: 0,ZIP,week,TMIN,TMAX,PRCP,SNOW,WT01,WT02,WT03,WT04,WT06,WT08,WT11,date,county,state,cases,deaths,FIPS
0,10001,9,25.0,62.0,0.54,0.0,3.0,0.0,0.0,0.0,0.0,1.0,0.0,2020-04-11,New York City,New York,0,0,36061
1,10001,10,35.0,59.0,0.63,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2020-03-07,New York City,New York,12,0,36061
2,10001,11,40.0,72.0,0.29,0.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,2020-03-14,New York City,New York,269,1,36061
3,10001,12,33.0,77.0,1.09,0.0,3.0,0.0,0.0,0.0,0.0,2.0,0.0,2020-03-21,New York City,New York,6226,75,36061
4,10001,13,36.0,69.0,1.67,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,2020-03-28,New York City,New York,30919,825,36061


In [37]:
# Map non-date features onto date_features
feature_output = date_features.merge(non_date_df,how='left',on='ZIP')
feature_output.to_csv("external_features.csv")

In [None]:
# To Do
# Weather missing data
# Look into demographic missing data
# Unacast