In [1]:
import numpy as np
import pandas as pd
import datetime

In [2]:
def naics_employment(week,zipp,naics,feature_output):
    '''
    Look up the establishment counts of one (week, ZIP) pair at every NAICS level.

    Inputs:
        week - compared with `==` against feature_output['week'], so it should have
               the same type as the values stored in that column
        zipp - ZIP code; converted with int() before it is compared with feature_output['ZIP']
        naics - scalar NAICS code; 0 selects the placeholder columns
                'num_biz_00----', 'num_biz_000', ..., 'num_biz_000000'
        feature_output - the dataframe from external_features.csv, or a Dataframe that contains that data

    Return:
        naics_columns: dataframe with one row per row of feature_output matching
        (week, zipp) - empty when nothing matches - holding the establishment count
        by 2,3,4,5, and 6-digit naics codes in the columns
        naics_2_num_biz ... naics_6_num_biz

    Raises:
        KeyError when feature_output lacks a 'num_biz_*' column for one of the prefixes.
    '''
    if naics == 0:
        prefixes = ['00----', '000', '0000', '00000', '000000']
    else:
        code = str(naics)
        # the 2-digit sector column carries a '----' suffix; the other levels do not
        prefixes = [code[:2] + '----', code[:3], code[:4], code[:5], code[:6]]
    source_cols = ['num_biz_' + prefix for prefix in prefixes]
    target_cols = ['naics_2_num_biz', 'naics_3_num_biz', 'naics_4_num_biz',
                   'naics_5_num_biz', 'naics_6_num_biz']

    row_mask = (feature_output['week'] == week) & (feature_output['ZIP'] == int(zipp))
    # .loc + non-inplace rename returns a fresh frame, so neither the caller's
    # dataframe nor an intermediate slice is modified in place.
    naics_columns = feature_output.loc[row_mask, source_cols].rename(
        columns=dict(zip(source_cols, target_cols)))
    return naics_columns

In [3]:
# import data: read the three input tables in one pass, in the same order as before
data, external, npis = (
    pd.read_csv(path)
    for path in ('safegraph.csv.gz',
                 'external_features.zip',
                 'npi_policies_by_zip_updated.csv')
)

In [4]:
# Drop the 'Unnamed: 0' column (presumably an index column saved with the file).
# errors='ignore' keeps this cell re-runnable: a second run no longer raises KeyError.
external = external.drop(columns='Unnamed: 0', errors='ignore')

In [5]:
# Drop the 'Unnamed: 0' column (presumably an index column saved with the file).
# errors='ignore' keeps this cell re-runnable: a second run no longer raises KeyError.
npis = npis.drop(columns='Unnamed: 0', errors='ignore')

In [6]:
# merge data with external features and with NPI features
# 1) left-join the external features on (postal_code, week) == (ZIP, week)
data_with_external = pd.merge(
    data, external,
    how='left',
    left_on=['postal_code', 'week'],
    right_on=['ZIP', 'week'],
)
# 2) left-join the NPI table on postal_code == ZIP
data_with_npi = pd.merge(
    data_with_external, npis,
    how='left',
    left_on='postal_code',
    right_on='ZIP',
)

In [7]:
# create features for percentage of days of the week that interventions were in effect
npi_columns = ['closing_of_public_venues','non-essential_services_closure',
               'school_closure','shelter_in_place','social_distancing']
for col in npi_columns:
    weekcol = col + '_week'
    wdaycol = col + '_weekday'
    start_week = data_with_npi[weekcol]
    current_week = data_with_npi['week']
    # Vectorised form of the per-row rule (conditions are tried in order):
    #   start week before the row's week -> 1.0
    #   start week after the row's week  -> 0.0
    #   otherwise                        -> (7 - weekday) / 7
    # A NaN start week makes both comparisons False, so such rows take the last
    # branch, exactly as a row-wise if/else would.
    data_with_npi[col+'_pct'] = np.select(
        [start_week < current_week, start_week > current_week],
        [1., 0.],
        default=(7 - data_with_npi[wdaycol]) / 7.)

In [8]:
# create naics number of businesses features
naics_feature_cols = ['naics_2_num_biz', 'naics_3_num_biz', 'naics_4_num_biz',
                      'naics_5_num_biz', 'naics_6_num_biz']
n_index = len(data_with_npi.index)
# Every feature starts at 0; rows with naics_code == 0 and rows without a match
# in `external` keep that value.
for feature_col in naics_feature_cols:
    data_with_npi[feature_col] = np.zeros(n_index)
missing_zips = set()
n_missing = 0

# Index `external` by (week, ZIP) once. Each row below then hands only its own
# candidate rows to naics_employment instead of re-scanning the whole table.
# naics_employment still applies its own week/ZIP filter to those candidates.
external_row_positions = external.groupby(['week', 'ZIP']).indices

# NOTE: .loc[i, ...] with i in range(n_index) relies on data_with_npi having the
# default 0..n-1 index.
for i in range(n_index):
    naics_code = data_with_npi.loc[i,'naics_code']
    if naics_code == 0:
        continue  # features were initialised to 0 above
    week = data_with_npi.loc[i,'week']
    postal_code = data_with_npi.loc[i,'postal_code']
    # an unknown (week, ZIP) pair yields an empty frame with the same columns
    candidates = external.iloc[external_row_positions.get((week, int(postal_code)), [])]
    naics_columns = naics_employment(week, postal_code, naics_code, candidates)
    if len(naics_columns)==0:
        # no external row for this week/ZIP: record it and leave the zeros in place
        missing_zips.add(postal_code)
        n_missing += 1
    else:
        for feature_col in naics_feature_cols:
            data_with_npi.loc[i,feature_col] = naics_columns[feature_col].values[0]

In [11]:
# helper columns derived from each NPI column name
npi_columns_week = ['{}_week'.format(name) for name in npi_columns]
npi_columns_wday = ['{}_weekday'.format(name) for name in npi_columns]
# every column of `external` from the third one onwards, plus 'ZIP_x'
external_columns = list(external.columns.values[2:]) + ['ZIP_x']
# assemble the complete list of columns to remove, then drop them in one call
dropcols = list(npi_columns)
dropcols += npi_columns_week
dropcols += npi_columns_wday
dropcols += external_columns
full_dataset = data_with_npi.drop(columns=dropcols)

In [12]:
# save as pickle
# NOTE(review): despite the '.csv.gz' name, this file is a gzip-compressed pickle, not a CSV;
# load it with pd.read_pickle('full_dataset.csv.gz', compression='gzip').
full_dataset.to_pickle('full_dataset.csv.gz', compression='gzip')

## Data with missing values - the only affected rows are those whose (week, zip code) pair has no match in the external features or whose naics code is missing (0); in both cases every naics_x_num_biz column is 0

In [9]:
n_missing

3573

In [13]:
len(missing_zips)

334

In [10]:
missing_zips

{7002,
 7003,
 7006,
 7013,
 7017,
 7024,
 7026,
 7028,
 7030,
 7033,
 7035,
 7036,
 7046,
 7055,
 7057,
 7060,
 7062,
 7063,
 7065,
 7069,
 7070,
 7073,
 7074,
 7075,
 7080,
 7087,
 7090,
 7092,
 7093,
 7094,
 7095,
 7103,
 7105,
 7106,
 7107,
 7112,
 7302,
 7304,
 7305,
 7307,
 7310,
 7403,
 7417,
 7424,
 7430,
 7450,
 7458,
 7503,
 7504,
 7506,
 7508,
 7514,
 7603,
 7604,
 7621,
 7624,
 7627,
 7631,
 7640,
 7642,
 7643,
 7644,
 7646,
 7649,
 7652,
 7656,
 7661,
 7675,
 7677,
 7701,
 7704,
 7716,
 7719,
 7724,
 7732,
 7733,
 7734,
 7735,
 7739,
 7740,
 7746,
 7747,
 7753,
 7758,
 7760,
 7801,
 7850,
 7866,
 7871,
 7928,
 7940,
 7945,
 7950,
 7960,
 8005,
 8006,
 8008,
 8701,
 8722,
 8735,
 8741,
 8742,
 8752,
 8753,
 8755,
 8757,
 8759,
 8805,
 8807,
 8829,
 8830,
 8846,
 8852,
 8861,
 8869,
 8872,
 8876,
 8879,
 8882,
 8901,
 10504,
 10507,
 10510,
 10520,
 10523,
 10530,
 10532,
 10533,
 10536,
 10538,
 10543,
 10552,
 10553,
 10562,
 10566,
 10567,
 10570,
 10580,
 10583,
 10591,


In [14]:
full_dataset[(full_dataset['naics_2_num_biz']==0)&(full_dataset['naics_3_num_biz']==0)\
            &(full_dataset['naics_4_num_biz']==0)&(full_dataset['naics_5_num_biz']==0)\
            &(full_dataset['naics_6_num_biz']==0)]

Unnamed: 0,week,visits_2020,postal_code,naics_code,raw_visitor_counts,median_dwell_2020,num_visitor_country_of_origin,num_visitor_home_cbgs,num_related_same_day_brand_2020,max_hourly_visits,...,closing_of_public_venues_pct,non-essential_services_closure_pct,school_closure_pct,shelter_in_place_pct,social_distancing_pct,naics_2_num_biz,naics_3_num_biz,naics_4_num_biz,naics_5_num_biz,naics_6_num_biz
68,10,183,30080,713990,146,41.0,1,5,9,9,...,,,,,,0.0,0.0,0.0,0.0,0.0
114,10,91,98037,712190,61,51.0,1,1,1,6,...,,,,,,0.0,0.0,0.0,0.0,0.0
156,10,99,11507,712190,63,36.0,1,1,4,6,...,,,,,,0.0,0.0,0.0,0.0,0.0
210,10,74,98201,712190,68,29.5,1,0,8,5,...,,,,,,0.0,0.0,0.0,0.0,0.0
326,10,93,30114,712190,86,22.0,1,3,9,10,...,,,,,,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228780,15,18,7677,712190,16,23.0,1,3,3,3,...,,,,,,0.0,0.0,0.0,0.0,0.0
228826,15,37,7030,712190,25,20.0,1,4,2,3,...,,,,,,0.0,0.0,0.0,0.0,0.0
228846,15,5,8759,713940,5,40.0,1,0,0,1,...,,,,,,0.0,0.0,0.0,0.0,0.0
228904,15,3,12059,812113,2,85.0,0,1,0,1,...,,,,,,0.0,0.0,0.0,0.0,0.0


In [15]:
full_dataset[full_dataset['naics_code']==0]

Unnamed: 0,week,visits_2020,postal_code,naics_code,raw_visitor_counts,median_dwell_2020,num_visitor_country_of_origin,num_visitor_home_cbgs,num_related_same_day_brand_2020,max_hourly_visits,...,closing_of_public_venues_pct,non-essential_services_closure_pct,school_closure_pct,shelter_in_place_pct,social_distancing_pct,naics_2_num_biz,naics_3_num_biz,naics_4_num_biz,naics_5_num_biz,naics_6_num_biz
854,10,67,98104,0,36,153.0,1,0,2,6,...,0.0,0.0,0.428571,0.0,0.571429,0.0,0.0,0.0,0.0,0.0
1048,10,565,30337,0,219,21.0,1,1,5,20,...,0.0,0.0,0.285714,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1349,10,19,11236,0,14,24.0,1,0,1,2,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1417,10,6,48204,0,5,188.5,0,0,2,1,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2953,10,43,30339,0,23,83.0,1,0,6,3,...,0.0,0.0,0.285714,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223083,15,5,98115,0,5,8.0,1,0,0,1,...,1.0,1.0,1.000000,1.0,1.000000,0.0,0.0,0.0,0.0,0.0
224348,15,12,48228,0,4,123.0,0,0,0,1,...,1.0,1.0,1.000000,1.0,1.000000,0.0,0.0,0.0,0.0,0.0
227134,15,7,30313,0,6,9.0,0,3,0,2,...,1.0,1.0,1.000000,1.0,1.000000,0.0,0.0,0.0,0.0,0.0
228222,15,9,30315,0,2,250.0,0,1,0,1,...,1.0,1.0,1.000000,1.0,1.000000,0.0,0.0,0.0,0.0,0.0
