# Calculate Data for the Surroundings of each zip code.

## 0. Initialize

In [66]:
#Libraries and Settings
import pandas as pd
import numpy as np

# Display settings: never truncate the columns of a printed dataframe,
# but show at most 10 rows of it.
pd.options.display.max_columns = None
pd.options.display.max_rows = 10

In [59]:
# Load the five prepared CSV files into dataframes and report their shapes.
file_1='/Users/c32/Documents/NYCDSA/Projects/DATA/Ready_Data/2_Merge_Census.csv'
file_2='/Users/c32/Documents/NYCDSA/Projects/DATA/Ready_Data/2_Merge_Irs.csv'
file_3='/Users/c32/Documents/NYCDSA/Projects/DATA/Ready_Data/1_Load_Walmart.csv'
file_4='/Users/c32/Documents/NYCDSA/Projects/DATA/Ready_Data/1_Load_Costco.csv'
file_5='/Users/c32/Documents/NYCDSA/Projects/DATA/Ready_Data/1_Load_Geographic_Data.csv'

# The 'zip' column is passed through str() so it is kept as text instead of being parsed as a number.
zip_as_text = {'zip': str}

# Files are read in the order census, irs, walmart, costco, geo.
census, irs, walmart, costco, geo = (
    pd.read_csv(csv_path, converters=zip_as_text)
    for csv_path in (file_1, file_2, file_3, file_4, file_5)
)

# Print (rows, columns) for each dataframe as a quick sanity check.
for label, frame in (
    ('census: ', census),
    ('irs: ', irs),
    ('walmart: ', walmart),
    ('costco: ', costco),
    ('geo: ', geo),
):
    print(label, frame.shape)

census:  (33971, 127)
irs:  (27935, 28)
walmart:  (4654, 9)
costco:  (604, 5)
geo:  (33129, 12)


## 1. Get List of Zip Codes
The calculations only need to be performed for zip codes that contain a store, not for every existing zip code.

In [60]:
# Distinct zip codes that have at least one Walmart or Costco store.
zip_codes = set(walmart['zip']) | set(costco['zip'])
# 4472: this is smaller than the total number of stores, meaning some zip codes
# hold more than one Walmart and/or Costco.
len(zip_codes)

# Are there any store zip codes that are not present in the master (geo) list?
master_zip_codes = set(geo['zip'])
print(zip_codes.difference(master_zip_codes))


set()


## 2. Calculate the Appropriate Aggregate functions for each metric
In this part the following is done:
1. Go through every zip code that has a store.
2. Gather the data for all the surrounding zip codes that are within 9 miles.
3. Aggregate that data into a single row.
4. Put all those rows together and save the results.

### 2.1 Aggregate Metrics for Census

In [61]:
# Build agg_census: one row per store zip code, aggregating the census data of the
# zip codes listed in that store's 'zips_around' entry in geo.
#   - plain columns are summed over the surrounding zip codes,
#   - 'ratio' columns are recomputed from the summed values and the summed population,
#   - 'delta' columns are recomputed as the aggregated '_21' value minus the '_11' value.

# Map each zip code to its raw 'zips_around' string once, instead of scanning geo on
# every iteration. drop_duplicates keeps the first row per zip, which is the row the
# former `.values[0]` lookup returned.
zips_around_by_zip = geo.drop_duplicates(subset='zip').set_index('zip')['zips_around'].to_dict()

# Dropping incomplete rows does not depend on the store zip code, so do it once up front.
census_complete = census.dropna()

# The first column is skipped below — presumably it is 'zip'; confirm against the input file.
columns = census_complete.columns.tolist()

# 'zips_around' holds a list stored as text; deleting every "[", "]" and "'" leaves the
# zip codes separated by ", ".
list_punctuation = str.maketrans('', '', "[]'")

# Rows are collected in a list and turned into a dataframe once at the end; calling
# pd.concat inside the loop copies the whole accumulated frame on every iteration.
census_rows = []

# sorted() makes the row order reproducible (iteration order of a set of strings is not).
for zip_i in sorted(zip_codes):
    # list of surrounding zips for this particular zip_i
    surr_zip = zips_around_by_zip[zip_i].translate(list_punctuation).split(', ')

    # keep only the census rows of the surrounding zips
    t_census = census_complete[census_complete['zip'].isin(surr_zip)]

    # one output row for zip_i
    t_agg_census = {'zip': zip_i}

    # Columns are processed in file order: derived columns read values stored earlier in
    # t_agg_census, so the summed base columns must come before them in the file.
    for c in columns[1:]:
        is_delta = 'delta' in c
        is_ratio = 'ratio' in c

        if not is_delta and not is_ratio:
            # plain metric: total over the surrounding zips
            t_agg_census[c] = t_census[c].sum()
        elif is_ratio and not is_delta:
            # ratio for one period: aggregated metric divided by the aggregated
            # population of the period named in the column ('21' or '11')
            c_name = c.replace('_ratio', '')
            if '21' in c:
                t_agg_census[c] = t_agg_census[c_name] / t_agg_census['total_pop_21']
            if '11' in c:
                t_agg_census[c] = t_agg_census[c_name] / t_agg_census['total_pop_11']
        elif is_delta and not is_ratio:
            # change between the two periods
            c_name = c.replace('delta_', '')
            t_agg_census[c] = t_agg_census[c_name + '_21'] - t_agg_census[c_name + '_11']
        else:
            # change between the two periods, relative to the '11' population
            c_name = c.replace('_ratio', '').replace('delta_', '')
            t_agg_census[c] = (
                t_agg_census[c_name + '_21'] - t_agg_census[c_name + '_11']
            ) / t_agg_census['total_pop_11']

    census_rows.append(t_agg_census)

# The result gets a 0..n-1 index (previously every row carried index 0).
agg_census = pd.DataFrame(census_rows)
    

### 2.2 Aggregate Metrics for IRS

In [62]:
# Build agg_irs: one row per store zip code, aggregating the IRS data of the zip codes
# listed in that store's 'zips_around' entry in geo.
#   - plain columns are summed over the surrounding zip codes,
#   - '_income' columns and per-period 'ratio' columns become averages weighted by the
#     number of returns,
#   - 'delta' columns are recomputed as the aggregated '_21' value minus the '_11' value.


def _weighted_average(frame, value_col, weight_col):
    """Return the average of frame[value_col] weighted by frame[weight_col].

    Returns NaN when the weights add up to 0 (which includes an empty frame) or to NaN,
    so the caller always receives a value to store.
    """
    sum_weights = frame[weight_col].sum()
    if sum_weights == 0 or np.isnan(sum_weights):
        return np.nan
    return (frame[value_col] * frame[weight_col]).sum() / sum_weights


# Map each zip code to its raw 'zips_around' string once, instead of scanning geo on
# every iteration. drop_duplicates keeps the first row per zip, which is the row the
# former `.values[0]` lookup returned.
zips_around_by_zip = geo.drop_duplicates(subset='zip').set_index('zip')['zips_around'].to_dict()

# Dropping incomplete rows does not depend on the store zip code, so do it once up front.
irs_complete = irs.dropna()

# The first column is skipped below — presumably it is 'zip'; confirm against the input file.
columns = irs_complete.columns.tolist()

# 'zips_around' holds a list stored as text; deleting every "[", "]" and "'" leaves the
# zip codes separated by ", ".
list_punctuation = str.maketrans('', '', "[]'")

# Rows are collected in a list and turned into a dataframe once at the end; calling
# pd.concat inside the loop copies the whole accumulated frame on every iteration.
irs_rows = []

# sorted() makes the row order reproducible (iteration order of a set of strings is not).
for zip_i in sorted(zip_codes):
    # list of surrounding zips for this particular zip_i
    surr_zip = zips_around_by_zip[zip_i].translate(list_punctuation).split(', ')

    # keep only the IRS rows of the surrounding zips
    t_irs = irs_complete[irs_complete['zip'].isin(surr_zip)]

    # one output row for zip_i
    t_agg_irs = {'zip': zip_i}

    # Columns are processed in file order: delta columns read values stored earlier in
    # t_agg_irs, so the '_21'/'_11' columns must come before them in the file.
    for c in columns[1:]:
        is_delta = 'delta' in c
        is_ratio = 'ratio' in c
        is_income = '_income' in c

        if is_income:
            # Always stores a value, NaN included. Previously the assignment sat inside
            # the per-row loop, so a store with no surrounding IRS rows never got this key.
            # NOTE(review): the weights are total_returns_21 even for columns of the '11'
            # period, unlike the ratio columns below — confirm this is intended.
            t_agg_irs[c] = _weighted_average(t_irs, c, 'total_returns_21')

        # An '_income' column can also match one of the branches below, in which case the
        # value stored above is overwritten (same as the original sequence of ifs).
        if is_ratio and is_delta:
            # change of a ratio between the two periods
            c_name = c.replace('delta_', '')
            t_agg_irs[c] = t_agg_irs[c_name + '_21'] - t_agg_irs[c_name + '_11']
        elif is_ratio:
            # ratio for one period: average weighted by that period's number of returns
            if '21' in c:
                t_agg_irs[c] = _weighted_average(t_irs, c, 'total_returns_21')
            if '11' in c:
                t_agg_irs[c] = _weighted_average(t_irs, c, 'total_returns_11')
        elif is_delta:
            # change between the two periods
            c_name = c.replace('delta_', '')
            t_agg_irs[c] = t_agg_irs[c_name + '_21'] - t_agg_irs[c_name + '_11']
        elif not is_income:
            # plain metric: total over the surrounding zips
            t_agg_irs[c] = t_irs[c].sum()

    irs_rows.append(t_agg_irs)

# The result gets a 0..n-1 index (previously every row carried index 0).
agg_irs = pd.DataFrame(irs_rows)

         

### *!! A few NaNs were created for stores in Puerto Rico because no IRS data was available for them.*
Those stores will probably be excluded from the study in later steps.


In [64]:
# Total number of missing cells in the aggregated census data
census_null_count = agg_census.isna().to_numpy().sum()
print(census_null_count)

# Total number of missing cells in the aggregated IRS data (nulls for PR zip codes)
irs_null_count = agg_irs.isna().to_numpy().sum()
print(irs_null_count)

0
528


## 3. Save the Calculated Data

In [71]:
# Write both aggregated dataframes as CSV files (no index column) into the output folder.
import os

outdir = '/Users/c32/Documents/NYCDSA/Projects/DATA/Ready_Data'

# makedirs also creates missing parent folders, and exist_ok=True removes the need for a
# separate (race-prone) os.path.exists check.
os.makedirs(outdir, exist_ok=True)

for outname, frame in (
    ('2_Census_Surrounding_Data.csv', agg_census),
    ('2_Irs_Surrounding_Data.csv', agg_irs),
):
    fullname = os.path.join(outdir, outname)
    frame.to_csv(fullname, header=True, index=False)
    print("Saved!")

Saved!
Saved!
