# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

# Data exploration

In [2]:
census = pd.read_csv("../data/census_zipcode_level.csv");

In [3]:
census.head()

Unnamed: 0,Zip,State,Population,White,Black,Native,Asian,Islander,Other,Two,...,Families,Married_couple_families,Married_couple_child_under_18,Married_couple_no_child_under_18,Single_parent_families,Single_parent_child_under_18,Single_parent_no_child_under_18,Non_families,Househoulder_living_alone,Householder_living_with_unrelated
0,35004,Alabama,10418,8495,1657,0,198,0,0,42,...,3031,2265,820,1445,766,351,415,1194,1090,104
1,35005,Alabama,7708,4432,3130,2,0,2,0,26,...,2103,1540,530,1010,563,229,334,824,707,117
2,35006,Alabama,3099,2849,205,9,0,0,0,15,...,857,675,261,414,182,73,109,332,314,18
3,35007,Alabama,26630,19673,3210,0,251,10,28,416,...,6680,5457,2533,2924,1223,604,619,2055,1802,253
4,35010,Alabama,20826,12247,7048,93,203,0,0,327,...,5605,3586,1088,2498,2019,1121,898,2302,2112,190


In [4]:
census.columns

Index(['Zip', 'State', 'Population', 'White', 'Black', 'Native', 'Asian',
       'Islander', 'Other', 'Two', 'Hispanic', 'Female_0to9_Years',
       'Female_10to19_Years', 'Female_20to29_Years', 'Female_30to39_Years',
       'Female_40to49_Years', 'Female_50to59_Years', 'Female_60to69_Years',
       'Female_70p_Years', 'Male_0to9_Years', 'Male_10to19_Years',
       'Male_20to29_Years', 'Male_30to39_Years', 'Male_40to49_Years',
       'Male_50to59_Years', 'Male_60to69_Years', 'Male_70p_Years',
       'Median_household_inc', 'Households', 'Less_than_24999',
       'From_25000_to_49999', 'From_50000_to_74999', 'From_75000_to_99999',
       'From_100000_to_149999', 'From_150000_to_199999', 'From_200000_or_more',
       'Population_25_Years_and_Over', 'No_Diploma_pct', 'High_School_pct',
       'Some_College_pct', 'Bachelors_Degree_pct', 'Graduate_Degree_pct',
       'Family_Households', 'Family_Poverty_pct', 'labor_over_16_years',
       'Unemployment_Rate_pct', 'Housing_Units', 'Household

Looks like we have:
* population density
* race 
* gender
* gender x age
* household income
* education
* family poverty
* workforce participation
* unemployment
* family types

In [5]:
# check data types
census.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32989 entries, 0 to 32988
Data columns (total 58 columns):
Zip                                  32989 non-null int64
State                                32989 non-null object
Population                           32989 non-null int64
White                                32989 non-null int64
Black                                32989 non-null int64
Native                               32989 non-null int64
Asian                                32989 non-null int64
Islander                             32989 non-null int64
Other                                32989 non-null int64
Two                                  32989 non-null int64
Hispanic                             32989 non-null int64
Female_0to9_Years                    32989 non-null int64
Female_10to19_Years                  32989 non-null int64
Female_20to29_Years                  32989 non-null int64
Female_30to39_Years                  32989 non-null int64
Female_40to49_Years 

In [6]:
# change data types
cols_to_change = ['No_Diploma_pct','High_School_pct','Some_College_pct','Bachelors_Degree_pct',
                  'Graduate_Degree_pct','Family_Poverty_pct','Unemployment_Rate_pct']
for col in cols_to_change:
    census.loc[census[col] == ' -   '] = np.NAN
    census[col] = census[col].astype('float')

In [7]:
# check to make sure it's all good now
census.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32989 entries, 0 to 32988
Data columns (total 58 columns):
Zip                                  26733 non-null float64
State                                26733 non-null object
Population                           26733 non-null float64
White                                26733 non-null float64
Black                                26733 non-null float64
Native                               26733 non-null float64
Asian                                26733 non-null float64
Islander                             26733 non-null float64
Other                                26733 non-null float64
Two                                  26733 non-null float64
Hispanic                             26733 non-null float64
Female_0to9_Years                    26733 non-null float64
Female_10to19_Years                  26733 non-null float64
Female_20to29_Years                  26733 non-null float64
Female_30to39_Years                  26733 non-null 

# Aggregate by 3-digit zip code

In [8]:
census['zip_str'] = census.Zip.astype('str')
census['zip_3'] = census.zip_str.str.replace('\.0','')
census['zip_3'].loc[census['zip_3']!='nan'] = census['zip_3'].loc[census['zip_3']!='nan'].str.pad(width=5, side='left', fillchar='0')
census['zip_3'] = census.zip_3.str.slice(0,3)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [9]:
print('Number of unique 3 digit zip codes: ', len(set(census.zip_3)), 
      '\nNumber of unqiue 5 digit zip codes: ', len(set(census.Zip)), sep='')

Number of unique 3 digit zip codes: 885
Number of unqiue 5 digit zip codes: 32989


`census.zip_3` is now the column to aggregate on. There are supposedly [929 unique 3-digit zip codes in the US](https://healthyalgorithms.com/2015/02/13/how-many-3-digit-zip-codes-are-there/), so we seem to be missing a few here.

Need to handle percent and aggregate columns differently. I'll just gather the percent columns here and later I'll aggregate them differently:
* sum columns: variable sum/total population
* percent columns: variable sum/number of 5-digit zip codes in each 3-digit zip code

In [10]:
pct_cols = []
print('Columns that are percents:')
for col in census.columns:
    if col[-3:] == 'pct':
        print(col)
        pct_cols.append(col)

Columns that are percents:
No_Diploma_pct
High_School_pct
Some_College_pct
Bachelors_Degree_pct
Graduate_Degree_pct
Family_Poverty_pct
Unemployment_Rate_pct


In [11]:
census_zip3_aggregated = census.groupby(['zip_3'])[census.columns ^ ['State','Zip','zip_str']].agg(['sum','count'])

In [12]:
census_zip3_aggregated_new = pd.DataFrame()

# calculate %s by 3-digit zip
for col in census.columns ^ ['zip_3','Population','State','Zip','zip_str','zip_3']:
    census_zip3_aggregated_new[col] = census_zip3_aggregated[col]['sum']/census_zip3_aggregated['Population']['sum']

# adjust percent columns
for col in pct_cols:
    census_zip3_aggregated_new[col] = census_zip3_aggregated[col]['sum']/(100*census_zip3_aggregated['Population']['count'])
    

In [13]:
# define a measure for % nonwhite
census_zip3_aggregated_new['nonwhite_pct'] = 1 - census_zip3_aggregated_new['White']

In [14]:
# quick peak
census_zip3_aggregated_new.sample(5)

Unnamed: 0_level_0,Asian,Bachelors_Degree_pct,Black,Families,Family_Households,Family_Poverty_pct,Female_0to9_Years,Female_10to19_Years,Female_20to29_Years,Female_30to39_Years,...,Population_25_Years_and_Over,Single_parent_child_under_18,Single_parent_families,Single_parent_no_child_under_18,Some_College_pct,Two,Unemployment_Rate_pct,White,labor_over_16_years,nonwhite_pct
zip_3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
149,0.005185,0.12075,0.091243,0.226797,0.226797,0.1575,0.066145,0.066227,0.064451,0.05941,...,0.662191,0.052573,0.077713,0.02514,0.30925,0.038918,0.06075,0.817575,0.435188,0.182425
660,0.027399,0.162933,0.04286,0.248919,0.248919,0.077733,0.067579,0.071299,0.069454,0.06644,...,0.632758,0.030314,0.049041,0.018727,0.33005,0.029048,0.054833,0.824286,0.530824,0.175714
351,0.006566,0.115857,0.13966,0.266728,0.266728,0.104743,0.060995,0.064682,0.059132,0.06267,...,0.68789,0.027939,0.059631,0.031693,0.319971,0.01241,0.072286,0.793788,0.469353,0.206212
56,0.006862,0.2122,0.007315,0.256982,0.256982,0.0728,0.053236,0.059267,0.056466,0.06038,...,0.704596,0.036869,0.057493,0.020624,0.2622,0.017101,0.04244,0.946507,0.560215,0.053493
238,0.015989,0.121047,0.370747,0.253667,0.253667,0.105907,0.058266,0.063675,0.061387,0.062295,...,0.689493,0.041817,0.078309,0.036492,0.297279,0.021865,0.074721,0.534311,0.488765,0.465689


# Save to new csv

In [15]:
census_zip3_aggregated_new.to_csv("../data/census_zip3.csv", index=True)