# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

# Data exploration

In [2]:
census = pd.read_csv("../data/census_zipcode_level.csv");

In [3]:
census.head()

Unnamed: 0,Zip,State,Population,White,Black,Native,Asian,Islander,Other,Two,...,Families,Married_couple_families,Married_couple_child_under_18,Married_couple_no_child_under_18,Single_parent_families,Single_parent_child_under_18,Single_parent_no_child_under_18,Non_families,Househoulder_living_alone,Householder_living_with_unrelated
0,35004,Alabama,10418,8495,1657,0,198,0,0,42,...,3031,2265,820,1445,766,351,415,1194,1090,104
1,35005,Alabama,7708,4432,3130,2,0,2,0,26,...,2103,1540,530,1010,563,229,334,824,707,117
2,35006,Alabama,3099,2849,205,9,0,0,0,15,...,857,675,261,414,182,73,109,332,314,18
3,35007,Alabama,26630,19673,3210,0,251,10,28,416,...,6680,5457,2533,2924,1223,604,619,2055,1802,253
4,35010,Alabama,20826,12247,7048,93,203,0,0,327,...,5605,3586,1088,2498,2019,1121,898,2302,2112,190


In [4]:
census.columns

Index(['Zip', 'State', 'Population', 'White', 'Black', 'Native', 'Asian',
       'Islander', 'Other', 'Two', 'Hispanic', 'Female_0to9_Years',
       'Female_10to19_Years', 'Female_20to29_Years', 'Female_30to39_Years',
       'Female_40to49_Years', 'Female_50to59_Years', 'Female_60to69_Years',
       'Female_70p_Years', 'Male_0to9_Years', 'Male_10to19_Years',
       'Male_20to29_Years', 'Male_30to39_Years', 'Male_40to49_Years',
       'Male_50to59_Years', 'Male_60to69_Years', 'Male_70p_Years',
       'Median_household_inc', 'Households', 'Less_than_24999',
       'From_25000_to_49999', 'From_50000_to_74999', 'From_75000_to_99999',
       'From_100000_to_149999', 'From_150000_to_199999', 'From_200000_or_more',
       'Population_25_Years_and_Over', 'No_Diploma_pct', 'High_School_pct',
       'Some_College_pct', 'Bachelors_Degree_pct', 'Graduate_Degree_pct',
       'Family_Households', 'Family_Poverty_pct', 'labor_over_16_years',
       'Unemployment_Rate_pct', 'Housing_Units', 'Household

Looks like we have:
* population density
* race 
* gender
* gender x age
* household income
* education
* family poverty
* workforce participation
* unemployment
* family types

In [5]:
# check data types
census.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32989 entries, 0 to 32988
Data columns (total 58 columns):
Zip                                  32989 non-null int64
State                                32989 non-null object
Population                           32989 non-null int64
White                                32989 non-null int64
Black                                32989 non-null int64
Native                               32989 non-null int64
Asian                                32989 non-null int64
Islander                             32989 non-null int64
Other                                32989 non-null int64
Two                                  32989 non-null int64
Hispanic                             32989 non-null int64
Female_0to9_Years                    32989 non-null int64
Female_10to19_Years                  32989 non-null int64
Female_20to29_Years                  32989 non-null int64
Female_30to39_Years                  32989 non-null int64
Female_40to49_Years 

In [6]:
# change data types
cols_to_change = ['No_Diploma_pct','High_School_pct','Some_College_pct','Bachelors_Degree_pct',
                  'Graduate_Degree_pct','Family_Poverty_pct','Unemployment_Rate_pct']
for col in cols_to_change:
    census.loc[census[col] == ' -   '] = np.NAN
    census[col] = census[col].astype('float')

In [7]:
# check to make sure it's all good now
census.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32989 entries, 0 to 32988
Data columns (total 58 columns):
Zip                                  26733 non-null float64
State                                26733 non-null object
Population                           26733 non-null float64
White                                26733 non-null float64
Black                                26733 non-null float64
Native                               26733 non-null float64
Asian                                26733 non-null float64
Islander                             26733 non-null float64
Other                                26733 non-null float64
Two                                  26733 non-null float64
Hispanic                             26733 non-null float64
Female_0to9_Years                    26733 non-null float64
Female_10to19_Years                  26733 non-null float64
Female_20to29_Years                  26733 non-null float64
Female_30to39_Years                  26733 non-null 

# Aggregate by 3-digit zip code

In [8]:
census['zip_str'] = census.Zip.astype(str)
census['zip_3'] = census.zip_str.str.slice(0, 3)

In [9]:
print('Number of unique 3 digit zip codes: ', len(set(census.zip_3)), 
      '\nNumber of unqiue 5 digit zip codes: ', len(set(census.Zip)), sep='')

Number of unique 3 digit zip codes: 841
Number of unqiue 5 digit zip codes: 32989


`census.zip_3` is now the column to aggregate on.

Need to handle percent and aggregate columns differently. I'll just gather the percent columns here and later I'll aggregate them differently:
* sum columns: variable sum/total population
* percent columns: variable sum/number of 5-digit zip codes in each 3-digit zip code

In [10]:
pct_cols = []
print('Columns that are percents:')
for col in census.columns:
    if col[-3:] == 'pct':
        print(col)
        pct_cols.append(col)

Columns that are percents:
No_Diploma_pct
High_School_pct
Some_College_pct
Bachelors_Degree_pct
Graduate_Degree_pct
Family_Poverty_pct
Unemployment_Rate_pct


In [11]:
census_zip3_aggregated = census.groupby(['zip_3'])[census.columns ^ ['State','Zip','zip_str','zip_3']].agg(['sum','count']).reset_index()

In [12]:
census_zip3_aggregated_new = pd.DataFrame()

# calculate %s by 3-digit zip
for col in census.columns ^ ['zip_3','Population','State','Zip','zip_str']:
    census_zip3_aggregated_new[col] = census_zip3_aggregated[col]['sum']/census_zip3_aggregated['Population']['sum']

# adjust percent columns
for col in pct_cols:
    census_zip3_aggregated_new[col] = census_zip3_aggregated[col]['sum']/(100*census_zip3_aggregated['Population']['count'])
    

In [13]:
# define a measure for % nonwhite
census_zip3_aggregated_new['nonwhite_pct'] = 1 - census_zip3_aggregated_new['White']

In [14]:
# quick peak
census_zip3_aggregated_new.sample(5)

Unnamed: 0,Asian,Bachelors_Degree_pct,Black,Families,Family_Households,Family_Poverty_pct,Female_0to9_Years,Female_10to19_Years,Female_20to29_Years,Female_30to39_Years,...,Population_25_Years_and_Over,Single_parent_child_under_18,Single_parent_families,Single_parent_no_child_under_18,Some_College_pct,Two,Unemployment_Rate_pct,White,labor_over_16_years,nonwhite_pct
148,0.060202,0.080437,0.047555,0.253225,0.253225,0.246312,0.063554,0.098524,0.061172,0.044815,...,0.647892,0.024283,0.048019,0.023735,0.1715,0.018613,0.137875,0.838849,0.37422,0.161151
643,0.038425,0.149421,0.118953,0.247706,0.247706,0.111228,0.073486,0.070407,0.065699,0.070224,...,0.638776,0.035146,0.064783,0.029637,0.318895,0.016714,0.077579,0.485092,0.494089,0.514908
40,0.01236,0.135654,0.024814,0.262096,0.262096,0.068904,0.051342,0.062765,0.054863,0.052316,...,0.710592,0.028693,0.053765,0.025072,0.318942,0.012683,0.056481,0.905538,0.518994,0.094462
173,0.033381,0.187063,0.27615,0.239299,0.239299,0.13925,0.061572,0.064618,0.084004,0.067543,...,0.663954,0.048698,0.085184,0.036486,0.270813,0.021701,0.081937,0.554319,0.510777,0.445681
592,0.008816,0.111976,0.190096,0.255258,0.255258,0.14439,0.066987,0.061566,0.073637,0.06921,...,0.671752,0.040045,0.071048,0.031003,0.263122,0.020817,0.072317,0.734165,0.461466,0.265835


# Save to new csv

In [15]:
census_zip3_aggregated_new.to_csv("../data/census_zip3.csv")