# ACS data

Let's start by reading the data!

In [21]:
import pandas as pd

# Forward slashes keep the relative path valid on Windows, macOS and Linux
# (a backslash-separated path only resolves on Windows).
demo = pd.read_csv('../data/raw/nyc_acs_demographics.csv')
demo.head()

Unnamed: 0.1,Unnamed: 0,geoid,acs_demog_var,value
0,0,360050001001,total_pop,7080.0
1,1,360050001001,two_or_more_races_pop,41.0
2,2,360050001001,vacant_housing_units_for_rent,0.0
3,3,360050001001,unemployed_pop,0.0
4,4,360050001001,black_pop,3984.0


The variables are stacked in the acs_demog_var column; we should pivot them so that each variable gets its own column.

In [22]:
# Reshape from long (one row per geoid/variable pair) to wide (one row per geoid,
# one column per ACS variable). The long table loaded above is named `demo`;
# there is no `acs` variable in this notebook.
demo = demo.pivot(index='geoid', columns='acs_demog_var', values='value')
demo.describe()

acs_demog_var,aggregate_travel_time_to_work,amerindian_pop,asian_pop,black_pop,commuters_16_over,commuters_by_public_transportation,employed_pop,families_with_young_children,family_households,female_pop,...,other_race_pop,owner_occupied_housing_units_median_value,pop_16_over,renter_occupied_housing_units_paying_cash_median_gross_rent,total_pop,two_or_more_races_pop,unemployed_pop,vacant_housing_units,vacant_housing_units_for_rent,white_pop
count,362.0,6339.0,6339.0,6339.0,6339.0,6339.0,6339.0,6339.0,6339.0,6339.0,...,6339.0,4556.0,6339.0,5807.0,6339.0,6339.0,6339.0,6339.0,6339.0,6339.0
mean,34616.422652,2.380186,185.57517,295.618394,602.89888,352.085029,644.844613,98.536362,300.488405,702.815271,...,11.458116,687288.1,1091.11516,1523.152574,1343.245149,25.833728,47.946364,50.4761,12.062786,432.200663
std,20381.77003,13.583396,285.281508,436.053112,336.115082,248.154204,356.828817,87.249059,158.29653,358.60371,...,43.207546,383004.0,539.026034,535.556652,658.446589,43.511631,47.769236,68.955807,24.462099,486.881898
min,6450.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,9999.0,0.0,229.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,21785.0,0.0,10.0,7.0,397.0,191.0,425.0,39.0,200.0,472.0,...,0.0,443000.0,747.0,1232.0,918.0,0.0,15.0,11.0,0.0,46.0
50%,30877.5,0.0,72.0,79.0,552.0,310.0,588.0,79.0,278.0,648.0,...,0.0,598150.0,1013.0,1442.0,1249.0,9.0,36.0,34.0,0.0,281.0
75%,40933.75,0.0,237.5,451.0,751.5,463.0,800.0,136.0,374.0,863.0,...,0.0,828800.0,1332.0,1728.0,1648.0,35.0,67.0,65.0,17.0,674.0
max,195980.0,274.0,3960.0,4744.0,6483.0,4849.0,6767.0,1030.0,2250.0,5037.0,...,721.0,2000001.0,7881.0,3501.0,8830.0,766.0,575.0,1513.0,467.0,6111.0


In [23]:
# Per-column dtype and non-null count: a quick way to spot columns with missing data.
demo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6339 entries, 360050001001 to 361190055004
Data columns (total 33 columns):
 #   Column                                                       Non-Null Count  Dtype  
---  ------                                                       --------------  -----  
 0   aggregate_travel_time_to_work                                362 non-null    float64
 1   amerindian_pop                                               6339 non-null   float64
 2   asian_pop                                                    6339 non-null   float64
 3   black_pop                                                    6339 non-null   float64
 4   commuters_16_over                                            6339 non-null   float64
 5   commuters_by_public_transportation                           6339 non-null   float64
 6   employed_pop                                                 6339 non-null   float64
 7   families_with_young_children                               

There is missing data in some of the columns:

In [24]:
# Names of the columns that contain at least one missing value.
# The mask must be built from `demo` itself so it aligns with `demo.columns`.
demo.columns[demo.isna().any()].tolist()

['aggregate_travel_time_to_work',
 'income_per_capita',
 'median_age',
 'median_income',
 'median_rent',
 'median_year_structure_built',
 'owner_occupied_housing_units_median_value',
 'renter_occupied_housing_units_paying_cash_median_gross_rent']

Different strategies can be used to fill this data, like using the mean of the total population. Since we have location data, it seems more interesting to fill with the mean of the adjacent blocks. We'll exclude the aggregate_travel_time_to_work column since it's mostly empty (only 362 of 6339 values are present).

In [25]:
import geopandas
import numpy as np

# Forward slashes keep the relative path valid on Windows, macOS and Linux
# (a backslash-separated path only resolves on Windows).
block = geopandas.read_file('../data/raw/nyc_cbg_geoms.geojson')
# Cast geoid to int64 so it has the same integer type as the geoid index
# of the ACS table it is merged with.
block['geoid'] = block['geoid'].astype(np.int64)

In [26]:
# Remove the travel-time column (too sparse to impute), attach each block
# group's geometry, and key the merged table by geoid.
demo_without_travel = demo.drop(columns='aggregate_travel_time_to_work')
demo = block.merge(demo_without_travel, how='inner', on='geoid').set_index('geoid')

In [67]:
# Fill each missing value with the mean of the same column over the block
# groups whose geometry touches or overlaps the current one.
# Notes:
# - `row` is the full record (attributes and geometry), not only the geometry.
# - The block group itself is part of its own neighbourhood, but its missing
#   value is NaN and `mean()` skips NaN, so it does not affect the result.
# - Values are written back into `demo` as the loop advances, so a value filled
#   earlier can contribute to the mean of a block group processed later.
# - If no neighbour has a value for the column, the cell stays NaN.
for index, row in demo.iterrows():
    null_cols = row[row.isnull()].index.to_list()
    if not null_cols:
        continue
    # "Not disjoint" means the two geometries share at least one point.
    is_neighbor = ~demo['geometry'].disjoint(row['geometry'])
    for col in null_cols:
        demo.at[index, col] = demo.loc[is_neighbor, col].mean()

In [76]:
# Count of remaining missing values per column after the neighbour-based fill,
# followed by the names of the columns that still contain gaps.
# The filled table is `demo`; there is no `acs` variable in this notebook.
print(demo.isna().sum(axis=0))
demo.columns[demo.isna().any()].tolist()

geometry                                                        0
amerindian_pop                                                  0
asian_pop                                                       0
black_pop                                                       0
commuters_16_over                                               0
commuters_by_public_transportation                              0
employed_pop                                                    0
families_with_young_children                                    0
family_households                                               0
female_pop                                                      0
hispanic_pop                                                    0
households                                                      0
housing_units                                                   0
housing_units_renter_occupied                                   0
income_per_capita                                               3
male_pop  

['income_per_capita',
 'median_age',
 'median_income',
 'median_rent',
 'median_year_structure_built',
 'owner_occupied_housing_units_median_value',
 'renter_occupied_housing_units_paying_cash_median_gross_rent']

There are still some missing values; we will fill them with the column mean later, when we build the pipeline.