In [1]:
import pandas as pd


In [11]:
"""
File naming convention for ACS 5-year downloads:
state_year_[race|income].csv

"""

def get_filepath(state):
    """Return the relative data directory for ``state``, with a trailing slash."""
    directory_template = '../data/{state}/'
    return directory_template.format(state=state)

def get_filename(state, year, dataset_type):
    """Return the path of one downloaded CSV.

    The file lives in the directory given by ``get_filepath(state)`` and is
    named ``<state>_<year>_<dataset_type>.csv``.
    """
    fields = dict(state=state, year=year, dataset_type=dataset_type)
    basename = '{state}_{year}_{dataset_type}.csv'.format(**fields)
    return get_filepath(state) + basename


# Name of the column every dataset is joined on.
geoid_column_name = 'geoid'

# Identifier columns of a raw download -> the names used in this notebook.
geoid_column_map = dict([
    ('GEO.id2', geoid_column_name),
    ('GEO.display-label', 'geoid display label'),
])

# Race download: raw column code -> readable label.
race_column_map = dict([
    ('HD01_VD01', 'race: total people'),
    ('HD02_VD01', 'race: total people margin of error'),
    ('HD01_VD02', 'race: White'),
    ('HD01_VD03', 'race: Black'),
    ('HD01_VD04', 'race: American Indian and Alaska'),
    ('HD01_VD05', 'race: Asian'),
    ('HD01_VD06', 'race: Native Hawaiian and Other'),
    ('HD01_VD07', 'race: Other'),
    ('HD01_VD08', 'race: 2 or more races'),
])

# Income download: raw column code -> readable label.
# These codes also appear in race_column_map with different meanings, so each
# map must only be applied to a frame read from its own kind of file.
income_column_map = dict([
    ('HD01_VD01', 'median income'),
    ('HD02_VD01', 'median income margin of error'),
])


def preprocess_df(df, year, column_map):
    """Prune and relabel one raw download, returning a new DataFrame.

    Steps, in order:
      1. drop the row labelled ``0`` (for a freshly read CSV this is the
         first row, which holds display labels rather than data);
      2. drop every column that is neither a key of ``column_map`` nor a key
         of the module-level ``geoid_column_map``;
      3. rename data columns to ``"<year> <label>"`` and the geoid columns to
         their names from ``geoid_column_map``;
      4. use the renamed geoid columns as the index.

    The input frame is left untouched, so the same raw frame can be passed
    again (e.g. when a notebook cell is re-run) without raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame as read from the CSV. Not modified.
    year : int or str
        Prefixed (via ``str``) to every renamed data column.
    column_map : dict
        Raw column name -> readable label for the data columns to keep.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the mapped columns, indexed by the renamed
        geoid columns.

    Raises
    ------
    KeyError
        If row label ``0`` or one of the geoid columns is missing from ``df``.
    """
    # Data columns get the year prefix; geoid columns keep year-less names so
    # frames from different years share the same index and can be joined.
    column_name_map = {
        raw_name: str(year) + ' ' + label
        for raw_name, label in column_map.items()
    }
    column_name_map.update(geoid_column_map)

    cols_to_drop = [col for col in df.columns if col not in column_name_map]

    # Every step returns a new object, so the caller's frame is never mutated.
    return (
        df.drop([0])
        .drop(cols_to_drop, axis=1)
        .rename(columns=column_name_map)
        .set_index(list(geoid_column_map.values()))
    )


def preprocess_race_df(df, year):
    """Run ``preprocess_df`` on ``df`` using the race column map."""
    return preprocess_df(df, year, column_map=race_column_map)


def preprocess_income_df(df, year):
    """Run ``preprocess_df`` on ``df`` using the income column map."""
    return preprocess_df(df, year, column_map=income_column_map)


In [12]:
# Try the pipeline on a single state/year: load and preprocess both
# downloads, then join them on their shared geoid index.
race_df = preprocess_race_df(
    pd.read_csv(get_filename('ny', '2017', 'race')), 2017)

income_df = preprocess_income_df(
    pd.read_csv(get_filename('ny', '2017', 'income')), 2017)

# Inner join keeps only the rows present in both frames.
new_df = pd.concat([race_df, income_df], axis=1, join='inner')
new_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,2017 race: total people,2017 race: total people margin of error,2017 race: White,2017 race: Black,2017 race: American Indian and Alaska,2017 race: Asian,2017 race: Native Hawaiian and Other,2017 race: Other,2017 race: 2 or more races,2017 median income,2017 median income margin of error
geoid,geoid display label,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
36001000100,"Census Tract 1, Albany County, New York",1939,260,502,1142,97,16,0,15,167,27250,10262
36001000200,"Census Tract 2, Albany County, New York",4731,676,710,3474,0,104,0,32,411,24091,6623
36001000300,"Census Tract 3, Albany County, New York",5558,646,2589,1539,18,559,0,273,580,33454,15095
36001000401,"Census Tract 4.01, Albany County, New York",2459,266,2183,131,20,92,0,8,25,74255,7279
36001000403,"Census Tract 4.03, Albany County, New York",4695,456,3256,714,27,416,0,52,230,70815,10428


In [13]:
# We merge data into the income df


def add_df(df1, df2):
    """Combine two frames side by side, keeping only index entries present in both."""
    frames = [df1, df2]
    return pd.concat(frames, join='inner', axis=1)

states = ['ny']
years = list(map(str, range(2012, 2018)))

# state -> one wide frame holding every year's income and race columns
states_dfs = {}

for state in states:
    df = None
    for year in years:
        # Income first: for the first year it seeds the combined frame.
        income_df = preprocess_income_df(
            pd.read_csv(get_filename(state, year, 'income')), year)
        df = income_df if df is None else add_df(df, income_df)

        race_df = preprocess_race_df(
            pd.read_csv(get_filename(state, year, 'race')), year)
        df = add_df(df, race_df)

    states_dfs[state] = df

# Number of states processed.
print(len(states_dfs))

1


In [15]:
# Write each state's combined frame to a CSV inside that state's data directory.
for state in states_dfs:
    filename = get_filepath(state) + 'race_and_income_data.csv'
    states_dfs[state].to_csv(filename)

In [18]:
import geopandas as gpd

In [19]:
# Read the census-tract shapefile stored in the 'ny' data directory.
# NOTE: the recorded run of this cell failed with a DriverError because the
# companion .shx file could not be opened next to the .shp.
ny_data_dir = get_filepath('ny')
shapefile_filename = ny_data_dir + 'nyc_census_tracts.shp'
gdf = gpd.read_file(shapefile_filename)
print(gdf)

DriverError: Unable to open ../data/ny/nyc_census_tracts.shx or ../data/ny/nyc_census_tracts.SHX. Set SHAPE_RESTORE_SHX config option to YES to restore or create it.