In [1]:
import numpy as np
import pandas as pd
import pickle

In [2]:
# MEDIAN HOUSEHOLD INCOME (IN 2017 INFLATION-ADJUSTED DOLLARS) - United States 
# https://factfinder.census.gov/faces/nav/jsf/pages/download_center.xhtml
# "I know the dataset or table(s) that I want to download.": "American Community Survey": "2017 ACS 5 Year Estimates":
# "State - 040": "All states within United States & Puerto Rico": Refine by "income": GCT1901

# https://www.economy.com/support/blog/buffet.aspx?did=932EBFA8-D905-4945-A5D5-D02D98113FA4
# Median family income is typically higher than median household income because of the composition of households.
# Family households tend to have more people, and more of those members are in their prime earning years,
# in contrast to members who have lower incomes because they are very young or elderly.

In [3]:
# Read only the four columns used downstream from the ACS GCT1901 extract and
# normalise every header to snake_case (lower-case, spaces -> underscores) in
# a single rename pass.
df_income = (
    pd.read_csv(
        '../data/sources/ACS_17_5YR_GCT1901.US13PR_with_ann.csv',
        usecols=['Target Geo Id2', 'Geographic Area', 'Geographic Area.1', 'Dollar'],
    )
    .rename(columns=lambda label: label.lower().replace(' ', '_'))
)

In [4]:
# State-level rows: state FIPS codes range from 1 (Alabama) to 56 (Wyoming),
# so every row whose id is at or below 56 is kept as a state record.
# The two geographic-area label columns are dropped, the id and dollar columns
# get their final names, and the income column is cast to int.
df_states_income = (
    df_income.loc[df_income['target_geo_id2'] <= 56]
    .drop(columns=['geographic_area', 'geographic_area.1'])
    .rename(columns={'target_geo_id2': 'state_fip', 'dollar': 'median_hh_income'})
    .astype({'median_hh_income': int})
)

In [5]:
# City-level rows: everything whose id is above the highest state FIPS code
# (56) is treated as a city record; the state rows are thereby dropped.
# The geographic-area label columns are kept here, and the id is stored as str.
# NOTE(review): an id > 56 filter would also keep a state-level row for
#   Puerto Rico (FIPS 72) if the extract contains one -- confirm against the data.
# NOTE(review): casting a numeric id to str cannot restore leading zeros, and
#   a float-typed column would yield a trailing ".0" -- verify the id dtype.
df_cities_income = (
    df_income.loc[df_income['target_geo_id2'] > 56]
    .rename(columns={'dollar': 'median_hh_income', 'target_geo_id2': 'state_city_id'})
    .astype({'state_city_id': str})
)

In [6]:
# Persist both income frames as pickles for downstream notebooks.
# Each file is opened in a `with` block so the handle is flushed and closed
# deterministically instead of being left for the garbage collector.
with open('../data/df_states_income.pkl', 'wb') as states_file:
    pickle.dump(df_states_income, states_file)

with open('../data/df_cities_income.pkl', 'wb') as cities_file:
    pickle.dump(df_cities_income, cities_file)