# Create census db

In [2]:
## Python packages - you may have to pip install sqlalchemy, sqlalchemy_utils, and psycopg2.
import os

import pandas as pd
import psycopg2
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database

In [3]:
## Connection settings. Each value may be overridden through an environment
## variable so that real credentials do not have to be saved in the notebook;
## the fallbacks are the original local-development values.
dbname = os.environ.get('CENSUS_DB_NAME', 'census')
username = os.environ.get('CENSUS_DB_USER', 'along528')
pswd = os.environ.get('CENSUS_DB_PASSWORD', 'password')
## 'engine' is a connection to a database
## Here, we're using postgres, but sqlalchemy can connect to other things too.
engine = create_engine('postgresql://%s:%s@localhost/%s'%(username,pswd,dbname))
## print() with a single argument behaves the same under Python 2 and 3.
## NOTE(review): the printed URL can include the password in clear text --
## clear this cell's output before sharing the notebook.
print(engine.url)

postgresql://along528:password@localhost/census


In [4]:
## Create the database on first use, then print whether it exists as a check.
target_url = engine.url
if not database_exists(target_url):
    create_database(target_url)
print(database_exists(target_url))

True


# Incarceration, race and income data

In [155]:
def _to_numeric_where_possible(column):
    """Convert one column to numbers, leaving it untouched if nothing parses.

    Values that cannot be parsed as numbers become NaN.
    NOTE(review): the removed ``convert_objects(convert_numeric=True)`` call
    appears to have kept a column as-is when no value in it was numeric; that
    is reproduced here -- confirm against the pandas version originally used.
    """
    converted = pd.to_numeric(column, errors='coerce')
    if converted.isnull().all():
        return column
    return converted


def _load_aff_table(csv_path):
    """Read one '*_with_ann.csv' extract and return it indexed by zipcode.

    The first data row of the file supplies the column names (colons removed,
    lower-cased) and is then dropped. The 'ZCTA5 ' prefix is stripped from the
    'GEO.display-label' column, which becomes the 'zipcode' index, and the
    'GEO.id' / 'GEO.id2' columns are discarded. The remaining columns are
    converted to numbers where possible and missing values are set to 0.
    """
    df = pd.read_csv(csv_path)
    df['GEO.display-label'] = df['GEO.display-label'].map(lambda x: x.replace('ZCTA5 ',''))
    # Use .loc so the write lands in the frame itself; the chained form
    # df[col][0] = ... may assign to a temporary copy instead.
    df.loc[0, 'GEO.display-label'] = 'zipcode'
    df = df.drop(['GEO.id','GEO.id2'],axis=1)
    # Row 0 carries the descriptive label for every remaining column.
    labels = df.iloc[0]
    names = {}
    for col in df.columns.values:
        names[col] = labels[col].replace(':','').lower()
    df = df.rename(columns=names)
    # Drop the label row now that it has been promoted to the header.
    df = df[df.index!=0].reset_index(drop=True)
    df = df.set_index('zipcode')
    df = df.apply(_to_numeric_where_possible)
    return df.fillna(0)


path = "/Users/along528/Dropbox/insight/project/datasets/census/incarceration/aff_download/"

## Short table name -> CSV file to load.
csvs = {}
csvs['population_SF1_P1'] = path+'DEC_10_SF1_P1_with_ann.csv'
csvs['population_urban_rural_SF1_P2'] = path+'DEC_10_SF1_P2_with_ann.csv'
csvs['group_quarters_SF1_PCT20'] = path+'DEC_10_SF1_PCT20_with_ann.csv'
csvs['group_quarters_white_SF1_PCT20A'] = path+'DEC_10_SF1_PCT20A_with_ann.csv'
csvs['group_quarters_black_SF1_PCT20B'] = path+'DEC_10_SF1_PCT20B_with_ann.csv'
csvs['population_race_SF1_QTP3'] = path+'DEC_10_SF1_QTP3_with_ann.csv'
## Other DEC_10_SF1 extracts that are referenced but not loaded:
## PCT20F, PCT20G, PCT23, QTH1, QTP1, QTP5, SF1DP1.

path = "/Users/along528/Dropbox/insight/project/datasets/census/income/aff_download/"

csvs['total_household_income_ACS_14_5YR_B19001'] = path+'ACS_14_5YR_B19001_with_ann.csv'
csvs['total_household_income_white_ACS_14_5YR_B19001A'] = path+'ACS_14_5YR_B19001A_with_ann.csv'
csvs['total_household_income_black_ACS_14_5YR_B19001B'] = path+'ACS_14_5YR_B19001B_with_ann.csv'

csvs['median_household_income_ACS_14_5YR_B19013'] = path+'ACS_14_5YR_B19013_with_ann.csv'
csvs['median_household_income_white_ACS_14_5YR_B19013A'] = path+'ACS_14_5YR_B19013A_with_ann.csv'
csvs['median_household_income_black_ACS_14_5YR_B19013B'] = path+'ACS_14_5YR_B19013B_with_ann.csv'

## Short table name -> cleaned DataFrame indexed by zipcode.
dfs = {}
for key in csvs:
    dfs[key] = _load_aff_table(csvs[key])




In [156]:
## Display the cleaned 'population_SF1_P1' table (indexed by zipcode).
dfs['population_SF1_P1']

Unnamed: 0_level_0,total
zipcode,Unnamed: 1_level_1
00601,18570
00602,41520
00603,54689
00606,6615
00610,29016
00612,67010
00616,11017
00617,24597
00622,7853
00623,43061


In [157]:
## Keep only the 'urban' and 'rural' columns of the P2 table, then show it.
urban_rural_key = 'population_urban_rural_SF1_P2'
dfs[urban_rural_key] = dfs[urban_rural_key][['urban', 'rural']]
dfs[urban_rural_key]

Unnamed: 0_level_0,urban,rural
zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1
00601,10679,7891
00602,41520,0
00603,54646,43
00606,2697,3918
00610,25640,3376
00612,62391,4619
00616,10438,579
00617,24161,436
00622,6171,1682
00623,42435,626


In [158]:
## Long PCT20 column labels paired with the short names that replace them,
## listed in the column order to keep. Holding each label exactly once means
## the column selection and the rename cannot drift apart.
_institutionalized = u'institutionalized population (101-106, 201-203, 301, 401-405)'
_adult = _institutionalized + u' - correctional facilities for adults (101-106)'
group_quarters_columns = [
    (_institutionalized, u'institutionalized'),
    (_adult, u'institutionalized_adult'),
    (_adult + u' - federal detention centers (101)', u'institutionalized_adult_federal_detention'),
    (_adult + u' - federal prisons (102)', u'institutionalized_adult_federal_prison'),
    (_adult + u' - state prisons (103)', u'institutionalized_adult_state_prison'),
    (_adult + u' - local jails and other municipal confinement facilities (104)', u'institutionalized_adult_local_jail'),
    (_institutionalized + u' - juvenile facilities (201-203)', u'institutionalized_juvenile'),
]
names = dict(group_quarters_columns)
group_quarters_labels = [long_label for long_label, short_name in group_quarters_columns]

for key in ['group_quarters_SF1_PCT20',
            'group_quarters_white_SF1_PCT20A',
            'group_quarters_black_SF1_PCT20B']:
    # Select and rename in a single expression so the result is a new frame;
    # an in-place rename on a freshly sliced frame can raise
    # SettingWithCopyWarning.
    dfs[key] = dfs[key][group_quarters_labels].rename(columns=names)


In [159]:
## Display the 'group_quarters_black_SF1_PCT20B' table after the column
## selection and renaming done in the previous cell.
dfs['group_quarters_black_SF1_PCT20B']

Unnamed: 0_level_0,institutionalized,institutionalized_adult,institutionalized_adult_federal_detention,institutionalized_adult_federal_prison,institutionalized_adult_state_prison,institutionalized_adult_local_jail,institutionalized_juvenile
zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
00601,0,0,0,0,0,0,0
00602,0,0,0,0,0,0,0
00603,88,87,0,0,87,0,0
00606,0,0,0,0,0,0,0
00610,4,0,0,0,0,0,0
00612,8,6,0,0,6,0,0
00616,4,0,0,0,0,0,0
00617,3,0,0,0,0,0,0
00622,0,0,0,0,0,0,0
00623,1,0,0,0,0,0,0


In [160]:
## Long QTP3 column labels paired with the short names that replace them,
## listed in the column order to keep. Holding each label exactly once means
## the column selection and the rename cannot drift apart.
_race_prefix = u'number; race - total population - '
race_columns = [
    (_race_prefix + u'one race - white', u'population_white'),
    (_race_prefix + u'one race - black or african american', u'population_black'),
    (_race_prefix + u'one race - american indian and alaska native', u'population_native_american'),
    (_race_prefix + u'one race - asian', u'population_asian'),
    (_race_prefix + u'one race - native hawaiian and other pacific islander', u'population_nathaw'),
    (_race_prefix + u'one race - some other race', u'population_other_race'),
    (_race_prefix + u'two or more races', u'population_mult_race'),
]
names = dict(race_columns)
race_labels = [long_label for long_label, short_name in race_columns]

## Select and rename in a single expression so the result is a new frame;
## an in-place rename on a freshly sliced frame can raise
## SettingWithCopyWarning.
dfs['population_race_SF1_QTP3'] = dfs['population_race_SF1_QTP3'][race_labels].rename(columns=names)
dfs['population_race_SF1_QTP3']

Unnamed: 0_level_0,population_white,population_black,population_native_american,population_asian,population_nathaw,population_other_race,population_mult_race
zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
00601,17285,572,65,5,1,402,240
00602,35980,2210,129,22,0,2181,998
00603,45348,4141,140,85,4,3614,1357
00606,5883,314,27,3,8,280,100
00610,23796,2083,100,37,4,2175,821
00612,56488,4130,241,79,6,3964,2102
00616,9339,786,38,4,0,670,180
00617,19831,1885,81,30,0,2159,611
00622,6738,393,13,3,2,483,221
00623,36075,2365,130,35,7,3200,1249


In [161]:
## Short names for the two median household income columns.
names = {
    u'estimate; median household income in the past 12 months (in 2014 inflation-adjusted dollars)': 'median_income_estimate',
    u'margin of error; median household income in the past 12 months (in 2014 inflation-adjusted dollars)': 'median_income_error',
}

## Apply the same renaming, in place, to each of the three B19013 tables.
for key in ('median_household_income_ACS_14_5YR_B19013',
            'median_household_income_white_ACS_14_5YR_B19013A',
            'median_household_income_black_ACS_14_5YR_B19013B'):
    dfs[key].rename(columns=names, inplace=True)


In [162]:
## Display the 'median_household_income_white_ACS_14_5YR_B19013A' table with
## its renamed estimate / error columns.
dfs['median_household_income_white_ACS_14_5YR_B19013A']

Unnamed: 0_level_0,median_income_estimate,median_income_error
zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1
00601,10848,1537
00602,15906,1033
00603,16006,897
00606,14773,2261
00610,16813,1221
00612,17873,849
00616,15089,2270
00617,15621,1573
00622,14781,4186
00623,17182,1410


In [163]:
def _with_suffix(frame, suffix):
    """Return *frame* with *suffix* appended to every column name."""
    return frame.rename(columns=lambda name: name + suffix)


## Combine every table column-wise (axis=1) on the shared zipcode index.
## Tables that share column names get an "_all" / "_white" / "_black" suffix
## so their columns stay distinguishable. The index is then turned back into
## an ordinary column.
joined = pd.concat(
    [
        dfs['population_SF1_P1'],
        dfs['population_urban_rural_SF1_P2'],
        _with_suffix(dfs['group_quarters_SF1_PCT20'], "_all"),
        _with_suffix(dfs['group_quarters_white_SF1_PCT20A'], "_white"),
        _with_suffix(dfs['group_quarters_black_SF1_PCT20B'], "_black"),
        dfs['population_race_SF1_QTP3'],
        _with_suffix(dfs['median_household_income_ACS_14_5YR_B19013'], "_all"),
        _with_suffix(dfs['median_household_income_white_ACS_14_5YR_B19013A'], "_white"),
        _with_suffix(dfs['median_household_income_black_ACS_14_5YR_B19013B'], "_black"),
    ],
    axis=1,
).reset_index()
joined

Unnamed: 0,zipcode,total,urban,rural,institutionalized_all,institutionalized_adult_all,institutionalized_adult_federal_detention_all,institutionalized_adult_federal_prison_all,institutionalized_adult_state_prison_all,institutionalized_adult_local_jail_all,...,population_asian,population_nathaw,population_other_race,population_mult_race,median_income_estimate_all,median_income_error_all,median_income_estimate_white,median_income_error_white,median_income_estimate_black,median_income_error_black
0,00601,18570,10679,7891,57,0,0,0,0,0,...,5,1,402,240,10833,1531,10848,1537,35294,18228
1,00602,41520,41520,0,12,0,0,0,0,0,...,22,0,2181,998,16353,977,15906,1033,17371,5466
2,00603,54689,54646,43,826,727,0,0,727,0,...,85,4,3614,1357,16323,872,16006,897,16346,9041
3,00606,6615,2697,3918,0,0,0,0,0,0,...,3,8,280,100,14138,2157,14773,2261,9412,6540
4,00610,29016,25640,3376,62,0,0,0,0,0,...,37,4,2175,821,17265,1065,16813,1221,13684,1611
5,00612,67010,62391,4619,185,69,0,0,69,0,...,79,6,3964,2102,17752,783,17873,849,19003,4045
6,00616,11017,10438,579,34,0,0,0,0,0,...,4,0,670,180,15115,2164,15089,2270,34286,39868
7,00617,24597,24161,436,28,0,0,0,0,0,...,30,0,2159,611,15461,1514,15621,1573,17297,3377
8,00622,7853,6171,1682,46,0,0,0,0,0,...,3,2,483,221,14993,3916,14781,4186,0,0
9,00623,43061,42435,626,48,0,0,0,0,0,...,35,7,3200,1249,17044,1186,17182,1410,20288,6691


In [164]:
## Write the combined table to the database, replacing any existing
## 'census_by_zipcode' table.
joined.to_sql(name='census_by_zipcode', con=engine, if_exists='replace')