In [14]:
import numpy as np
import pandas as pd
import itertools


[data source](https://www.census.gov/programs-surveys/acs/data/pums.html)

In [62]:
# Small numpy sample of the 2015 New York PUMS housing file: only the first
# three lines are read. Because names=True, the first line supplies the field
# names, so the array holds two data records.
with open("ss15hny.csv") as t_in:
    leading_lines = itertools.islice(t_in, 3)
    np_data = np.genfromtxt(leading_lines, names=True, delimiter=',', dtype=object)

In [80]:
# Load the whole CSV into a pandas DataFrame; row 0 of the file is the header.
# Empty cells come back as NaN (pandas' default missing-value marker), and
# summary methods such as mean() skip NaN by default (skipna=True).
df = pd.read_csv("ss15hny.csv", header=0)

In [20]:
# Column headers, taken from the field names of the numpy sample.
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(np_data.dtype.names)


('RT', 'SERIALNO', 'DIVISION', 'PUMA', 'REGION', 'ST', 'ADJHSG', 'ADJINC', 'WGTP', 'NP', 'TYPE', 'ACCESS', 'ACR', 'AGS', 'BATH', 'BDSP', 'BLD', 'BROADBND', 'BUS', 'COMPOTHX', 'CONP', 'DIALUP', 'DSL', 'ELEP', 'FIBEROP', 'FS', 'FULP', 'GASP', 'HANDHELD', 'HFL', 'INSP', 'LAPTOP', 'MHP', 'MODEM', 'MRGI', 'MRGP', 'MRGT', 'MRGX', 'OTHSVCEX', 'REFR', 'RMSP', 'RNTM', 'RNTP', 'RWAT', 'RWATPR', 'SATELLITE', 'SINK', 'SMP', 'STOV', 'TEL', 'TEN', 'TOIL', 'VACS', 'VALP', 'VEH', 'WATP', 'YBL', 'FES', 'FFINCP', 'FGRNTP', 'FHINCP', 'FINCP', 'FPARC', 'FPLMPRP', 'FSMOCP', 'GRNTP', 'GRPIP', 'HHL', 'HHT', 'HINCP', 'HOTWAT', 'HUGCL', 'HUPAC', 'HUPAOC', 'HUPARC', 'KIT', 'LNGI', 'MULTG', 'MV', 'NOC', 'NPF', 'NPP', 'NR', 'NRC', 'OCPIP', 'PARTNER', 'PLM', 'PLMPRP', 'PSF', 'R18', 'R60', 'R65', 'RESMODE', 'SMOCP', 'SMX', 'SRNT', 'SSMC', 'SVAL', 'TAXP', 'WIF', 'WKEXREL', 'WORKSTAT', 'FACCESSP', 'FACRP', 'FAGSP', 'FBATHP', 'FBDSP', 'FBLDP', 'FBROADBNDP', 'FBUSP', 'FCOMPOTHXP', 'FCONP', 'FDIALUPP', 'FDSLP', 'FELEP',


What are these columns? 
[Codes for column headers and cell data](https://www2.census.gov/programs-surveys/acs/tech_docs/pums/data_dict/PUMSDataDict15.pdf)

Each row is a household or family. Which metrics do we want on the map and which data could compose them?
 
* Income
    + FINCP (family income)
    + ADJINC (adjustment factor used to convert FINCP to constant dollars; it appears to be the same for every record in this sample)
* Rent burden
    + GRNTP (gross rent)
* Race
    + RAC1P (race, e.g. Black, White, American Indian)
    + ANC1P (ancestry, e.g. Mexican, Mexican American, or Hispanic)
* Location
    + PUMA (Public use microdata area code) 
    + ST (state)

[Guide to PUMS](https://www.census.gov/content/dam/Census/library/publications/2009/acs/ACSPUMS.pdf)

[More documentation](https://www2.census.gov/programs-surveys/acs/tech_docs/pums/ACS2015_PUMS_README.pdf)

[PUMA shapefiles download link from NYU](https://geo.nyu.edu/catalog/nyu_2451_34562)

[PUMAs mapped on Carto](https://aprilrabkin.carto.com/builder/b10e6e90-926b-11e7-a56e-0ee462b5436c)

[How to join PUMAs geography and their data in Carto, blog by Frank Donnelly, Geospatial Data Librarian, Baruch College CUNY](http://faculty.baruch.cuny.edu/geoportal/resources/other/cartodb_import_data.pdf)

[My Carto project](https://aprilrabkin.carto.com/builder/b10e6e90-926b-11e7-a56e-0ee462b5436c)

[Pandas cheatsheet](https://github.com/pandas-dev/pandas/blob/master/doc/cheatsheet/Pandas_Cheat_Sheet.pdf)

In [21]:
# (rows, columns) of the full DataFrame; the call form of print also runs on Python 3.
print(df.shape)

(92817, 235)


In [63]:
# First ten records of the three columns of interest.
# The call form of print works on both Python 2 and Python 3.
print(df[['PUMA', 'FINCP', 'GRNTP']].head(10))

   PUMA     FINCP   GRNTP
0  3806       NaN  2760.0
1  3205   26020.0     NaN
2  3312       NaN     NaN
3  4004   15600.0   420.0
4  1500       NaN     NaN
5  1400       NaN     NaN
6  1500   81800.0     NaN
7   904       NaN     NaN
8  3203   46040.0     NaN
9   200  102600.0     NaN


In [51]:
# Re-check the frame's (rows, columns); the call form of print also runs on Python 3.
print(df.shape)

(92817, 235)


In [59]:
# Number of distinct values in each column of interest, one per line in the
# order PUMA, FINCP, GRNTP. nunique() leaves NaN out of the count by default.
# The call form of print works on both Python 2 and Python 3.
for column in ('PUMA', 'FINCP', 'GRNTP'):
    print(df[column].nunique())
# print(df.PUMA.value_counts())  # uncomment to see the record count per PUMA

145
7873
1685


In [64]:
# Make sure it's only NY State data: ST should take a single value.
# The call form of print works on both Python 2 and Python 3.
print(df.ST.value_counts())

36    92817
Name: ST, dtype: int64


In [90]:
# Keep only the three columns used in the analysis; the rest are dropped.
columns_of_interest = ['PUMA', 'FINCP', 'GRNTP']
df1 = df[columns_of_interest]

In [81]:
# How many records remain if rows with a null value are dropped?
# Each column is checked on its own: first FINCP, then GRNTP.
# The call form of print works on both Python 2 and Python 3.
has_income = pd.notnull(df['FINCP'])
has_rent = pd.notnull(df['GRNTP'])
print(df[has_income].shape)
print(df[has_rent].shape)

(48436, 235)
(26560, 235)


In [83]:
# only 48,436 reporting income
# only 26,560 reporting rent
# next: build table of average GRNTP * 12 / average FINCP for each PUMA

In [129]:
# Per-PUMA mean of every remaining column (FINCP and GRNTP). mean() skips NaN,
# so each column is averaged over only the records that report a value for it.
# NOTE(review): these are unweighted means; the file also has a WGTP column,
# presumably the household weight -- confirm whether weighting is wanted.
avgs_per_puma = df1.groupby('PUMA').mean()

In [130]:
# Confirm that the grouped result is a DataFrame.
# The call form of print works on both Python 2 and Python 3.
# print(avgs_per_puma)
print(type(avgs_per_puma))

<class 'pandas.core.frame.DataFrame'>


In [142]:
# Rent burden per PUMA: mean monthly gross rent, annualised (x12), divided by
# mean family income.
avgs_per_puma['rent_burden'] = avgs_per_puma['GRNTP']*12/avgs_per_puma['FINCP']

# The printed table and the exported CSV both contain a 'rent_burden_percent'
# column that no remaining cell creates, so a fresh Restart & Run All could not
# reproduce them. Recreate it here as the whole-number percentage, truncated
# (not rounded), which matches the displayed values (0.108433 -> 10).
# astype(int) raises if any PUMA has a NaN rent burden.
avgs_per_puma['rent_burden_percent'] = (avgs_per_puma['rent_burden'] * 100).astype(int)

In [143]:

# Columns now present in the per-PUMA table.
# The call form of print works on both Python 2 and Python 3.
print(avgs_per_puma.columns)

Index([u'FINCP', u'GRNTP', u'rent_burden', u'rent_burden_percent'], dtype='object')


In [145]:
# Export the per-PUMA table; the PUMA index is written as the first column.
output_path = 'rent_burden.csv'
avgs_per_puma.to_csv(output_path)

In [144]:
# Show the full per-PUMA table (one row per PUMA).
# The call form of print works on both Python 2 and Python 3.
print(avgs_per_puma)

              FINCP        GRNTP  rent_burden  rent_burden_percent
PUMA                                                              
100    68960.468271   836.325581     0.145531                   14
200    73745.153614   811.250000     0.132009                   13
300    95146.198630   859.746377     0.108433                   10
401    78560.884058   698.226415     0.106653                   10
402    81234.534826   781.323276     0.115417                   11
403    79536.617886   834.704819     0.125935                   12
500    76505.660194   920.662338     0.144407                   14
600    70220.836406   730.755556     0.124878                   12
701    68064.697368   828.530055     0.146072                   14
702    97361.892351   946.548077     0.116663                   11
703   106445.956967   976.235294     0.110054                   11
704    92479.949627   727.507576     0.094400                    9
800    83291.344196   747.440299     0.107686                 