In [23]:
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
from shapely.geometry import Point, Polygon
from geopandas.tools import sjoin
import pysal as ps
import numpy as np
%matplotlib inline


## ACS Demographic Data

Below, I read raw demographic variables from the 2010–2014 ACS 5-Year Estimates for San Francisco block groups (tables B01001, B17017, B25001, and B25004). The data were downloaded from [American Fact Finder](http://factfinder.census.gov/faces/nav/jsf/pages/index.xhtml). I then convert the demographic counts to rates and compile them into a single dataframe.

In [24]:
def _read_acs(table, usecols, names):
    """Read selected columns of one ACS extract, indexed by 'BGFIPS10'.

    table   -- ACS table id embedded in the file name (e.g. 'B01001').
    usecols -- positional column numbers to keep from the CSV.
    names   -- labels assigned to the kept columns; must include 'BGFIPS10'.

    The first two rows of each file are skipped (the extracts carry two
    header rows) and replaced by `names`.
    """
    # The builtin `object` is the same dtype the old `pd.np.object` alias
    # resolved to; `pd.np` was removed in pandas 2.0 and `np.object` in
    # NumPy 1.24, so the alias no longer imports on current stacks.
    # NOTE(review): the integer key 1 is presumably meant to keep the FIPS
    # code column as text -- confirm how pandas resolves it with `names`.
    return pd.read_csv('Data/ACS/ACS_14_5YR_%s_with_ann.csv' % table,
                       skiprows=2, usecols=usecols, names=names,
                       dtype={1: object}).set_index('BGFIPS10')


# B01001 supplies both the 'Pop' and 'Male' columns.
pop = _read_acs('B01001', [1, 3], ['BGFIPS10', 'Pop'])['Pop']
male = _read_acs('B01001', [1, 5], ['BGFIPS10', 'Male'])['Male']

# B17017 is kept as a two-column frame: 'HH' (denominator) and 'Pov'.
pov = _read_acs('B17017', [1, 3, 5], ['BGFIPS10', 'HH', 'Pov'])

# B25001 / B25004 supply housing units and vacant units respectively.
hu = _read_acs('B25001', [1, 3], ['BGFIPS10', 'HU'])['HU']
vacant = _read_acs('B25004', [1, 3], ['BGFIPS10', 'Vacant'])['Vacant']


In [25]:
# Block-group polygons keyed by FIPS code; every derived column below is
# aligned on this index.
bgs = gpd.read_file('Data/SF_BlockGroups10.shp').set_index('BGFIPS10')

# Polygon area scaled by 2.59e+6.
# NOTE(review): this looks like a square-metre -> square-mile conversion,
# which assumes the shapefile CRS is in metres -- confirm.
bg_area_scaled = bgs.area / 2.59e+6

dems = pd.DataFrame(index=bgs.index)
dems['Pop'] = pop
dems['PopDens1k'] = pop / bg_area_scaled / 1000
# The remaining columns are percentages (count / denominator * 100).
dems['pMale'] = male / pop * 100
dems['pHHPov'] = pov.Pov / pov.HH * 100
dems['VacantHU'] = vacant / hu * 100

## Assault

In [26]:
# Turn each incident's X/Y pair into a point, declared as EPSG:4326 and then
# reprojected into the block-group CRS so the spatial join lines up.
assaults = pd.read_csv('Data/SF_Crime_Incidents_2010-2014_Assaults.csv')
assault_points = assaults.apply(lambda rec: Point(rec['X'], rec['Y']), axis=1)
assault_loc = gpd.GeoDataFrame(geometry=assault_points, crs={'init': 'epsg:4326'})
assault_loc = assault_loc.to_crs(bgs.crs)

# Join points to block-group polygons, count matches per block group, and
# give block groups with no matches a count of 0.
assault_joined = sjoin(assault_loc, bgs[['geometry']].reset_index())
assault_per_bg = assault_joined.groupby('BGFIPS10').size()
assault_counts = assault_per_bg.reindex(bgs.index).fillna(0)

# Incidents per 1,000 residents.
assault_rate = assault_counts / dems['Pop'] * 1000
assault_rate.name = 'AssaultP1k'

In [27]:
# Same pipeline as for assaults: build points from X/Y, declare them as
# EPSG:4326, and reproject into the block-group CRS.
drunk = pd.read_csv('Data/SF_Crime_Incidents_2010-2014_Drunkenness.csv')
drunk_points = drunk.apply(lambda rec: Point(rec['X'], rec['Y']), axis=1)
drunk_loc = gpd.GeoDataFrame(geometry=drunk_points, crs={'init': 'epsg:4326'})
drunk_loc = drunk_loc.to_crs(bgs.crs)

# Count joined incidents per block group; unmatched block groups become 0.
drunk_joined = sjoin(drunk_loc, bgs[['geometry']].reset_index())
drunk_per_bg = drunk_joined.groupby('BGFIPS10').size()
drunk_counts = drunk_per_bg.reindex(bgs.index).fillna(0)
drunk_counts.name = 'Drunk'

# Incidents per 1,000 residents.
drunk_rate = drunk_counts / dems['Pop'] * 1000
drunk_rate.name = 'DrunkP1k'

In [28]:
# Bar locations, reprojected into the block-group CRS before joining.
bars = gpd.read_file('Data/sf_bar_locations.shp').to_crs(bgs.crs)

# Count joined bars per block group; block groups without a match become 0.
bars_joined = sjoin(bars, bgs[['geometry']].reset_index())
bars_per_bg = bars_joined.groupby('BGFIPS10').size()
bar_counts = bars_per_bg.reindex(bgs.index).fillna(0)
bar_counts.name = 'Bars'

# Bars per unit of scaled area.
# NOTE(review): 2.59e+6 presumably converts square metres to square miles.
bar_dens = bar_counts / (bgs.area / 2.59e+6)
bar_dens.name = 'BarPSqMi'

In [29]:
# Retail parcel centroids, reprojected into the block-group CRS before joining.
retail = gpd.read_file('Data/Retail_Land_Use_Parcel_Centroid.shp').to_crs(bgs.crs)

# Count joined centroids per block group; unmatched block groups become 0.
retail_joined = sjoin(retail, bgs[['geometry']].reset_index())
retail_per_bg = retail_joined.groupby('BGFIPS10').size()
retail_counts = retail_per_bg.reindex(bgs.index).fillna(0)

# Retail parcels per unit of scaled area.
# NOTE(review): 2.59e+6 presumably converts square metres to square miles.
retail_dens = retail_counts / (bgs.area / 2.59e+6)
retail_dens.name = 'RetailPSqMi'

In [30]:
# Assemble all indicators side by side (aligned on the block-group index) and
# attach the block-group polygons as the geometry column.
# `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally.
indicators = pd.concat([assault_rate, drunk_counts, drunk_rate, bar_counts,
                        bar_dens, retail_dens, dems], axis=1)
gdf = gpd.GeoDataFrame(indicators, geometry=bgs.geometry)
gdf.index.name = 'BGFIPS10'
gdf['SqMiles'] = gdf.geometry.area / 2.59e+6

# Ratios with a zero denominator come out as +/-inf; treat them as missing,
# then impute every missing value with its column mean.
gdf = gdf.replace([np.inf, -np.inf], np.nan)
# numeric_only=True keeps the geometry column out of the mean; without it
# pandas >= 2.0 raises instead of silently skipping non-numeric columns.
gdf = gdf.fillna(gdf.mean(numeric_only=True))
gdf.head()

Unnamed: 0_level_0,AssaultP1k,Drunk,DrunkP1k,Bars,BarPSqMi,RetailPSqMi,Pop,PopDens1k,pMale,pHHPov,VacantHU,geometry,SqMiles
BGFIPS10,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
60750101001,481.632653,27.0,27.55102,2.0,9.622053,91.409506,980,4.714806,50.204082,9.859155,5.490849,POLYGON ((-211050.6276144625 -20707.0181740056...,0.207856
60750101002,81.13804,8.0,2.809975,3.0,35.6229,94.9944,2847,33.806132,55.567264,16.896985,7.334109,POLYGON ((-211308.8260894234 -20809.6687658699...,0.084215
60750102001,93.692022,12.0,11.131725,1.0,9.787183,58.723099,1078,10.550584,60.760668,5.669291,30.21978,(POLYGON ((-212986.3528985226 -20191.607399463...,0.102174
60750102002,22.06235,0.0,0.0,0.0,0.0,20.61674,2085,42.985902,47.673861,3.139717,5.27881,POLYGON ((-212577.6518196098 -21195.1989812748...,0.048504
60750102003,87.038789,2.0,1.892148,0.0,0.0,19.759272,1057,20.88555,51.561022,7.822686,9.658422,POLYGON ((-213313.2446261347 -20471.8720026677...,0.050609


In [31]:
# Column means of the numeric indicators. numeric_only=True skips the
# geometry column explicitly; pandas >= 2.0 raises on it otherwise.
gdf.mean(numeric_only=True)

AssaultP1k       88.652299
Drunk             6.165517
DrunkP1k          5.492257
Bars              0.743103
BarPSqMi         18.810465
RetailPSqMi     100.671642
Pop            1429.434483
PopDens1k        30.116554
pMale            50.777812
pHHPov           12.613312
VacantHU          7.419983
SqMiles           0.081697
dtype: float64

In [32]:
# Export the attribute table (all columns except the polygons) to CSV; the
# BGFIPS10 index is written as the first column.
# `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally.
data = gdf.drop('geometry', axis=1)
data.to_csv('InputDataset.csv')

# Optional (disabled): also export the polygons to a shapefile.
#geo=gdf[['geometry']]
#geo.reset_index().to_file('BG_Geometry.shp')