In [1]:
%load_ext autoreload
%autoreload 2

import pickle
import pandas as pd
import geopandas
from shapely.geometry import Point
import matplotlib.pyplot as plt

In [2]:
# read scraped CHA data
with open("data/CHA_rental_data.obj", "rb") as f:       
    d = pickle.load(f)
cha = pd.DataFrame.from_dict(data = d, orient = "index")

In [3]:
# clean CHA data
cols = ['Address','Monthly Rent','Property Type','Bath','Bed',
        'Voucher Necessary','Availability','Contact','URL','Lat','Long']
cha = cha[cols]
cha.Long = -1 * cha.Long

# correct error
cha.loc["4545145", "Long"] = -87.66593 
cha.loc["4545145", "Lat"] = 41.772175

In [4]:
cha.head()

Unnamed: 0,Address,Monthly Rent,Property Type,Bath,Bed,Voucher Necessary,Availability,Contact,URL,Lat,Long
1288108,"1718 W 66th St 1, Chicago, IL 60636",800,Apt,1.0,2.0,Yes,Check Availability,,http://chicagoha.gosection8.com/Section-8-hous...,41.774,-87.6673
4012748,"6130 S Eberhart Ave 1, Chicago, IL 60637",1200,Apt,2.0,3.0,Yes,Available Now,(773) 961-4367,http://chicagoha.gosection8.com/Section-8-hous...,41.783,-87.6136
4017021,"4827 S Seeley Ave , Chicago, IL 60609",600,Apt,1.0,1.0,Yes,Check Availability,,http://chicagoha.gosection8.com/Section-8-hous...,41.8058,-87.6756
4036551,"828 E 88th Pl 2, Chicago, IL 60619",875,Apt,1.0,3.0,Yes,Check Availability,,http://chicagoha.gosection8.com/Section-8-hous...,41.73419,-87.603472
4036578,"4620 S Evans Ave 1, Chicago, IL 60653",1150,Apt,1.0,3.0,Yes,Check Availability,,http://chicagoha.gosection8.com/Section-8-hous...,41.810646,-87.608165


In [4]:
cha.dtypes

Address               object
Monthly Rent           int64
Property Type         object
Bath                 float64
Bed                  float64
Voucher Necessary     object
Availability          object
Contact               object
URL                   object
Lat                  float64
Long                 float64
dtype: object

In [5]:
# convert to GeoDataFrame
cha['Coordinates'] = list(zip(cha.Long, cha.Lat))

In [6]:
cha['Coordinates'] = cha['Coordinates'].apply(Point)

In [7]:
gcha = geopandas.GeoDataFrame(cha, geometry='Coordinates')

In [8]:
# read block group geojson
blocks_full = geopandas.read_file("data/block-groups.geojson")

In [9]:
blocks = blocks_full[['geometry', 'GEOID']]

In [10]:
blocks.head()

Unnamed: 0,geometry,GEOID
0,"(POLYGON ((-87.655192 41.983802, -87.655914999...",170310306041
1,(POLYGON ((-87.65111443516709 41.9840326124377...,170310306042
2,"(POLYGON ((-87.656031 41.987344, -87.655303 41...",170310306043
3,"(POLYGON ((-87.66006899999999 41.983622, -87.6...",170310307011
4,"(POLYGON ((-87.65454699999999 41.980646, -87.6...",170310307021


In [11]:
cha_with_geoid = geopandas.sjoin(gcha, blocks, how="left", op='intersects')

  warn('CRS of frames being joined does not match!')


In [12]:
cha_with_geoid.head()

Unnamed: 0,Address,Monthly Rent,Property Type,Bath,Bed,Voucher Necessary,Availability,Contact,URL,Lat,Long,Coordinates,index_right,GEOID
1288108,"1718 W 66th St 1, Chicago, IL 60636",800,Apt,1.0,2.0,Yes,Check Availability,,http://chicagoha.gosection8.com/Section-8-hous...,41.774,-87.6673,POINT (-87.6673 41.774),3028,170316712002
4012748,"6130 S Eberhart Ave 1, Chicago, IL 60637",1200,Apt,2.0,3.0,Yes,Available Now,(773) 961-4367,http://chicagoha.gosection8.com/Section-8-hous...,41.783,-87.6136,POINT (-87.61360000000001 41.783),3375,170314206001
4017021,"4827 S Seeley Ave , Chicago, IL 60609",600,Apt,1.0,1.0,Yes,Check Availability,,http://chicagoha.gosection8.com/Section-8-hous...,41.8058,-87.6756,POINT (-87.6756 41.8058),557,170316115001
4036551,"828 E 88th Pl 2, Chicago, IL 60619",875,Apt,1.0,3.0,Yes,Check Availability,,http://chicagoha.gosection8.com/Section-8-hous...,41.73419,-87.603472,POINT (-87.603472 41.73419),2756,170314408001
4036578,"4620 S Evans Ave 1, Chicago, IL 60653",1150,Apt,1.0,3.0,Yes,Check Availability,,http://chicagoha.gosection8.com/Section-8-hous...,41.810646,-87.608165,POINT (-87.608165 41.810646),1178,170318436002


In [13]:
shp_filepath = "data/ZillowNeighborhoods-IL.shp"

In [14]:
zillow_neighborhoods = geopandas.read_file(shp_filepath)

In [15]:
zillow_neighborhoods.head()

Unnamed: 0,State,County,City,Name,RegionID,geometry
0,IL,Cook,Chicago,Wicker Park,7930,POLYGON ((-87.66743176018612 41.90457224446712...
1,IL,Cook,Chicago,Ravenswood,33597,POLYGON ((-87.66746751104743 41.96890188491437...
2,IL,Carroll,Lanark,Lake Carroll,35553,POLYGON ((-89.89474778399995 42.19632197400006...
3,IL,Cook,Chicago,Edgebrook,51847,POLYGON ((-87.76186400199992 42.00841336200006...
4,IL,Cook,Chicago,Clearing (W),137841,POLYGON ((-87.76188485799992 41.78500995800005...


In [19]:
cha_with_geoid.drop('index_right', axis=1, inplace=True)

KeyError: "['index_right'] not found in axis"

In [20]:
cha_geoid_zillow = geopandas.sjoin(cha_with_geoid, zillow_neighborhoods, how="left", op='intersects')

In [21]:
cha_geoid_zillow

Unnamed: 0,Address,Monthly Rent,Property Type,Bath,Bed,Voucher Necessary,Availability,Contact,URL,Lat,Long,Coordinates,GEOID,index_right,State,County,City,Name,RegionID
1288108,"1718 W 66th St 1, Chicago, IL 60636",800,Apt,1.0,2.0,Yes,Check Availability,,http://chicagoha.gosection8.com/Section-8-hous...,41.774000,-87.667300,POINT (-87.6673 41.774),170316712002,82.0,IL,Cook,Chicago,West Englewood,269612
4012748,"6130 S Eberhart Ave 1, Chicago, IL 60637",1200,Apt,2.0,3.0,Yes,Available Now,(773) 961-4367,http://chicagoha.gosection8.com/Section-8-hous...,41.783000,-87.613600,POINT (-87.61360000000001 41.783),170314206001,202.0,IL,Cook,Chicago,West Woodlawn,403353
4017021,"4827 S Seeley Ave , Chicago, IL 60609",600,Apt,1.0,1.0,Yes,Check Availability,,http://chicagoha.gosection8.com/Section-8-hous...,41.805800,-87.675600,POINT (-87.6756 41.8058),170316115001,143.0,IL,Cook,Chicago,Back of the Yards,403146
4036551,"828 E 88th Pl 2, Chicago, IL 60619",875,Apt,1.0,3.0,Yes,Check Availability,,http://chicagoha.gosection8.com/Section-8-hous...,41.734190,-87.603472,POINT (-87.603472 41.73419),170314408001,10.0,IL,Cook,Chicago,Burnside,137634
4036578,"4620 S Evans Ave 1, Chicago, IL 60653",1150,Apt,1.0,3.0,Yes,Check Availability,,http://chicagoha.gosection8.com/Section-8-hous...,41.810646,-87.608165,POINT (-87.608165 41.810646),170318436002,142.0,IL,Cook,Chicago,Bronzeville,403145
4062682,"8200 S Escanaba Ave 1r, Chicago, IL 60617",1000,Duplex,2.0,4.0,Yes,Available Now,(773) 961-4367,http://chicagoha.gosection8.com/Section-8-hous...,41.746300,-87.554300,POINT (-87.5543 41.7463),170314603024,24.0,IL,Cook,Chicago,South Chicago,140437
4062710,"211 N Laporte Ave 2, Chicago, IL 60644",950,Duplex,1.0,2.0,Yes,Available Now,(773) 961-4367,http://chicagoha.gosection8.com/Section-8-hous...,41.884900,-87.748800,POINT (-87.7488 41.8849),170312518005,176.0,IL,Cook,Chicago,South Austin,403120
4062715,"525 N Homan Ave , Chicago, IL 60624",1150,Apt,1.0,3.0,Yes,Available Now,(773) 961-4367,http://chicagoha.gosection8.com/Section-8-hous...,41.891100,-87.710900,POINT (-87.7109 41.8911),170318367001,54.0,IL,Cook,Chicago,East Garfield Park,269577
4062722,"1352 W 64th St 2, Chicago, IL 60636",1150,Apt,1.0,3.0,Yes,Available Now,(773) 961-4367,http://chicagoha.gosection8.com/Section-8-hous...,41.777990,-87.658923,POINT (-87.658923 41.77799),170318349002,5.0,IL,Cook,Chicago,Englewood,138261
4062766,"1352 W 64th St 2, Chicago, IL 60636",1150,Apt,1.0,3.0,Yes,Available Now,(773) 961-4367,http://chicagoha.gosection8.com/Section-8-hous...,41.777990,-87.658923,POINT (-87.658923 41.77799),170318349002,5.0,IL,Cook,Chicago,Englewood,138261


In [20]:
### Decided to not include manual cleaning

# Drop rows with false addresses
# cha_geoid_zillow.drop(index = ['4763759','4796475'], inplace=True)

In [21]:
# Fill NA values of Zillow Region
# cha_geoid_zillow.fillna({"State": "IL", "County": "Cook", "City": "Chicago"}, inplace=True)

In [23]:
# unmatches = cha_geoid_zillow[cha_geoid_zillow["index_right"].isna()]

In [24]:
# Manually look up corresponding Zillow Regions on zillow.com  
# values = {'4567448': ['Gresham', '269571'],
#         '4590237': ['Gresham', '269571'],
#         '4618006': ['Cabrini Green', '403302'],
#         '4632177': ['Park Manor', '403356'],
#         '4640435': ['Marquette Park', '403148'],
#         '4646604': ['Rogers Park', '269605'],
#         '4729058': ['Morgan Park', '269595'],
#         '4729219': ['West Town', '269615']}

In [25]:
# for i in unmatches.index:
#    cha_geoid_zillow.loc[i, ["Name", "RegionID"]] = values[i]

In [22]:
cha_geoid_zillow.dropna(axis=0, subset=["index_right"], inplace=True)

In [24]:
len(cha_geoid_zillow)

5266

In [25]:
cha_geoid_zillow.drop('index_right', axis=1, inplace=True)

In [26]:
cha_geoid_zillow.isna().any()

Address              False
Monthly Rent         False
Property Type        False
Bath                 False
Bed                  False
Voucher Necessary    False
Availability         False
Contact               True
URL                  False
Lat                  False
Long                 False
Coordinates          False
GEOID                False
State                False
County               False
City                 False
Name                 False
RegionID             False
dtype: bool