In [46]:
import pandas as pd
import json
from numpy import nan
from geopy import distance
from pandas.io.json import json_normalize
from shapely.geometry import shape, Point, MultiPoint
from shapely.ops import nearest_points

In [47]:
with open('./results-7-31-2019.json', encoding='utf-8') as f:
    # Flatten the nested JSON records into one column per leaf field (nested
    # keys become dotted column names such as 'hdpData.homeInfo.longitude').
    # pd.json_normalize is the public top-level entry point (pandas >= 1.0).
    data = pd.json_normalize(json.load(f))

# Remove non-Chicago entries. The match is a literal substring test, and rows
# with a missing address count as non-Chicago instead of breaking the mask.
data = data[data['address'].str.contains('Chicago, IL', regex=False, na=False)]

In [48]:
# Give the flattened coordinate columns short names. errors='raise' makes a
# missing source column fail loudly instead of being silently skipped.
coordinate_columns = {
    'hdpData.homeInfo.longitude': 'lon',
    'hdpData.homeInfo.latitude': 'lat',
}
data = data.rename(columns=coordinate_columns, errors='raise')

In [49]:
# GeoJSON is UTF-8 by specification (RFC 7946), so decode it explicitly rather
# than relying on the platform's default text encoding.
with open('./CTA_RailStations.geojson', encoding='utf-8') as f:
    stations_geo = json.load(f)

In [50]:
# GeoJSON is UTF-8 by specification (RFC 7946), so decode it explicitly rather
# than relying on the platform's default text encoding.
with open('./chicago_neighborhoods.geojson', encoding='utf-8') as f:
    neighborhoods_geo = json.load(f)

In [51]:
# Make the neighborhood json easier to work with: parse each feature's
# geometry into a shapely shape once and keep its properties next to it.
neighborhood_polygons = [
    {'geometry': shape(hood['geometry']), 'properties': hood['properties']}
    for hood in neighborhoods_geo['features']
]

In [52]:
def get_neighborhood(row):
    """Return the neighborhood name for one listing, or None if none matches.

    Args:
        row: a record with 'lon' and 'lat' entries for the listing.

    Returns:
        The 'pri_neigh' property of the first polygon in
        ``neighborhood_polygons`` whose geometry contains the listing's
        point, or None when no polygon contains it.
    """
    # Build the point once instead of once per polygon.
    location = Point(row['lon'], row['lat'])
    # next() stops at the first containing polygon; the remaining polygons
    # do not need to be tested.
    return next(
        (
            polygon['properties']['pri_neigh']
            for polygon in neighborhood_polygons
            if polygon['geometry'].contains(location)
        ),
        None,
    )

# Add neighborhoods to entries
data['neighborhood'] = data.apply(get_neighborhood, axis=1)

In [53]:
# Remove anything not in a neighborhood (probably incorrectly labeled as being in Chicago)
has_neighborhood = data['neighborhood'].notna()
data = data.loc[has_neighborhood]
# Spot-check a random handful of the remaining rows.
data[['address', 'neighborhood']].sample(20)

Unnamed: 0,address,neighborhood
4509,"3843 S Giles Ave # 45, Chicago, IL",Douglas
8181,"1521 N Paulina St # 4F, Chicago, IL",Wicker Park
3312,"5600 S Washtenaw Ave, Chicago, IL",Gage Park
10119,"2107 N Magnolia Ave APT 1B, Chicago, IL",Lincoln Park
10606,"2444 W Dakin St APT 1, Chicago, IL",Irving Park
12148,"6548 N Spokane Ave, Chicago, IL","Sauganash,Forest Glen"
12425,"5555 N Sheridan Rd APT 1014, Chicago, IL",Edgewater
3253,"5516 S Seeley Ave, Chicago, IL",Englewood
8962,"1120 N Lake Shore Dr # 2B, Chicago, IL",Gold Coast
2314,"7700 S Lowe Ave, Chicago, IL",Auburn Gresham


In [54]:
def _parse_price(raw):
    """Convert one raw listing price to an int, or NaN when there is no price.

    Strings have '$' and ',' removed before conversion; a string that is empty
    after stripping means "no price" and becomes NaN. Non-string values
    (missing values, or numbers left behind by an earlier run of this cell)
    are returned unchanged, so re-running the cell is harmless.

    Raises:
        ValueError: if a non-empty string is not a plain integer once '$' and
            ',' are removed.
    """
    if not isinstance(raw, str):
        return raw
    digits = raw.replace('$', '').replace(',', '')
    return int(digits) if digits else nan

# Convert prices to ints, drop any entries without prices.
# Series.map touches only the price column instead of building a full row
# object per listing as DataFrame.apply(axis=1) does.
data['price'] = data['price'].map(_parse_price)
data = data.dropna(subset=['price'])

In [55]:
def make_point(point):
    """Turn one station GeoJSON feature into a ``(key, info)`` pair.

    The key is the ``(lon, lat)`` tuple taken from the first two entries of
    the feature's coordinates. The info dict holds the same position as a
    shapely Point under 'coords', plus the feature's 'LINES' and 'Name'
    properties under 'line' and 'station'.
    """
    coordinates = point['geometry']['coordinates']
    lon, lat = coordinates[0], coordinates[1]
    station_info = {
        'coords': Point(lon, lat),
        'line': point['properties']['LINES'],
        'station': point['properties']['Name'],
    }
    return (lon, lat), station_info
        

# Index every station's info by its (lon, lat) tuple. make_point already
# returns (key, info) pairs, so dict() can consume them directly without a
# walrus temporary leaking into the notebook namespace.
stations = dict(make_point(feature) for feature in stations_geo['features'])
# The bare station locations, in the same order as the dict's values.
station_points = [station['coords'] for station in stations.values()]

In [56]:
# Get nearest CTA stations
# The station collection is identical for every listing, so build it once here
# and reuse it inside the per-row lookup.
stations_multipoint = MultiPoint(station_points)

def get_nearest_station(row):
    """Return the info dict of the station closest to one listing.

    Args:
        row: a record with 'lon' and 'lat' entries for the listing.

    Returns:
        The entry of ``stations`` (keys 'coords', 'line', 'station') for the
        station point nearest to the listing.
    """
    home = Point(row['lon'], row['lat'])
    # Index 0 is the input point, index 1 is the nearest station
    nearest = nearest_points(home, stations_multipoint)[1]
    # stations is keyed by the (lon, lat) tuple of each station.
    return stations[(nearest.x, nearest.y)]

# result_type='expand' spreads each returned dict into its own columns, which
# are then appended next to the existing listing columns.
data = pd.concat([data, data.apply(get_nearest_station, axis=1, result_type='expand')], axis=1)

data[['coords', 'line', 'station', 'neighborhood', 'address']].sample(20)

Unnamed: 0,coords,line,station,neighborhood,address
12205,POINT (-87.71314203532829 41.96792815058559),Brown Line,Kimball,"Sauganash,Forest Glen","6112 N Kedvale Ave, Chicago, IL"
436,POINT (-87.62441474538349 41.72237598330533),Red Line,95/Dan Ryan,West Pullman,"12034 S Eggleston Ave, Chicago, IL"
1763,POINT (-87.73795637168769 41.78661355009723),Orange Line,Midway,Ashburn,"4663 W 83rd Pl, Chicago, IL"
2815,POINT (-87.64402312721801 41.77902786718354),Green Line,Halsted,Englewood,"6715 S Union Ave, Chicago, IL"
9323,POINT (-87.71235935057341 41.93813166884105),Blue Line,Belmont,Logan Square,"2637 N Hamlin Ave, Chicago, IL"
5381,POINT (-87.74469840019536 41.88651935258224),Green Line (Lake),Cicero,Austin,"4741-43 W Madison St # 2, Chicago, IL"
11093,POINT (-87.65338048799256 41.93975074521462),"Red, Brown, Purple (Express)",Belmont,Lake View,"3228 N Clifton Ave # 3N, Chicago, IL"
5711,POINT (-87.65212799206752 41.88557678778864),"Green (Lake), Pink",Morgan,West Loop,"901 W Madison St UNIT 502, Chicago, IL"
6897,POINT (-87.66695093661281 41.88531139464179),"Green (Lake), Pink",Ashland,West Town,"1424 W Grand Ave APT 5, Chicago, IL"
10594,POINT (-87.7190993021528 41.94736010155727),Blue Line,Addison,Irving Park,"3804 N Troy St APT 1, Chicago, IL"


In [57]:
def manhattan_distance(row):
    """Return the Manhattan (taxicab) distance in miles from a home to its station.

    Args:
        row: a record with 'lon' and 'lat' for the home and 'coords' for the
            station, where 'coords' exposes the station longitude as ``.x``
            and latitude as ``.y``.

    Returns:
        The sum of two legs measured from the home: one to the station's
        latitude along the home's longitude, and one to the station's
        longitude along the home's latitude.
    """
    home = (row['lat'], row['lon'])
    station = row['coords']

    # Manhattan distance (https://xlinux.nist.gov/dads/HTML/manhattanDistance.html):
    # add up a north-south leg and an east-west leg instead of measuring the
    # straight line. Each leg uses geopy's default `distance.distance`, which
    # is geodesic in current geopy releases rather than Haversine/great-circle.
    north_south_leg = distance.distance(home, (station.y, row['lon'])).miles
    east_west_leg = distance.distance(home, (row['lat'], station.x)).miles
    return north_south_leg + east_west_leg

data['stationDistance'] = data.apply(manhattan_distance, axis=1)
data[['lon', 'lat', 'coords', 'stationDistance', 'station']].sample(20)

Unnamed: 0,lon,lat,coords,stationDistance,station
2238,-87.695326,41.746769,POINT (-87.66384484844021 41.77895340470035),3.848296,Ashland
9617,-87.663416,41.922755,POINT (-87.6528658377562 41.92505136765153),0.702278,Fullerton
3205,-87.721459,41.791159,POINT (-87.72444964277844 41.79988781946131),0.756886,Pulaski
7336,-87.629242,41.892817,POINT (-87.62802125739587 41.89166520044598),0.142444,Grand
12936,-87.683775,42.009747,POINT (-87.6728924507902 42.01906321923584),1.203146,Howard
11583,-87.730712,41.964735,POINT (-87.72917726558204 41.95294026078555),0.893094,Irving Park
9758,-87.68156,41.938359,POINT (-87.67473610744641 41.94702865443908),0.949994,Addison
6914,-87.667391,41.892286,POINT (-87.66695093661281 41.88531139464179),0.504055,Ashland
11551,-87.724971,41.975298,POINT (-87.71314203532829 41.96792815058559),1.117851,Kimball
7789,-87.616721,41.891725,POINT (-87.62802125739587 41.89166520044598),0.586862,Grand


In [58]:
# save cleaned data (written relative to the notebook's working directory)
cleaned_csv_path = 'results-7-31-2019-cleaned.csv'
data.to_csv(cleaned_csv_path)

In [62]:
# Median listing price per neighborhood, most expensive first.
median_price_by_neighborhood = (
    data.groupby('neighborhood')[['price']]
    .median()
    .sort_values(by='price', ascending=False)
)
# Temporarily raise the row limit so up to 100 rows are shown without truncation.
with pd.option_context("display.max_rows", 100):
    display(median_price_by_neighborhood)

Unnamed: 0_level_0,price
neighborhood,Unnamed: 1_level_1
Boystown,862500.0
Gold Coast,818000.0
Bucktown,767500.0
Andersonville,762500.0
Wrigleyville,700000.0
Sheffield & DePaul,699000.0
Old Town,675000.0
Lincoln Park,649888.0
Rush & Division,637500.0
Chinatown,630000.0
