In [123]:
import pandas as pd
import json
from numpy import nan
from geopy import distance
from pandas.io.json import json_normalize
from shapely.geometry import shape, Point, MultiPoint
from shapely.ops import nearest_points

In [124]:
# Load the raw listings and flatten the nested JSON structures into columns.
with open('./results-7-31-2019.json') as f:
    data = json_normalize(json.load(f))

# Remove non-Chicago entries. na=False makes a missing address count as
# "not Chicago"; without it the mask would contain NaN for that row and
# boolean indexing would fail instead of filtering.
data = data[data['address'].str.contains('Chicago, IL', na=False)]

In [125]:
# Give the flattened JSON columns short names. errors='raise' makes the
# rename fail if any of the source columns is absent.
column_aliases = {
    'latLong.longitude': 'lon',
    'latLong.latitude': 'lat',
    'hdpData.homeInfo.yearBuilt': 'yearBuilt',
    'hdpData.homeInfo.homeType': 'homeType',
}
data = data.rename(columns=column_aliases, errors='raise')

In [126]:
# Keep only the listing attributes that are carried forward.
kept_columns = ['price', 'beds', 'baths', 'area', 'lat', 'lon', 'yearBuilt', 'homeType']
data = data[kept_columns]

In [127]:
# Load the CTA rail station GeoJSON; its 'features' list is consumed later
# when the station lookup is built.
with open('./CTA_RailStations.geojson') as stations_file:
    stations_geo = json.load(stations_file)

In [128]:
# Load the Chicago neighborhood boundaries GeoJSON.
with open('./chicago_neighborhoods.geojson') as neighborhoods_file:
    neighborhoods_geo = json.load(neighborhoods_file)

In [129]:
# Make the neighborhood json easier to work with: pair each feature's
# properties with a shapely geometry built from its GeoJSON geometry.
neighborhood_polygons = []
for feature in neighborhoods_geo['features']:
    neighborhood_polygons.append({
        'geometry': shape(feature['geometry']),
        'properties': feature['properties'],
    })

In [130]:
def get_neighborhood(row):
    """Return the name of the neighborhood that contains a listing.

    Args:
        row: A record exposing 'lon' and 'lat' entries (a DataFrame row when
            called through ``DataFrame.apply(axis=1)``).

    Returns:
        The 'pri_neigh' property of the first entry in
        ``neighborhood_polygons`` whose geometry contains the listing's
        point, or None when no polygon contains it.
    """
    # Build the point once instead of once per polygon.
    location = Point(row['lon'], row['lat'])
    # Only the first match is ever used, so stop scanning as soon as it is found.
    return next(
        (
            polygon['properties']['pri_neigh']
            for polygon in neighborhood_polygons
            if polygon['geometry'].contains(location)
        ),
        None,
    )

# Add neighborhoods to entries (None where no polygon contains the listing)
neighborhood_names = data.apply(get_neighborhood, axis=1)
data['neighborhood'] = neighborhood_names

In [131]:
# Remove anything not in a neighborhood (probably incorrectly labeled as being in Chicago)
has_neighborhood = data['neighborhood'].notna()
data = data[has_neighborhood]
# Spot-check a random sample of the assigned neighborhoods.
data[['lat', 'lon', 'neighborhood']].sample(20)

Unnamed: 0,lat,lon,neighborhood
12287,41.997712,-87.674043,Edgewater
9420,41.923879,-87.722731,Logan Square
506,41.674874,-87.627029,West Pullman
4444,41.835859,-87.638049,Bridgeport
11109,41.942484,-87.656263,Lake View
8841,41.899792,-87.628724,Rush & Division
6905,41.900465,-87.695925,Humboldt Park
5720,41.871161,-87.656482,"Little Italy, UIC"
834,41.693623,-87.623477,Roseland
6571,41.88267,-87.75142,Austin


In [132]:
# Convert prices to numbers, drop any entries without prices
def _parse_price(raw_price):
    """Turn a listing price string such as '$1,250,000' into an int.

    Returns nan for an empty or missing price so the row is removed by the
    dropna() below. A value that is already numeric is returned unchanged,
    so running the cell a second time does not crash on the parsed column.
    """
    if not isinstance(raw_price, str):
        return nan if pd.isna(raw_price) else raw_price
    digits = raw_price.replace('$', '').replace(',', '')
    return nan if digits == '' else int(digits)

# Series.map visits only the price column; a row-wise DataFrame.apply(axis=1)
# would build every full row just to read this one field.
data['price'] = data['price'].map(_parse_price)

print(f'Records before drop: {data.shape}')
# dropna() removes rows with a missing value in ANY column, not only price.
data = data.dropna()
# Keep only rows whose yearBuilt is greater than zero.
data = data[data['yearBuilt'] > 0]
print(f'Records after drop: {data.shape}')
data.sample(20)

Records before drop: (9758, 9)
Records after drop: (7142, 9)


Unnamed: 0,price,beds,baths,area,lat,lon,yearBuilt,homeType,neighborhood
11880,188000.0,1.0,1.0,800.0,41.979076,-87.65546,1984,MULTI_FAMILY,Edgewater
2455,74000.0,6.0,2.0,2400.0,41.761545,-87.597305,1895,MULTI_FAMILY,Grand Crossing
5866,319900.0,1.0,2.0,817.0,41.87833,-87.642109,2009,CONDO,West Loop
1907,199900.0,4.0,1.5,902.0,41.732959,-87.658978,1913,SINGLE_FAMILY,Auburn Gresham
8057,289900.0,4.0,2.0,1000.0,41.907902,-87.724893,1955,SINGLE_FAMILY,Humboldt Park
7001,489000.0,3.0,2.0,2000.0,41.893103,-87.648938,1996,TOWNHOUSE,West Town
9310,299000.0,4.0,2.0,1300.0,41.934205,-87.711125,1960,SINGLE_FAMILY,Avondale
2516,298000.0,3.0,2.0,1100.0,41.775385,-87.794704,1964,SINGLE_FAMILY,Clearing
2591,145000.0,5.0,2.0,900.0,41.773121,-87.711357,1922,SINGLE_FAMILY,Chicago Lawn
2452,100026.0,4.0,2.0,1634.0,41.757442,-87.604402,1885,SINGLE_FAMILY,Grand Crossing


In [133]:
def make_point(point):
    """Convert a GeoJSON station feature into a lookup key and a station record.

    Args:
        point: A GeoJSON feature whose geometry coordinates begin with
            (lon, lat) and whose properties include 'LINES' and 'Name'.

    Returns:
        A ``((lon, lat), info)`` pair. ``info`` is a dict holding a shapely
        Point under 'coords', the 'LINES' property under 'line', and the
        'Name' property under 'station'.
    """
    coordinates = point['geometry']['coordinates']
    lon, lat = coordinates[0], coordinates[1]
    properties = point['properties']
    info = {
        'coords': Point(lon, lat),
        'line': properties['LINES'],
        'station': properties['Name'],
    }
    return (lon, lat), info
        

# Index every station record by its (lon, lat) tuple, then collect the
# shapely points of all stations in the same order.
stations = dict(make_point(feature) for feature in stations_geo['features'])
station_points = [info['coords'] for info in stations.values()]

In [134]:
# Get nearest CTA stations
# The station geometry is identical for every listing, so build it once here
# rather than once per row inside the function.
stations_multipoint = MultiPoint(station_points)

def get_nearest_station(row):
    """Return the station record closest to a listing.

    Args:
        row: A record exposing 'lon' and 'lat' entries (a DataFrame row when
            called through ``DataFrame.apply(axis=1)``).

    Returns:
        The entry of ``stations`` (a dict with 'coords', 'line' and
        'station') whose point is nearest to the listing.

    Raises:
        KeyError: If the coordinates of the nearest point are not a key of
            ``stations``.
    """
    home = Point(row['lon'], row['lat'])
    # Index 0 is the input point, index 1 is the nearest station
    nearest = nearest_points(home, stations_multipoint)[1]
    # The lookup assumes the returned coordinates equal the stored station
    # coordinates exactly.
    return stations[(nearest.x, nearest.y)]

# Expand each nearest-station record into columns and append them to the listings.
nearest_station_columns = data.apply(get_nearest_station, axis=1, result_type='expand')
data = pd.concat([data, nearest_station_columns], axis=1)

data[['coords', 'line', 'station', 'neighborhood']].sample(20)

Unnamed: 0,coords,line,station,neighborhood
6071,POINT (-87.62658988700279 41.86740495869866),Orange & Green Lines,Roosevelt,Near South Side
11116,POINT (-87.65362593077103 41.94742799653081),Red Line,Addison,Lake View
5542,POINT (-87.66954322799872 41.87156300339059),Pink,Polk,"Little Italy, UIC"
5021,POINT (-87.62640238492894 41.85311508247467),Green Line,Cermak-McCormick Pl,Near South Side
11684,POINT (-87.67863488433055 41.96641097182644),Brown Line,Damen,Lincoln Square
8042,POINT (-87.72540376831816 41.88541197876197),Green Line (Lake),Pulaski,Humboldt Park
8708,POINT (-87.63141229300709 41.90392031189563),Red Line,Clark/Division,Old Town
10180,POINT (-87.65313085718887 41.93273150006873),"Brown, Purple (Express)",Diversey,Lincoln Park
9544,POINT (-87.67088394357924 41.94370914759255),Brown Line,Paulina,Lake View
10301,POINT (-87.80889493503285 41.98232336946857),Blue Line (O'Hare),Harlem,Dunning


In [135]:
def manhattan_distance(row):
    """Return the Manhattan-style distance in miles between a home and its station.

    The distance is the sum of two legs that both start at the home: one to
    the point at the station's latitude and the home's longitude, and one to
    the point at the home's latitude and the station's longitude
    (Manhattan distance: https://xlinux.nist.gov/dads/HTML/manhattanDistance.html).

    Args:
        row: A record with 'lon' and 'lat' entries for the home and a
            'coords' entry exposing ``.x`` (longitude) and ``.y`` (latitude)
            for the station.

    Returns:
        The sum of the two leg lengths, in miles.
    """
    home = (row['lat'], row['lon'])
    station = row['coords']

    # NOTE(review): each leg uses geopy's default `distance.distance`, which in
    # current geopy releases appears to be the geodesic distance rather than
    # Haversine (https://en.wikipedia.org/wiki/Haversine_formula) — confirm
    # against the installed geopy version.
    north_south_miles = distance.distance(home, (station.y, home[1])).miles
    east_west_miles = distance.distance(home, (home[0], station.x)).miles
    return north_south_miles + east_west_miles

# Distance in miles from each listing to its nearest station.
station_distances = data.apply(manhattan_distance, axis=1)
data['stationDistance'] = station_distances
data[['lon', 'lat', 'coords', 'stationDistance', 'station']].sample(20)

Unnamed: 0,lon,lat,coords,stationDistance,station
1759,-87.72474,41.735383,POINT (-87.73795637168769 41.78661355009723),4.218874,Midway
9405,-87.72523,41.925767,POINT (-87.7085413860205 41.92972804131357),1.133524,Logan Square
888,-87.65842,41.693336,POINT (-87.62441474538349 41.72237598330533),3.763179,95/Dan Ryan
7572,-87.616094,41.885689,POINT (-87.62618878473235 41.88322001199881),0.69102,Washington/Wabash
6891,-87.66331,41.882026,POINT (-87.66695093661281 41.88531139464179),0.414531,Ashland
8861,-87.642656,41.914004,POINT (-87.63930216454519 41.91040917935941),0.420994,Sedgwick
838,-87.634455,41.682255,POINT (-87.62441474538349 41.72237598330533),3.288362,95/Dan Ryan
8627,-87.63018,41.90712,POINT (-87.63141229300709 41.90392031189563),0.284364,Clark/Division
12061,-87.797556,41.993995,POINT (-87.80889493503285 41.98232336946857),1.389343,Harlem
9830,-87.642375,41.934055,POINT (-87.65313085718887 41.93273150006873),0.645638,Diversey


In [136]:
# Turn neighborhood and home type data into numeric form: append one
# indicator column per category, omitting the first category of each
# (drop_first=True).
for categorical_column in ('neighborhood', 'homeType'):
    indicator_columns = pd.get_dummies(data[categorical_column], drop_first=True)
    data = pd.concat([data, indicator_columns], axis=1)

In [137]:
# Save cleaned data (the DataFrame index is written as the first CSV column).
cleaned_csv_path = 'results-7-31-2019-cleaned.csv'
data.to_csv(cleaned_csv_path)