In [1]:
import pandas as pd
import json
from numpy import nan
from geopy import distance
from pandas.io.json import json_normalize
from shapely.geometry import shape, Point, MultiPoint
from shapely.ops import nearest_points

In [2]:
# JSON is read as UTF-8 explicitly so the result does not depend on the
# platform's default text encoding.
with open('./results-7-31-2019.json', encoding='utf-8') as f:
    # flatten json structures
    data = json_normalize(json.load(f))

# Remove non-Chicago entries. The match is a plain substring (regex=False), and
# na=False treats a missing address as "not Chicago" instead of leaving NaN in
# the boolean mask, which would make the row selection raise.
data = data[data['address'].str.contains('Chicago, IL', regex=False, na=False)]

In [3]:
# Give the flattened coordinate columns short names; errors='raise' makes the
# rename fail loudly if either source column is absent.
data = data.rename(
    columns={
        'hdpData.homeInfo.longitude': 'lon',
        'hdpData.homeInfo.latitude': 'lat',
    },
    errors='raise',
)

In [4]:
# GeoJSON is UTF-8 by specification; say so explicitly rather than relying on
# the platform's default text encoding.
with open('./CTA_RailStations.geojson', encoding='utf-8') as f:
    stations_geo = json.load(f)

In [5]:
# GeoJSON is UTF-8 by specification; say so explicitly rather than relying on
# the platform's default text encoding.
with open('./chicago_neighborhoods.geojson', encoding='utf-8') as f:
    neighborhoods_geo = json.load(f)

In [6]:
# Pair each neighborhood feature's shapely geometry with its properties so the
# GeoJSON only has to be converted once.
neighborhood_polygons = [
    dict(geometry=shape(feature['geometry']), properties=feature['properties'])
    for feature in neighborhoods_geo['features']
]

In [7]:
def get_neighborhood(row):
    """Return the name of the neighborhood whose polygon contains a listing.

    Args:
        row: A listing row exposing ``'lon'`` and ``'lat'`` entries.

    Returns:
        The ``pri_neigh`` property of the first entry in
        ``neighborhood_polygons`` whose geometry contains the listing's
        point, or ``None`` when no polygon contains it.
    """
    # Build the point once rather than once per polygon; shapely takes (x, y),
    # i.e. (lon, lat).
    location = Point(row['lon'], row['lat'])
    # The generator lets next() stop at the first containing polygon instead of
    # testing every remaining one.
    return next(
        (
            polygon['properties']['pri_neigh']
            for polygon in neighborhood_polygons
            if polygon['geometry'].contains(location)
        ),
        None,
    )

# Add neighborhoods to entries
data['neighborhood'] = data.apply(get_neighborhood, axis=1)

In [8]:
# Listings that matched no neighborhood polygon are dropped; they were probably
# labeled as Chicago by mistake.
data = data.loc[data['neighborhood'].notna()]
data[['address', 'neighborhood']].sample(n=20)

Unnamed: 0,address,neighborhood
8246,"2140 N Richmond St APT 5, Chicago, IL",Logan Square
9741,"2615 N Bosworth Ave, Chicago, IL",Lincoln Park
3847,"4931 S Prairie Ave APT 3, Chicago, IL",Grand Boulevard
4317,"3030 S Kostner Ave, Chicago, IL",Little Village
12499,"6147 N Sheridan Rd APT 30B, Chicago, IL",Edgewater
2310,"8259 S Aberdeen St, Chicago, IL",Auburn Gresham
3272,"5423 S Fairfield Ave, Chicago, IL",Gage Park
3240,"5205 S Wood St, Chicago, IL",New City
8434,"1838 N Marshfield Ave, Chicago, IL",Bucktown
3479,"1461 E 56th St # 1W, Chicago, IL",Hyde Park


In [9]:
# Convert prices to numbers, drop any entries without prices.
# Assumes prices arrive as strings such as '$350,000' -- TODO confirm against the raw export.
# regex=False matters: '$' must be removed as a literal character, not
# interpreted as the end-of-string anchor.
price_text = (
    data['price']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
)
# An empty string means "no price". Missing values pass through as NaN, and
# anything else that fails to parse still raises so bad data is not hidden.
data['price'] = pd.to_numeric(price_text.replace('', nan))
data = data.dropna(subset=['price'])

In [10]:
def make_point(point):
    """Turn one station GeoJSON feature into a ``(key, info)`` pair.

    Args:
        point: A GeoJSON feature whose ``geometry.coordinates`` start with
            longitude then latitude, and whose ``properties`` hold ``LINES``
            and ``Name``.

    Returns:
        A 2-tuple ``((lon, lat), info)`` where ``info`` is a dict with the
        station's shapely ``Point`` under ``'coords'``, its ``'line'`` and its
        ``'station'`` name.
    """
    coordinates = point['geometry']['coordinates']
    lon, lat = coordinates[0], coordinates[1]
    info = {
        'coords': Point(lon, lat),
        'line': point['properties']['LINES'],
        'station': point['properties']['Name'],
    }
    return (lon, lat), info


# make_point already yields (key, value) pairs, so dict() can consume them
# directly without a walrus temporary leaking into the notebook namespace.
stations = dict(make_point(feature) for feature in stations_geo['features'])
station_points = [info['coords'] for info in stations.values()]

In [11]:
# Get nearest CTA stations
# The station set is identical for every listing, so build the MultiPoint once
# here and reuse it inside the lookup.
stations_multipoint = MultiPoint(station_points)

def get_nearest_station(row):
    """Look up the station record closest to a listing.

    Args:
        row: A listing row exposing ``'lon'`` and ``'lat'`` entries.

    Returns:
        The ``stations`` entry (a dict with ``'coords'``, ``'line'`` and
        ``'station'``) for the station point nearest to the listing.

    Raises:
        KeyError: If the nearest point's coordinates are not a key of
            ``stations``.
    """
    listing = Point(row['lon'], row['lat'])
    # NOTE(review): shapely compares planar distances on the raw lon/lat
    # degrees, so "nearest" is an approximation -- confirm that is acceptable.
    # Index 0 is the input point, index 1 is the nearest station
    nearest = nearest_points(listing, stations_multipoint)[1]
    # Relies on the returned point carrying the station's coordinates unchanged
    # so the float pair matches a `stations` key exactly.
    return stations[(nearest.x, nearest.y)]

data = pd.concat([data, data.apply(get_nearest_station, axis=1, result_type='expand')], axis=1)

data[['coords', 'line', 'station', 'neighborhood', 'address']].sample(20)

Unnamed: 0,coords,line,station,neighborhood,address
8878,POINT (-87.63141229300709 41.90392031189563),Red Line,Clark/Division,Gold Coast,"1110 N Lake Shore Dr APT 27S, Chicago, IL"
7973,POINT (-87.75498649428108 41.88716322560614),Green Line (Lake),Laramie,Austin,"5047 W Crystal St, Chicago, IL"
750,POINT (-87.62441474538349 41.72237598330533),Red Line,95/Dan Ryan,Mount Greenwood,"10943 S Whipple St, Chicago, IL"
9493,POINT (-87.66360955994172 41.94381225523734),Brown Line,Southport,Lake View,"1443 W Belmont Ave # 1, Chicago, IL"
2638,POINT (-87.66384484844021 41.77895340470035),Green Line (Englewood),Ashland,Chicago Lawn,"6429 S Campbell Ave, Chicago, IL"
7460,POINT (-87.62802125739587 41.89166520044598),Red Line,Grand,Streeterville,"233 E Erie St APT 2205, Chicago, IL"
6123,POINT (-87.62658988700279 41.86740495869866),Orange & Green Lines,Roosevelt,Loop,"1000 S Michigan Ave # 5204, Chicago, IL"
12457,POINT (-87.6590758588342 41.99025949308111),Red Line,Thorndale,Edgewater,"5855 N Sheridan Rd APT 10B, Chicago, IL"
10560,POINT (-87.70165573251133 41.96614689495171),Brown Line,Francisco,Irving Park,"3811 N Sacramento Ave, Chicago, IL"
5538,POINT (-87.66954322799872 41.87156300339059),Pink,Polk,"Little Italy, UIC","828 S Bishop St, Chicago, IL"


In [12]:
def manhattan_distance(row):
    """Return the Manhattan (taxicab) distance in miles from a listing to its station.

    The trip is split into a north-south leg (home to the station's latitude,
    keeping the home longitude) and an east-west leg (home to the station's
    longitude, keeping the home latitude); the two leg lengths are summed.
    See https://xlinux.nist.gov/dads/HTML/manhattanDistance.html.

    Each leg is measured with geopy's default ``distance.distance``, which per
    the geopy documentation is an ellipsoidal geodesic rather than the
    haversine great-circle formula.

    Args:
        row: A listing row exposing ``'lon'``, ``'lat'`` and ``'coords'``, where
            ``'coords'`` has ``x`` (longitude) and ``y`` (latitude) attributes.

    Returns:
        The summed length of both legs, in miles.
    """
    station = row['coords']
    # geopy expects (latitude, longitude) ordering
    home = (row['lat'], row['lon'])

    north_south_leg = distance.distance(home, (station.y, row['lon']))
    east_west_leg = distance.distance(home, (row['lat'], station.x))
    return north_south_leg.miles + east_west_leg.miles

data['stationDistance'] = data.apply(manhattan_distance, axis='columns')
data[['lon', 'lat', 'coords', 'stationDistance', 'station']].sample(n=20)

Unnamed: 0,lon,lat,coords,stationDistance,station
427,-87.622993,41.677423,POINT (-87.62441474538349 41.72237598330533),3.175952,95/Dan Ryan
7888,-87.795337,41.915372,POINT (-87.79378335175879 41.88698752067567),2.039088,Oak Park
7845,-87.795168,41.909904,POINT (-87.79378335175879 41.88698752067567),1.652999,Oak Park
7425,-87.64999,41.8936,POINT (-87.64757831015247 41.89118929312991),0.290741,Grand
11842,-87.652205,41.974801,POINT (-87.65852996352429 41.97345332194149),0.418758,Argyle
9657,-87.691173,41.931006,POINT (-87.69688979878789 41.92193917056461),0.920389,California
6656,-87.735047,41.900054,POINT (-87.74469840019536 41.88651935258224),1.431755,Cicero
9344,-87.722372,41.929472,POINT (-87.71235935057341 41.93813166884105),1.113695,Belmont
3166,-87.72771,41.787059,POINT (-87.73795637168769 41.78661355009723),0.559992,Midway
10682,-87.6685,41.9477,POINT (-87.67088394357924 41.94370914759255),0.398265,Paulina


In [13]:
# Write the cleaned listings to CSV in the working directory
data.to_csv(path_or_buf='results-7-31-2019-cleaned.csv')