In [57]:
import pandas as pd
import json
from geopy import distance
from math import sin, cos, sqrt, atan2
from pandas.io.json import json_normalize
from shapely.geometry import shape, Point, MultiPoint
from shapely.ops import nearest_points

In [58]:
# Load the scraped listings. The JSON text is decoded explicitly as UTF-8 so
# the result does not depend on the platform's default locale encoding.
with open('./results-7-31-2019.json', encoding='utf-8') as f:
    # Flatten nested JSON objects into dotted column names
    # (e.g. 'hdpData.homeInfo.longitude').
    data = json_normalize(json.load(f))

# Remove non-Chicago entries. The match is a literal substring test
# (regex=False), and rows with a missing address count as non-matching
# (na=False) instead of putting NaN into the boolean mask.
data = data[data['address'].str.contains('Chicago, IL', regex=False, na=False)]

In [59]:
# Load the CTA rail station features. The file is decoded explicitly as UTF-8
# rather than with the platform's default text encoding.
with open('./CTA_RailStations.geojson', encoding='utf-8') as f:
    stations_geo = json.load(f)

In [60]:
# Load the neighborhood boundary features. The file is decoded explicitly as
# UTF-8 rather than with the platform's default text encoding.
with open('./chicago_neighborhoods.geojson', encoding='utf-8') as f:
    neighborhoods_geo = json.load(f)

In [61]:
# Pair each neighborhood feature's geometry (converted to a shapely object)
# with its properties, so containment tests can be run on it directly.
neighborhood_polygons = [
    {
        'geometry': shape(hood_feature['geometry']),
        'properties': hood_feature['properties'],
    }
    for hood_feature in neighborhoods_geo['features']
]

In [62]:
def get_neighborhood(row):
    """Return the name of the first neighborhood polygon containing a listing.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'hdpData.homeInfo.longitude' and
        'hdpData.homeInfo.latitude'.

    Returns
    -------
    The 'pri_neigh' property of the first entry in ``neighborhood_polygons``
    whose geometry contains the listing's point, or None if no entry does.
    """
    # Build the point once instead of once per polygon tested.
    location = Point(row['hdpData.homeInfo.longitude'],
                     row['hdpData.homeInfo.latitude'])
    # next() stops at the first containing polygon rather than testing all.
    return next(
        (polygon['properties']['pri_neigh']
         for polygon in neighborhood_polygons
         if polygon['geometry'].contains(location)),
        None,
    )

# Label every listing with the neighborhood that contains it
data['neighborhood'] = data.apply(get_neighborhood, axis='columns')

In [63]:
# Keep only listings that landed inside a neighborhood polygon; anything else
# was probably mislabeled as being in Chicago.
has_neighborhood = data['neighborhood'].notna()
data = data[has_neighborhood]
data[['address', 'neighborhood']].sample(n=20)

Unnamed: 0,address,neighborhood
1983,"549 E 91st St, Chicago, IL",Chatham
8623,"1943 N Hudson Ave UNIT A, Chicago, IL",Old Town
6520,"4826 W Hubbard St, Chicago, IL",Austin
3956,"4800 S Chicago Beach Dr APT 611N, Chicago, IL",Kenwood
10743,"2051 W Grace St, Chicago, IL",North Center
5390,"4818 W Gladys Ave, Chicago, IL",Austin
418,"11835 S Lafayette Ave, Chicago, IL",West Pullman
11515,"5319 N Virginia Ave APT 2N, Chicago, IL",Lincoln Square
5498,"1255 S Millard Ave, Chicago, IL",North Lawndale
4394,"3309 S Michigan Ave # 1, Chicago, IL",Douglas


In [64]:
def make_point(point):
    """Convert one station feature into a ``(key, record)`` pair.

    The key is the ``(lon, lat)`` tuple taken from the first two entries of
    the feature's coordinates. The record is a dict holding a ``Point`` for
    that position ('coords'), the feature's 'LINES' property ('line') and its
    'Name' property ('station').
    """
    position = point['geometry']['coordinates']
    lon, lat = position[0], position[1]

    record = dict(
        coords=Point(lon, lat),
        line=point['properties']['LINES'],
        station=point['properties']['Name'],
    )
    return (lon, lat), record
        

# Index stations by their (lon, lat) tuple. make_point already returns a
# (key, record) pair, so each feature is converted exactly once.
stations = dict(make_point(point) for point in stations_geo['features'])
# The 'coords' value of every station record, in the same order as `stations`.
station_points = [station['coords'] for station in stations.values()]

In [65]:
# Get nearest CTA stations
# The station MultiPoint is the same for every listing, so build it once here
# and reuse it inside the lookup.
stations_multipoint = MultiPoint(station_points)

def get_nearest_station(row):
    """Return the station record nearest to a listing.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'hdpData.homeInfo.longitude' and
        'hdpData.homeInfo.latitude'.

    Returns
    -------
    The entry of ``stations`` whose (lon, lat) key matches the point that
    ``nearest_points`` selects from ``stations_multipoint``.

    NOTE(review): nearest_points works on the raw lon/lat values, so "nearest"
    is presumably measured in planar degrees, not miles - confirm this is
    acceptable for the analysis.
    """
    home = Point(row['hdpData.homeInfo.longitude'],
                 row['hdpData.homeInfo.latitude'])
    # Index 0 is the input point, index 1 is the nearest station
    nearest = nearest_points(home, stations_multipoint)[1]
    # `stations` is keyed by the same (lon, lat) tuples the points were built from
    return stations[(nearest.x, nearest.y)]

# Look up the nearest station for every listing; the returned dicts are
# expanded into one column per key and appended to the listings.
nearest_station_columns = data.apply(get_nearest_station, axis='columns', result_type='expand')
data = pd.concat([data, nearest_station_columns], axis='columns')

data[['coords', 'line', 'station', 'neighborhood', 'address']].sample(n=20)

Unnamed: 0,coords,line,station,neighborhood,address
3955,POINT (-87.60585741747852 41.78030876683156),Green Line,Cottage Grove,Kenwood,"(undisclosed Address), Chicago, IL"
9294,POINT (-87.72917726558204 41.95294026078555),Blue Line,Irving Park,Belmont Cragin,"5101 W Montana St APT 3, Chicago, IL"
11751,POINT (-87.69411112481461 41.96620994853978),Brown Line,Rockwell,Lincoln Square,"2652 W Rascher Ave APT 105, Chicago, IL"
7985,POINT (-87.77413525552275 41.88729278376865),Green Line (Lake),Austin,Austin,"1455 N Mayfield Ave, Chicago, IL"
1857,POINT (-87.62475307324942 41.73537121043779),Red Line,87th,Auburn Gresham,"8550 S Normal Ave, Chicago, IL"
1938,POINT (-87.62441474538349 41.72237598330533),Red Line,95/Dan Ryan,Calumet Heights,"9125 S Harper Ave, Chicago, IL"
4944,POINT (-87.62640238492894 41.85311508247467),Green Line,Cermak-McCormick Pl,Near South Side,"1802 S State St # 206, Chicago, IL"
4874,POINT (-87.62640238492894 41.85311508247467),Green Line,Cermak-McCormick Pl,Near South Side,"1601 S Indiana Ave APT 110, Chicago, IL"
7549,POINT (-87.62802125739587 41.89166520044598),Red Line,Grand,Streeterville,"600 N Fairbanks Ct UNIT 2205, Chicago, IL"
12785,POINT (-87.6728924507902 42.01906321923584),"Red, Yellow, Purple, Evanston Express",Howard,West Ridge,"2938 W Pratt Blvd, Chicago, IL"


In [78]:
def manhattan_distance(row):
    """Return the Manhattan-style distance in miles from a listing to its station.

    The result is the sum of two legs that both start at the listing: one to
    the point at the station's latitude on the listing's longitude, and one to
    the point at the station's longitude on the listing's latitude. Each leg
    is measured with geopy's ``distance.distance`` and read in miles.

    Manhattan distance: https://xlinux.nist.gov/dads/HTML/manhattanDistance.html

    NOTE(review): geopy documents ``distance.distance`` as a geodesic
    calculation by default rather than the Haversine formula
    (https://en.wikipedia.org/wiki/Haversine_formula) - confirm for the
    installed geopy version.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'hdpData.homeInfo.longitude',
        'hdpData.homeInfo.latitude' and 'coords' (an object with ``x`` as
        longitude and ``y`` as latitude).
    """
    home_lon = row['hdpData.homeInfo.longitude']
    home_lat = row['hdpData.homeInfo.latitude']
    station = row['coords']
    home = (home_lat, home_lon)

    # Leg along the listing's longitude, up/down to the station's latitude
    north_south_miles = distance.distance(home, (station.y, home_lon)).miles
    # Leg along the listing's latitude, across to the station's longitude
    east_west_miles = distance.distance(home, (home_lat, station.x)).miles
    return north_south_miles + east_west_miles

# Distance in miles from each listing to its nearest station
data['stationDistance'] = data.apply(manhattan_distance, axis='columns')
preview_columns = ['hdpData.homeInfo.longitude', 'hdpData.homeInfo.latitude', 'coords', 'stationDistance', 'station']
data[preview_columns].sample(n=20)

Unnamed: 0,hdpData.homeInfo.longitude,hdpData.homeInfo.latitude,coords,stationDistance,station
2444,-87.601173,41.750467,POINT (-87.62517835384935 41.75041234179246),1.24441,79th
12497,-87.654696,41.988733,POINT (-87.6590758588342 41.99025949308111),0.330875,Thorndale
7480,-87.615929,41.886201,POINT (-87.62618878473235 41.88322001199881),0.734862,Washington/Wabash
8900,-87.626568,41.909015,POINT (-87.63141229300709 41.90392031189563),0.601362,Clark/Division
3874,-87.615893,41.807086,POINT (-87.61882612858351 41.8092088340794),0.297964,47th
12242,-87.703566,41.996211,POINT (-87.71314203532829 41.96792815058559),2.445026,Kimball
10748,-87.666725,41.945454,POINT (-87.66360955994172 41.94381225523734),0.273832,Southport
7251,-87.634754,41.883629,POINT (-87.63385352993139 41.88269501756125),0.110902,Washington/Wells
2816,-87.659395,41.767895,POINT (-87.66384484844021 41.77895340470035),0.993109,Ashland
8800,-87.652251,41.915025,POINT (-87.65264430600405 41.91821656715077),0.240546,Armitage


In [79]:
# Write the cleaned listings to a CSV file next to the notebook
cleaned_csv_path = 'results-7-31-2019-cleaned.csv'
data.to_csv(cleaned_csv_path)