In [77]:
import pandas as pd
import json
from geopy import distance
from math import sin, cos, sqrt, atan2
from pandas.io.json import json_normalize
from shapely.geometry import shape, Point, MultiPoint
from shapely.ops import nearest_points

In [78]:
# Explicit UTF-8: JSON text is UTF-8 by specification, while open()'s default
# encoding depends on the platform locale.
with open('./results-7-31-2019.json', encoding='utf-8') as f:
    # Flatten the nested JSON structures into dotted column names.
    # pd.json_normalize is the public API (pandas >= 1.0); the
    # pandas.io.json.json_normalize alias is deprecated.
    data = pd.json_normalize(json.load(f))

# Remove non-Chicago entries. na=False treats a missing address as "no match",
# so such rows are dropped instead of putting NaN into the boolean mask
# (which would raise when the mask is used for indexing).
data = data[data['address'].str.contains('Chicago, IL', na=False)]

In [79]:
# Shorten the flattened coordinate column names to 'lat' / 'lon'.
# errors='raise' makes a missing source column fail loudly instead of being
# silently ignored.
data = data.rename(
    columns={
        'hdpData.homeInfo.latitude': 'lat',
        'hdpData.homeInfo.longitude': 'lon',
    },
    errors='raise',
)

In [80]:
# Load the CTA rail station GeoJSON. Explicit UTF-8: GeoJSON is UTF-8 by
# specification, while open()'s default encoding depends on the platform locale.
with open('./CTA_RailStations.geojson', encoding='utf-8') as f:
    stations_geo = json.load(f)

In [81]:
# Load the Chicago neighborhood boundaries GeoJSON. Explicit UTF-8: GeoJSON is
# UTF-8 by specification, while open()'s default encoding depends on the
# platform locale.
with open('./chicago_neighborhoods.geojson', encoding='utf-8') as f:
    neighborhoods_geo = json.load(f)

In [82]:
# Make the neighborhood json easier to work with: keep each feature's
# properties next to a shapely geometry built from its GeoJSON geometry.
neighborhood_polygons = [
    {
        'geometry': shape(feat['geometry']),
        'properties': feat['properties'],
    }
    for feat in neighborhoods_geo['features']
]

In [83]:
def get_neighborhood(row):
    """Return the name of the neighborhood that contains a listing, or None.

    Parameters
    ----------
    row : mapping with 'lon' and 'lat' entries (e.g. a DataFrame row).

    Returns
    -------
    The 'pri_neigh' property of the first entry in ``neighborhood_polygons``
    whose geometry contains the point (lon, lat), or None when no polygon
    contains it.
    """
    # Build the point once rather than once per polygon tested.
    location = Point(row['lon'], row['lat'])
    # next() stops at the first containing polygon instead of scanning them all;
    # the None default covers points that fall outside every neighborhood.
    return next(
        (
            polygon['properties']['pri_neigh']
            for polygon in neighborhood_polygons
            if polygon['geometry'].contains(location)
        ),
        None,
    )

# Add neighborhoods to entries: get_neighborhood is called once per row.
data['neighborhood'] = data.apply(get_neighborhood, axis='columns')

In [84]:
# Remove anything not in a neighborhood (probably incorrectly labeled as being in Chicago)
data = data.loc[data['neighborhood'].notnull()]
# Spot-check a random handful of address -> neighborhood assignments.
data.loc[:, ['address', 'neighborhood']].sample(20)

Unnamed: 0,address,neighborhood
4844,"1330 W Cullerton St, Chicago, IL",Lower West Side
6865,"2138 W Grand Ave # 40, Chicago, IL",West Town
2499,"6244 S Natchez Ave, Chicago, IL",Clearing
6661,"3122 W Walton St UNIT 2, Chicago, IL",Humboldt Park
11427,"4837 N Central Ave APT 103, Chicago, IL",Jefferson Park
11111,"655 W Irving Park Rd APT 2417, Chicago, IL",Lake View
7372,"550 N Kingsbury St APT 417, Chicago, IL",River North
7724,"253 E Delaware Pl APT 7H, Chicago, IL",Streeterville
3535,"5317 S Maryland Ave # 2S, Chicago, IL",Hyde Park
3218,"4158 W 57th Pl, Chicago, IL",West Elsdon


In [85]:
def make_point(point):
    """Convert one GeoJSON station feature into a ``(key, info)`` pair.

    ``key`` is the raw ``(lon, lat)`` tuple read from the feature's
    coordinates. ``info`` is a dict holding a shapely point built from those
    coordinates ('coords'), the feature's LINES property ('line') and its
    Name property ('station').
    """
    coordinates = point['geometry']['coordinates']
    lon, lat = coordinates[0], coordinates[1]
    station_info = {
        'coords': Point(lon, lat),
        'line': point['properties']['LINES'],
        'station': point['properties']['Name'],
    }
    return (lon, lat), station_info
        

# Map each station's raw (lon, lat) tuple to its info dict. make_point returns
# (key, value) pairs, so dict() can consume them directly.
stations = dict(make_point(feature) for feature in stations_geo['features'])
# Shapely points only, in the same order as the dict's values.
station_points = [info['coords'] for info in stations.values()]

In [86]:
# Get nearest CTA stations
# Built once here so the per-row lookup below does not rebuild the geometry.
stations_multipoint = MultiPoint(station_points)


def get_nearest_station(row):
    """Return the info dict of the station closest to a listing.

    Parameters
    ----------
    row : mapping with 'lon' and 'lat' entries (e.g. a DataFrame row).

    Returns
    -------
    The entry of ``stations`` (keys 'coords', 'line', 'station') for the point
    of ``stations_multipoint`` nearest to (lon, lat).

    Raises
    ------
    KeyError
        If the coordinates of the nearest point are not a key of ``stations``.
    """
    home = Point(row['lon'], row['lat'])
    # Index 0 is the input point, index 1 is the nearest station
    nearest = nearest_points(home, stations_multipoint)[1]
    # `stations` is keyed by the raw (lon, lat) tuples, so this lookup relies on
    # the nearest point carrying exactly the coordinates it was built from.
    return stations[(nearest.x, nearest.y)]

# Expand each row's nearest-station dict into 'coords', 'line' and 'station'
# columns and place them alongside the existing listing columns.
data = pd.concat(
    [
        data,
        data.apply(get_nearest_station, axis=1, result_type='expand'),
    ],
    axis=1,
)

data.loc[:, ['coords', 'line', 'station', 'neighborhood', 'address']].sample(20)

Unnamed: 0,coords,line,station,neighborhood,address
11234,POINT (-87.83802598504772 41.98429405688996),Blue Line,Cumberland,O'Hare,"8667 1/2 W Foster Ave UNIT 1B, Chicago, IL"
2995,POINT (-87.60585741747852 41.78030876683156),Green Line,Cottage Grove,Woodlawn,"6340 S Ingleside Ave, Chicago, IL"
5675,POINT (-87.65212799206752 41.88557678778864),"Green (Lake), Pink",Morgan,West Loop,"1155 W Madison St APT 502, Chicago, IL"
12146,POINT (-87.76089227927594 41.97063413718424),Blue Line,Jefferson Park,"Sauganash,Forest Glen","6106 N Knox Ave, Chicago, IL"
3166,POINT (-87.73795637168769 41.78661355009723),Orange Line,Midway,West Elsdon,"4158 W 58th Pl # 4158, Chicago, IL"
3234,POINT (-87.72444964277844 41.79988781946131),Orange Line,Pulaski,West Elsdon,"3615 W 54th St, Chicago, IL"
10776,POINT (-87.67088394357924 41.94370914759255),Brown Line,Paulina,Lake View,"3346 N Marshfield Ave, Chicago, IL"
8102,POINT (-87.69688979878789 41.92193917056461),Blue Line (O'Hare),California,Logan Square,"1905 N Albany Ave # 2, Chicago, IL"
9371,POINT (-87.72917726558204 41.95294026078555),Blue Line,Irving Park,Irving Park,"3244 N Kilbourn Ave UNIT 10, Chicago, IL"
9876,POINT (-87.6528658377562 41.92505136765153),"Red, Brown, Purple (Express)",Fullerton,Sheffield & DePaul,"2525 N Sheffield Ave APT 1C, Chicago, IL"


In [87]:
def manhattan_distance(row):
    """Return the Manhattan distance in miles between a listing and its station.

    The result is the sum of a north-south leg and an east-west leg
    (Manhattan distance, https://xlinux.nist.gov/dads/HTML/manhattanDistance.html).
    Each leg is measured with geopy's ``distance.distance``; per the geopy
    documentation that is the geodesic (ellipsoidal) distance, not the
    Haversine great-circle formula.

    Parameters
    ----------
    row : mapping with 'lon', 'lat' and a 'coords' point exposing ``.x``
        (longitude) and ``.y`` (latitude) of the station.
    """
    home = (row['lat'], row['lon'])
    station = row['coords']

    # Both legs start at the home location: first change only the latitude,
    # then only the longitude (so the east-west leg runs along the home latitude).
    north_south_miles = distance.distance(home, (station.y, row['lon'])).miles
    east_west_miles = distance.distance(home, (row['lat'], station.x)).miles
    return north_south_miles + east_west_miles

# One Manhattan distance (in miles) per listing, computed row by row.
data['stationDistance'] = data.apply(manhattan_distance, axis='columns')
data.loc[:, ['lon', 'lat', 'coords', 'stationDistance', 'station']].sample(20)

Unnamed: 0,lon,lat,coords,stationDistance,station
4473,-87.6044,41.8233,POINT (-87.61903578127783 41.81646322193995),1.227391,43rd
9199,-87.808293,41.936889,POINT (-87.80889493503285 41.98232336946857),3.166774,Harlem
11812,-87.651496,41.973701,POINT (-87.65852996352429 41.97345332194149),0.379359,Argyle
11845,-87.651968,41.971874,POINT (-87.65852996352429 41.97345332194149),0.446966,Argyle
6746,-87.681945,41.882403,POINT (-87.68843451104553 41.87552957304013),0.80908,Western
2941,-87.61286,41.779747,POINT (-87.61554588356856 41.78012971578292),0.165161,King Drive
5714,-87.652968,41.870134,POINT (-87.64964223079028 41.87551620822291),0.543021,UIC-Halsted
7078,-87.633685,41.890239,POINT (-87.63397336028807 41.88896793287827),0.102595,Merchandise Mart
7378,-87.627675,41.894355,POINT (-87.62817635256233 41.89667121189094),0.18571,Chicago
5489,-87.71706,41.865959,POINT (-87.725624930077 41.87388872350696),0.989135,Pulaski


In [88]:
# Save the cleaned data. With to_csv's defaults the DataFrame index is written
# as the first, unnamed column.
data.to_csv(path_or_buf='results-7-31-2019-cleaned.csv')