In [46]:
import pandas as pd
import json
from math import sin, cos, sqrt, atan2
from pandas.io.json import json_normalize
from shapely.geometry import shape, Point, MultiPoint
from shapely.ops import nearest_points

In [47]:
with open('./results-7-31-2019.json') as f:
    # flatten json structures
    data = json_normalize(json.load(f))
    # Remove non-Chicago entries
    data = data[data['address'].str.contains('Chicago, IL')]

In [48]:
with open('./CTA_RailStations.geojson') as f:
    stations_geo = json.load(f)

In [49]:
with open('./chicago_neighborhoods.geojson') as f:
    neighborhoods_geo = json.load(f)

In [50]:
# Make the neighborhood json easier to work with
neighborhood_polygons = [{'geometry': shape(feature['geometry']), 'properties': feature['properties']} for feature in neighborhoods_geo['features']]

In [51]:
def get_neighborhood(row):
    lon = row['hdpData.homeInfo.longitude']
    lat = row['hdpData.homeInfo.latitude']
    hoods = [polygon['properties']['pri_neigh'] for polygon in neighborhood_polygons if polygon['geometry'].contains(Point(lon, lat))]
    return hoods[0] if len(hoods) > 0 else None

# Add neighborhoods to entries
data['neighborhood'] = data.apply(get_neighborhood, axis=1)

In [52]:
# Remove anything not in a neighborhood (probably incorrectly labeled as being in Chicago)
data = data[data['neighborhood'].notnull()]
data[['address', 'neighborhood']].sample(20)

Unnamed: 0,address,neighborhood
11475,"4910 N Drake Ave APT 3, Chicago, IL",Albany Park
3773,"4008 S California Ave, Chicago, IL",Brighton Park
9646,"2630 W Medill Ave, Chicago, IL",Logan Square
2422,"8000 S Indiana Ave # 1, Chicago, IL",Chatham
8660,"1632 N Orchard St # 201S, Chicago, IL",Old Town
2710,"6327 S Wood St, Chicago, IL",Englewood
11352,"5034 N Newcastle Ave, Chicago, IL",Norwood Park
4860,"31 E 26th St APT 4, Chicago, IL",Douglas
4889,"924 W 18th Pl, Chicago, IL",Lower West Side
682,"3337 W 114th St, Chicago, IL",Mount Greenwood


In [53]:
def make_point(point):
    lon = point['geometry']['coordinates'][0]
    lat = point['geometry']['coordinates'][1]
    return (lon, lat), {
            'coords': Point(lon, lat),
            'line': point['properties']['LINES'],
            'station': point['properties']['Name']
        }
        

stations = {make_point(point)[0]: make_point(point)[1] for point in stations_geo['features']}
station_points = [station['coords'] for station in stations.values()]

In [54]:
# Get nearest CTA stations
stations_multipoint = MultiPoint(station_points)
def get_nearest_station(row):
    lon = row['hdpData.homeInfo.longitude']
    lat = row['hdpData.homeInfo.latitude']
    p = Point(lon, lat)
    np = nearest_points(p, MultiPoint(station_points))
    # Index 0 is the input point, index 1 is the nearest station
    nearest_station = stations[(np[1].x, np[1].y)]
    return nearest_station

data = pd.concat([data, data.apply(get_nearest_station, axis=1, result_type='expand')], axis=1)

data[['coords', 'line', 'station', 'neighborhood', 'address']].sample(20)

Unnamed: 0,coords,line,station,neighborhood,address
3395,POINT (-87.64402312721801 41.77902786718354),Green Line,Halsted,Englewood,"5705 S Emerald Ave, Chicago, IL"
6617,POINT (-87.72540376831816 41.88541197876197),Green Line (Lake),Pulaski,Humboldt Park,"1030 N Ridgeway Ave, Chicago, IL"
12402,POINT (-87.68850260590037 41.96625004727976),Brown Line,Western,West Ridge,"6101 N Western Ave, Chicago, IL"
2683,POINT (-87.66384484844021 41.77895340470035),Green Line (Englewood),Ashland,Englewood,"2021 W 71st St, Chicago, IL"
12128,POINT (-87.76089227927594 41.97063413718424),Blue Line,Jefferson Park,"Sauganash,Forest Glen","6244 N Le Mai Ave, Chicago, IL"
5503,POINT (-87.72540376831816 41.88541197876197),Green Line (Lake),Pulaski,Garfield Park,"4221 W Madison St, Chicago, IL"
11550,POINT (-87.74361697430822 41.96148663210283),Blue Line,Montrose,Albany Park,"4450 W Gunnison St APT 3C, Chicago, IL"
8304,POINT (-87.6873643813982 41.91615742819355),Blue Line (O'Hare),Western,Logan Square,"2511 W Moffat St # 210-I, Chicago, IL"
7614,POINT (-87.62817635256233 41.89667121189094),Red Line,Chicago,Gold Coast,"1000 N Lake Shore Plz APT 13A, Chicago, IL"
10450,POINT (-87.76089227927594 41.97063413718424),Blue Line,Jefferson Park,Portage Park,"5506 W Eddy St, Chicago, IL"


In [55]:
def manhattan_distance(row):
    R = 3958.8
    home_lon = math.radians(row['hdpData.homeInfo.longitude'])
    home_lat = math.radians(row['hdpData.homeInfo.latitude'])
    station_lon = math.radians(row['coords'].x)
    station_lat = math.radians(row['coords'].y)

    # Calculate Haversine distance (https://en.wikipedia.org/wiki/Haversine_formula) 
    # using Manhattan distance (https://xlinux.nist.gov/dads/HTML/manhattanDistance.html)
    return haversine(home_lon, station_lon) + haversine(home_lat, station_lat)

def haversine(val1, val2):
    R = 3958.8

    coord_distance = abs(val1 - val2)
    a = sin(coord_distance / 2) ** 2
    c = 2 * atan2(sqrt(a), sqrt(1-a))
    return R * c

data['stationDistance'] = data.apply(manhattan_distance, axis=1)
data[['hdpData.homeInfo.longitude', 'hdpData.homeInfo.latitude', 'coords', 'stationDistance', 'station']].sample(20)

Unnamed: 0,hdpData.homeInfo.longitude,hdpData.homeInfo.latitude,coords,stationDistance,station
8277,-87.697201,41.910411,POINT (-87.6873643813982 41.91615742819355),1.076697,Western
3740,-87.689293,41.81946,POINT (-87.68058736862578 41.8295600871101),1.299364,35th/Archer
11275,-87.801997,41.980827,POINT (-87.80889493503285 41.98232336946857),0.579997,Harlem
3783,-87.692453,41.801862,POINT (-87.6840649867607 41.80468355911743),0.774515,Western
2670,-87.687252,41.764121,POINT (-87.66384484844021 41.77895340470035),2.642128,Ashland
3289,-87.66791,41.794783,POINT (-87.66384484844021 41.77895340470035),1.37461,Ashland
7607,-87.615103,41.893069,POINT (-87.62802125739587 41.89166520044598),0.98957,Grand
1364,-87.658682,41.713128,POINT (-87.62441474538349 41.72237598330533),3.006646,95/Dan Ryan
11068,-87.644057,41.945785,POINT (-87.65362593077103 41.94742799653081),0.774678,Addison
11509,-87.732884,41.965249,POINT (-87.74361697430822 41.96148663210283),1.001543,Montrose


In [56]:
# save cleaned data
data.to_csv('results-7-31-2019-cleaned.csv')