In [17]:
import pandas as pd
import json
from geopy import distance
from math import sin, cos, sqrt, atan2
from pandas.io.json import json_normalize
from shapely.geometry import shape, Point, MultiPoint
from shapely.ops import nearest_points

In [18]:
# JSON text is UTF-8 by specification; open it explicitly so the result does
# not depend on the platform's default locale encoding.
with open('./results-7-31-2019.json', encoding='utf-8') as f:
    # flatten json structures
    # NOTE(review): `pandas.io.json.json_normalize` is deprecated since
    # pandas 1.0 in favour of `pd.json_normalize` - switch once the minimum
    # supported pandas version allows it.
    data = json_normalize(json.load(f))

# Remove non-Chicago entries.
# - regex=False: the needle is a literal substring, not a pattern.
# - na=False: rows with a missing address count as "not Chicago" instead of
#   putting NaN into the boolean mask (which makes the indexing below raise).
is_chicago = data['address'].str.contains('Chicago, IL', regex=False, na=False)
# .copy() gives an independent frame so that adding columns in later cells
# does not trigger SettingWithCopyWarning.
data = data[is_chicago].copy()

In [19]:
# Load the CTA rail station features. GeoJSON is UTF-8 by specification, so
# the encoding is given explicitly instead of relying on the platform default.
with open('./CTA_RailStations.geojson', encoding='utf-8') as f:
    stations_geo = json.load(f)

In [20]:
# Load the Chicago neighborhood boundary features. GeoJSON is UTF-8 by
# specification, so the encoding is given explicitly instead of relying on
# the platform default.
with open('./chicago_neighborhoods.geojson', encoding='utf-8') as f:
    neighborhoods_geo = json.load(f)

In [21]:
# Pair each feature's shapely geometry with its GeoJSON properties so the
# neighborhood lookup can work with ready-made geometry objects.
neighborhood_polygons = [
    dict(geometry=shape(feat['geometry']), properties=feat['properties'])
    for feat in neighborhoods_geo['features']
]

In [22]:
def get_neighborhood(row):
    """Return the name of the neighborhood that contains a listing.

    Parameters
    ----------
    row : mapping
        Must provide 'hdpData.homeInfo.longitude' and
        'hdpData.homeInfo.latitude'.

    Returns
    -------
    The 'pri_neigh' property of the first entry in ``neighborhood_polygons``
    whose geometry contains the listing's point, or ``None`` when no polygon
    contains it.
    """
    # Build the point once; it is the same for every polygon tested.
    location = Point(row['hdpData.homeInfo.longitude'],
                     row['hdpData.homeInfo.latitude'])
    # The generator stops at the first containing polygon, so the remaining
    # polygons are not tested once a match is found.
    return next(
        (polygon['properties']['pri_neigh']
         for polygon in neighborhood_polygons
         if polygon['geometry'].contains(location)),
        None,
    )

# Tag every listing with the neighborhood returned by get_neighborhood
neighborhood_by_listing = data.apply(get_neighborhood, axis=1)
data['neighborhood'] = neighborhood_by_listing

In [23]:
# Listings that matched no neighborhood polygon are dropped; they were
# probably mislabeled as Chicago addresses.
has_neighborhood = data['neighborhood'].notna()
data = data[has_neighborhood]
data[['address', 'neighborhood']].sample(n=20)

Unnamed: 0,address,neighborhood
9877,"1235 W George St APT 214, Chicago, IL",Lake View
10949,"655 W Irving Park Rd APT 2502, Chicago, IL",Lake View
9980,"631 W Schubert Ave, Chicago, IL",Lincoln Park
5986,"1000 W Adams St APT 817, Chicago, IL",West Loop
10384,"4420 N Meade Ave, Chicago, IL",Portage Park
8084,"2121 N St Louis Ave # 3S, Chicago, IL",Logan Square
2806,"1259 W 72nd Pl, Chicago, IL",Englewood
5893,"1025 W Monroe St UNIT 3W, Chicago, IL",West Loop
5448,"3531 W Van Buren St, Chicago, IL",Garfield Park
3437,"1317 E 52nd St # 2, Chicago, IL",Hyde Park


In [24]:
def make_point(point):
    """Turn one station GeoJSON feature into a ``(key, info)`` pair.

    ``key`` is the ``(lon, lat)`` tuple taken from the first two entries of
    the feature's coordinates. ``info`` is a dict holding the location as a
    ``Point`` under 'coords', the feature's 'LINES' property under 'line' and
    its 'Name' property under 'station'.
    """
    coordinates = point['geometry']['coordinates']
    lon, lat = coordinates[0], coordinates[1]
    properties = point['properties']
    info = {
        'coords': Point(lon, lat),
        'line': properties['LINES'],
        'station': properties['Name'],
    }
    return (lon, lat), info
        

# Index station info by its (lon, lat) key. make_point returns (key, info)
# pairs, which dict() consumes directly - no walrus temporary is needed, so
# nothing leaks into the notebook namespace.
# Features that share identical coordinates collapse into one entry (the last
# one wins), exactly as with the equivalent dict comprehension.
stations = dict(make_point(feature) for feature in stations_geo['features'])
station_points = [station['coords'] for station in stations.values()]

In [25]:
# Get nearest CTA stations
# Collect every station point into a single MultiPoint geometry, built once
# here at cell level; intended as the search target for nearest-station
# lookups.
stations_multipoint = MultiPoint(station_points)
def get_nearest_station(row):
    """Return the info dict of the CTA station nearest to a listing.

    Parameters
    ----------
    row : mapping
        Must provide 'hdpData.homeInfo.longitude' and
        'hdpData.homeInfo.latitude'.

    Returns
    -------
    The entry of ``stations`` keyed by the nearest station's (x, y)
    coordinates.

    Raises
    ------
    KeyError
        If the coordinates of the nearest point are not a key of ``stations``.
    """
    home = Point(row['hdpData.homeInfo.longitude'],
                 row['hdpData.homeInfo.latitude'])
    # Query the module-level `stations_multipoint` instead of rebuilding a
    # MultiPoint from `station_points` for every row.
    # nearest_points returns one point per input geometry, in input order:
    # the first belongs to `home`, the second is the nearest station.
    _, nearest = nearest_points(home, stations_multipoint)
    return stations[(nearest.x, nearest.y)]

# Look up the nearest station for every listing; result_type='expand' turns
# each returned dict into columns ('coords', 'line', 'station').
nearest_station_info = data.apply(get_nearest_station, axis=1, result_type='expand')

# Drop any station columns left over from an earlier run of this cell before
# concatenating. Without this, re-running the cell appends duplicate
# 'coords'/'line'/'station' columns and later row['coords'] lookups stop
# returning a single value.
data = pd.concat(
    [data.drop(columns=nearest_station_info.columns, errors='ignore'),
     nearest_station_info],
    axis=1,
)

data[['coords', 'line', 'station', 'neighborhood', 'address']].sample(20)

Unnamed: 0,coords,line,station,neighborhood,address
11724,POINT (-87.68850260590037 41.96625004727976),Brown Line,Western,Lincoln Square,"4751 N Artesian Ave APT 203, Chicago, IL"
454,POINT (-87.62441474538349 41.72237598330533),Red Line,95/Dan Ryan,West Pullman,"12213 S Carpenter St, Chicago, IL"
6775,POINT (-87.67743665449984 41.9097437915267),Blue Line,Damen,Ukrainian Village,"915 N Hoyne Ave # 4, Chicago, IL"
10700,POINT (-87.67473610744641 41.94702865443908),Brown Line,Addison,North Center,"3319 N Leavitt St, Chicago, IL"
6752,POINT (-87.66649606690224 41.90335508455351),Blue Line,Division,East Village,"930 N Wood St # 3, Chicago, IL"
11766,POINT (-87.68850260590037 41.96625004727976),Brown Line,Western,Lincoln Square,"2436 W Winnemac Ave, Chicago, IL"
4421,POINT (-87.63063629512068 41.83119055925151),Red Line,Sox-35th,Bridgeport,"3450 S Halsted St UNIT 209, Chicago, IL"
9313,POINT (-87.72917726558204 41.95294026078555),Blue Line,Irving Park,Irving Park,"4152 W School St, Chicago, IL"
3398,POINT (-87.66384484844021 41.77895340470035),Green Line (Englewood),Ashland,Englewood,"5518 S Bishop St, Chicago, IL"
8689,POINT (-87.63141229300709 41.90392031189563),Red Line,Clark/Division,Gold Coast,"1358 N Dearborn St, Chicago, IL"


In [26]:
def manhattan_distance(row):
    """Return the Manhattan-style distance in miles from a listing to its station.

    The trip is split into two legs that both start at the listing: one to
    the point at the station's latitude on the listing's longitude, and one
    to the point at the station's longitude on the listing's latitude. The
    two leg lengths are summed
    (https://xlinux.nist.gov/dads/HTML/manhattanDistance.html).

    Each leg is measured with geopy's ``distance.distance``.
    NOTE(review): in current geopy releases that is an ellipsoidal geodesic,
    not the haversine (great-circle) formula - confirm against the installed
    geopy version.

    ``row`` must provide 'hdpData.homeInfo.longitude',
    'hdpData.homeInfo.latitude' and 'coords' (an object with ``x`` = longitude
    and ``y`` = latitude of the station).
    """
    # geopy expects (latitude, longitude) ordering
    home = (row['hdpData.homeInfo.latitude'], row['hdpData.homeInfo.longitude'])
    station = row['coords']

    north_south_leg = distance.distance(home, (station.y, home[1])).miles
    east_west_leg = distance.distance(home, (home[0], station.x)).miles
    return north_south_leg + east_west_leg

# Distance in miles from each listing to its nearest station, as computed by
# manhattan_distance
station_distance_miles = data.apply(manhattan_distance, axis=1)
data['stationDistance'] = station_distance_miles

preview_columns = [
    'hdpData.homeInfo.longitude',
    'hdpData.homeInfo.latitude',
    'coords',
    'stationDistance',
    'station',
]
data[preview_columns].sample(n=20)

Unnamed: 0,hdpData.homeInfo.longitude,hdpData.homeInfo.latitude,coords,stationDistance,station
7785,-87.621348,41.89953,POINT (-87.62817635256233 41.89667121189094),0.549387,Chicago
8821,-87.627926,41.909014,POINT (-87.63141229300709 41.90392031189563),0.531283,Clark/Division
8900,-87.626568,41.909015,POINT (-87.63141229300709 41.90392031189563),0.601362,Clark/Division
3861,-87.590812,41.806859,POINT (-87.61850159987594 41.80209158340622),1.758818,51st
10606,-87.702676,41.947502,POINT (-87.71235935057341 41.93813166884105),1.145631,Belmont
4778,-87.726606,41.86135,POINT (-87.72432672502003 41.85385683028695),0.634743,Pulaski
4379,-87.652114,41.831844,POINT (-87.66531166332861 41.83935587785577),1.199654,Ashland
8213,-87.681456,41.916376,POINT (-87.6873643813982 41.91615742819355),0.319653,Western
3353,-87.629361,41.784277,POINT (-87.63100044907034 41.78053433363163),0.342986,63rd
11647,-87.669353,41.97982,POINT (-87.65866841969589 41.97798399160598),0.676944,Berwyn


In [27]:
# Write the cleaned and enriched listings to CSV in the working directory
cleaned_csv_path = 'results-7-31-2019-cleaned.csv'
data.to_csv(cleaned_csv_path)