In [12]:
import pandas as pd
import json
from geopy import distance
from math import sin, cos, sqrt, atan2
from pandas.io.json import json_normalize
from shapely.geometry import shape, Point, MultiPoint
from shapely.ops import nearest_points

In [13]:
# Load the raw listing results. JSON is UTF-8 by specification, so decode it
# explicitly instead of relying on the platform's default encoding.
with open('./results-7-31-2019.json', encoding='utf-8') as f:
    # Flatten nested JSON objects into dotted column names
    # (e.g. 'hdpData.homeInfo.longitude').
    # NOTE(review): `pandas.io.json.json_normalize` is deprecated in newer
    # pandas in favor of `pd.json_normalize` - confirm against the pandas
    # version this notebook is pinned to.
    data = json_normalize(json.load(f))

# Remove non-Chicago entries. `regex=False` matches the text literally, and
# `na=False` treats a missing address as "not Chicago" instead of leaving NaN
# in the mask (boolean indexing rejects masks that contain NaN).
data = data[data['address'].str.contains('Chicago, IL', regex=False, na=False)]

In [14]:
# Load the CTA rail station features. GeoJSON is UTF-8 by specification, so
# decode it explicitly instead of relying on the platform's default encoding.
with open('./CTA_RailStations.geojson', encoding='utf-8') as f:
    stations_geo = json.load(f)

In [15]:
# Load the Chicago neighborhood boundaries. GeoJSON is UTF-8 by specification,
# so decode it explicitly instead of relying on the platform's default encoding.
with open('./chicago_neighborhoods.geojson', encoding='utf-8') as f:
    neighborhoods_geo = json.load(f)

In [16]:
# Pair each neighborhood feature's shapely geometry with its properties so the
# boundaries can be queried directly later on.
neighborhood_polygons = [
    {'geometry': shape(feat['geometry']), 'properties': feat['properties']}
    for feat in neighborhoods_geo['features']
]

In [17]:
def get_neighborhood(row):
    """Return the name of the neighborhood that contains a listing.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'hdpData.homeInfo.longitude' and
        'hdpData.homeInfo.latitude'.

    Returns
    -------
    The 'pri_neigh' property of the first entry in ``neighborhood_polygons``
    whose geometry contains the listing's point, or ``None`` when no polygon
    contains it.
    """
    # Build the point once (x = longitude, y = latitude) instead of once per
    # polygon tested.
    location = Point(row['hdpData.homeInfo.longitude'], row['hdpData.homeInfo.latitude'])
    # Stop at the first containing polygon; fall back to None if there is none.
    return next(
        (
            polygon['properties']['pri_neigh']
            for polygon in neighborhood_polygons
            if polygon['geometry'].contains(location)
        ),
        None,
    )

# Add neighborhoods to entries. `assign` returns a new DataFrame, so the column
# is not written into a frame that was produced by boolean filtering (which is
# what triggers pandas' SettingWithCopyWarning).
data = data.assign(neighborhood=data.apply(get_neighborhood, axis=1))

In [18]:
# Drop listings that matched no neighborhood polygon (probably incorrectly
# labeled as being in Chicago), then preview a random sample of what is left.
data = data.dropna(subset=['neighborhood'])
data.loc[:, ['address', 'neighborhood']].sample(20)

Unnamed: 0,address,neighborhood
11927,"4950 N Marine Dr APT 1003, Chicago, IL",Uptown
5826,"1000 S Michigan Ave # 6802, Chicago, IL",Loop
12055,"5716 N Oriole Ave, Chicago, IL",Norwood Park
8083,"1842 N Albany Ave, Chicago, IL",Logan Square
1506,"10414 S Maryland Ave, Chicago, IL",Pullman
8433,"1747 N Honore St, Chicago, IL",Wicker Park
7552,"505 N Lake Shore Dr APT 4806, Chicago, IL",Streeterville
8888,"70 E Scott St APT 607, Chicago, IL",Gold Coast
5894,"1234 S State St, Chicago, IL",Near South Side
6847,"1516 W Grand Ave APT 4W, Chicago, IL",West Town


In [19]:
def make_point(point):
    """Convert one station GeoJSON feature into a ``(key, record)`` pair.

    Parameters
    ----------
    point : dict
        A feature with ``geometry.coordinates`` (longitude first, latitude
        second) and ``properties`` containing 'LINES' and 'Name'.

    Returns
    -------
    tuple
        ``((lon, lat), record)`` where ``record`` holds the station's
        ``Point`` under 'coords', its 'LINES' value under 'line' and its
        'Name' value under 'station'.
    """
    coordinates = point['geometry']['coordinates']
    lon, lat = coordinates[0], coordinates[1]
    location = Point(lon, lat)
    properties = point['properties']
    record = {
        'coords': location,
        'line': properties['LINES'],
        'station': properties['Name'],
    }
    return (lon, lat), record
        

# Index the station records by their (lon, lat) tuple. `dict()` consumes the
# (key, record) pairs directly, so make_point runs once per feature instead of
# twice.
stations = dict(make_point(point) for point in stations_geo['features'])
# The shapely points of every indexed station.
station_points = [station['coords'] for station in stations.values()]

In [20]:
# Get nearest CTA stations
# Collect every station point into a single shapely MultiPoint geometry.
stations_multipoint = MultiPoint(station_points)
def get_nearest_station(row):
    """Return the record of the CTA station nearest to a listing.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'hdpData.homeInfo.longitude' and
        'hdpData.homeInfo.latitude'.

    Returns
    -------
    dict
        The entry of ``stations`` keyed by the nearest station's (x, y)
        coordinates.

    Raises
    ------
    KeyError
        If the nearest point's coordinates are not a key of ``stations``.
    """
    home = Point(row['hdpData.homeInfo.longitude'], row['hdpData.homeInfo.latitude'])
    # Query the MultiPoint built once at module level rather than rebuilding
    # it from station_points for every row.
    nearest_pair = nearest_points(home, stations_multipoint)
    # Index 0 is the input point, index 1 is the nearest station
    nearest = nearest_pair[1]
    return stations[(nearest.x, nearest.y)]

# Look up the nearest station for every listing and append the resulting
# columns (result_type='expand' spreads each returned dict into columns).
data = pd.concat(
    [
        data,
        data.apply(get_nearest_station, axis=1, result_type='expand'),
    ],
    axis=1,
)

# Preview a random sample of listings alongside their nearest station.
data.loc[:, ['coords', 'line', 'station', 'neighborhood', 'address']].sample(20)

Unnamed: 0,coords,line,station,neighborhood,address
9354,POINT (-87.7085413860205 41.92972804131357),Blue Line,Logan Square,Avondale,"2925 N Whipple St UNIT 1, Chicago, IL"
4795,POINT (-87.7148802641176 41.85399195400796),Pink,Central Park,North Lawndale,"1805 S Trumbull Ave, Chicago, IL"
510,POINT (-87.62441474538349 41.72237598330533),Red Line,95/Dan Ryan,West Pullman,"11845 S Perry Ave, Chicago, IL"
8421,POINT (-87.6873643813982 41.91615742819355),Blue Line (O'Hare),Western,Wicker Park,"1302 N Artesian Ave # E, Chicago, IL"
10184,POINT (-87.65313085718887 41.93273150006873),"Brown, Purple (Express)",Diversey,Lake View,"340 W Diversey Pkwy APT 2018, Chicago, IL"
3850,POINT (-87.61850159987594 41.80209158340622),Green Line,51st,Grand Boulevard,"650 E 51st St # C, Chicago, IL"
8846,POINT (-87.63141229300709 41.90392031189563),Red Line,Clark/Division,Gold Coast,"45 E Cedar St APT 200, Chicago, IL"
6623,POINT (-87.70615542049478 41.8843210952809),Green Line (Lake),Kedzie,Humboldt Park,"643 N Sawyer Ave, Chicago, IL"
10505,POINT (-87.7190993021528 41.94736010155727),Blue Line,Addison,Irving Park,"3853 N Kimball Ave, Chicago, IL"
11691,POINT (-87.65852996352429 41.97345332194149),Red Line,Argyle,Uptown,"1254 W Winnemac Ave # 1N, Chicago, IL"


In [21]:
def manhattan_distance(row):
    """Return the "city block" distance in miles from a listing to its station.

    The result is the sum of a north-south leg and an east-west leg, each
    measured with geopy's ``distance.distance``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'hdpData.homeInfo.longitude', 'hdpData.homeInfo.latitude'
        and 'coords', an object whose ``x`` is the station longitude and whose
        ``y`` is the station latitude.

    Returns
    -------
    The two legs' ``.miles`` values added together.
    """
    home_lon = row['hdpData.homeInfo.longitude']
    home_lat = row['hdpData.homeInfo.latitude']
    station_lon = row['coords'].x
    station_lat = row['coords'].y

    # geopy takes points as (latitude, longitude).
    home = (home_lat, home_lon)

    # Manhattan distance (https://xlinux.nist.gov/dads/HTML/manhattanDistance.html):
    # travel along the home's meridian to the station's latitude, then along
    # the home's parallel to the station's longitude.
    # NOTE(review): `distance.distance` is geopy's default distance (geodesic
    # in current geopy releases), not the Haversine/great-circle formula -
    # confirm against the installed geopy version.
    north_south = distance.distance(home, (station_lat, home_lon)).miles
    east_west = distance.distance(home, (home_lat, station_lon)).miles
    return north_south + east_west

# Attach each listing's block-style distance to its nearest station, then
# preview a random sample of the inputs and the result.
data = data.assign(stationDistance=data.apply(manhattan_distance, axis=1))
data.loc[:, [
    'hdpData.homeInfo.longitude',
    'hdpData.homeInfo.latitude',
    'coords',
    'stationDistance',
    'station',
]].sample(20)

Unnamed: 0,hdpData.homeInfo.longitude,hdpData.homeInfo.latitude,coords,stationDistance,station
2774,-87.65872,41.769082,POINT (-87.66384484844021 41.77895340470035),0.94606,Ashland
4039,-87.602052,41.817741,POINT (-87.61903578127783 41.81646322193995),0.96502,43rd
5937,-87.625378,41.868581,POINT (-87.62658988700279 41.86740495869866),0.143683,Roosevelt
7032,-87.654808,41.898323,POINT (-87.65521427917493 41.89607523646576),0.176082,Chicago
7482,-87.618799,41.887493,POINT (-87.62618878473235 41.88322001199881),0.67601,Washington/Wabash
5544,-87.693407,41.876511,POINT (-87.68843451104553 41.87552957304013),0.324218,Western
3837,-87.659258,41.803863,POINT (-87.6840649867607 41.80468355911743),1.337636,Western
8096,-87.708807,41.906193,POINT (-87.69688979878789 41.92193917056461),1.701159,California
6691,-87.7098,41.885117,POINT (-87.70615542049478 41.8843210952809),0.242894,Kedzie
4812,-87.672074,41.858364,POINT (-87.66916012488008 41.85790770749379),0.181833,18th


In [22]:
# Save the cleaned data to CSV in the current working directory.
# No `index` argument is passed, so pandas' default applies and the DataFrame
# index is written as the first column.
data.to_csv('results-7-31-2019-cleaned.csv')