In [1]:
import pandas as pd
import json
from pandas.io.json import json_normalize
from shapely.geometry import shape, Point, MultiPoint
from shapely.ops import nearest_points

In [2]:
# Load the raw listing results; JSON text is decoded explicitly as UTF-8 rather
# than with the platform's default encoding.
with open('./results-7-31-2019.json', encoding='utf-8') as f:
    # Flatten the nested JSON records into dotted column names
    # (e.g. 'hdpData.homeInfo.latitude').
    data = json_normalize(json.load(f))

# Remove non-Chicago entries. `na=False` treats a missing address as "not Chicago"
# instead of leaving NaN in the mask, which boolean indexing rejects;
# `regex=False` matches the text literally.
data = data[data['address'].str.contains('Chicago, IL', na=False, regex=False)]

In [3]:
# Load the CTA rail station features. GeoJSON is UTF-8 encoded, so decode it
# explicitly rather than relying on the platform's default encoding.
with open('./CTA_RailStations.geojson', encoding='utf-8') as f:
    stations_geo = json.load(f)

In [4]:
# Load the neighborhood features. GeoJSON is UTF-8 encoded, so decode it
# explicitly rather than relying on the platform's default encoding.
with open('./chicago_neighborhoods.geojson', encoding='utf-8') as f:
    neighborhoods_geo = json.load(f)

In [5]:
# Pair every neighborhood feature's shapely geometry with its properties so
# later lookups can work with geometry objects instead of raw GeoJSON dicts.
neighborhood_polygons = [
    {
        'geometry': shape(feat['geometry']),
        'properties': feat['properties'],
    }
    for feat in neighborhoods_geo['features']
]

In [6]:
def get_neighborhood(row):
    """Return the name of the neighborhood that contains a listing, or None.

    Args:
        row: A mapping-like record (e.g. a DataFrame row) providing
            'hdpData.homeInfo.longitude' and 'hdpData.homeInfo.latitude'.

    Returns:
        The 'pri_neigh' property of the first entry in
        ``neighborhood_polygons`` whose geometry contains the listing's
        point, or None when no polygon contains it.
    """
    # Build the point once instead of once per polygon.
    location = Point(row['hdpData.homeInfo.longitude'], row['hdpData.homeInfo.latitude'])
    # Stop at the first match rather than testing every remaining polygon.
    return next(
        (
            polygon['properties']['pri_neigh']
            for polygon in neighborhood_polygons
            if polygon['geometry'].contains(location)
        ),
        None,
    )

# Add neighborhoods to entries. `assign` returns a new frame, so the column is
# not written into a frame that was produced by filtering another one (which
# triggers pandas' SettingWithCopyWarning).
data = data.assign(neighborhood=data.apply(get_neighborhood, axis=1))

In [7]:
# Drop listings that matched no neighborhood polygon (probably incorrectly
# labeled as being in Chicago), then preview the result.
data = data.dropna(subset=['neighborhood'])
data.loc[:, ['address', 'neighborhood']].head()

Unnamed: 0,address,neighborhood
192,"12841 S Union Ave, Chicago, IL",West Pullman
193,"12816 S Sangamon St, Chicago, IL",West Pullman
194,"901 W 129th Pl, Chicago, IL",West Pullman
199,"915 W Vermont Ave, Chicago, IL",West Pullman
202,"12907 S Normal Ave, Chicago, IL",West Pullman


In [8]:
def make_point(point):
    """Convert a GeoJSON point feature into a ``(key, record)`` pair.

    The key is the ``(lon, lat)`` tuple taken from the feature's first two
    coordinates. The record is a dict holding the shapely point under
    'coords' and the feature's 'LINES' and 'Name' properties under 'line'
    and 'station'.
    """
    coordinates = point['geometry']['coordinates']
    lon, lat = coordinates[0], coordinates[1]
    station_point = Point(lon, lat)
    properties = point['properties']
    record = {
        'coords': station_point,
        'line': properties['LINES'],
        'station': properties['Name'],
    }
    return (lon, lat), record
        

# Index stations by their (lon, lat) tuple. make_point already returns
# (key, record) pairs, so feeding them straight to dict() converts each
# feature once instead of twice.
stations = dict(make_point(point) for point in stations_geo['features'])
# Shapely points of all stations, in the dict's iteration order.
station_points = [station['coords'] for station in stations.values()]

In [9]:
# Get nearest CTA stations
# The station collection is identical for every listing, so build it once here
# and reuse it inside the row function.
stations_multipoint = MultiPoint(station_points)


def get_nearest_station(row):
    """Return the station record nearest to a listing.

    Args:
        row: A mapping-like record (e.g. a DataFrame row) providing
            'hdpData.homeInfo.longitude' and 'hdpData.homeInfo.latitude'.

    Returns:
        The entry of ``stations`` (a dict with 'coords', 'line' and
        'station') whose point is closest to the listing.

    Raises:
        KeyError: If the nearest point's (x, y) is not a key of ``stations``.
    """
    listing_point = Point(row['hdpData.homeInfo.longitude'], row['hdpData.homeInfo.latitude'])
    # nearest_points returns one point per input geometry: the first belongs to
    # the listing, the second is the nearest station.
    # NOTE: shapely measures planar distance, so "nearest" here is in raw
    # longitude/latitude units, not metres.
    _, station_point = nearest_points(listing_point, stations_multipoint)
    # Stations are keyed by their exact (lon, lat) tuple.
    return stations[(station_point.x, station_point.y)]

# Expand each listing's nearest-station record into columns and append them
# to the listings.
data = pd.concat(
    [data, data.apply(get_nearest_station, axis='columns', result_type='expand')],
    axis='columns',
)

# Preview the station columns next to each listing's neighborhood and address.
data.loc[:, ['coords', 'line', 'station', 'neighborhood', 'address']].head()

Unnamed: 0,coords,line,station,neighborhood,address
192,POINT (-87.62441474538349 41.72237598330533),Red Line,95/Dan Ryan,West Pullman,"12841 S Union Ave, Chicago, IL"
193,POINT (-87.62441474538349 41.72237598330533),Red Line,95/Dan Ryan,West Pullman,"12816 S Sangamon St, Chicago, IL"
194,POINT (-87.62441474538349 41.72237598330533),Red Line,95/Dan Ryan,West Pullman,"901 W 129th Pl, Chicago, IL"
199,POINT (-87.62441474538349 41.72237598330533),Red Line,95/Dan Ryan,West Pullman,"915 W Vermont Ave, Chicago, IL"
202,POINT (-87.62441474538349 41.72237598330533),Red Line,95/Dan Ryan,West Pullman,"12907 S Normal Ave, Chicago, IL"


In [10]:
# Write the cleaned listings to a CSV file next to the notebook.
data.to_csv(path_or_buf='results-7-31-2019-cleaned.csv')