In [1]:
import pandas as pd
import json
from numpy import nan
from geopy import distance
from pandas.io.json import json_normalize
from shapely.geometry import shape, Point, MultiPoint
from shapely.ops import nearest_points

In [2]:
# Load the raw listing export and flatten its nested JSON objects into
# dotted column names (e.g. 'latLong.longitude').
# JSON is UTF-8 by specification, so the encoding is stated explicitly
# instead of relying on the platform default.
with open('./results-11-30-2019.json', encoding='utf-8') as f:
    data = json_normalize(json.load(f))

# Remove non-Chicago entries.
# regex=False: the needle is a literal string, not a pattern.
# na=False: a listing with a missing address counts as "not Chicago" rather
# than producing a NaN in the mask, which boolean indexing rejects.
data = data[data['address'].str.contains('Chicago, IL', regex=False, na=False)]

In [3]:
# Give the flattened (dotted) columns short names. errors='raise' makes the
# cell fail loudly if any of the expected source columns is absent.
data = data.rename(
    errors='raise',
    columns={
        'hdpData.homeInfo.homeType': 'homeType',
        'hdpData.homeInfo.yearBuilt': 'yearBuilt',
        'latLong.latitude': 'lat',
        'latLong.longitude': 'lon',
    },
)

In [4]:
# Keep only the columns used by the rest of the notebook, in this order.
data = data[[
    'price',
    'beds',
    'baths',
    'area',
    'lat',
    'lon',
    'yearBuilt',
    'homeType',
]]

In [5]:
# Parse the CTA rail station GeoJSON into plain Python objects.
# The encoding is explicit so the result does not depend on the platform's
# default text encoding (JSON is UTF-8 by specification).
with open('./CTA_RailStations.geojson', encoding='utf-8') as f:
    stations_geo = json.load(f)

In [6]:
# Parse the Chicago neighborhood boundary GeoJSON into plain Python objects.
# The encoding is explicit so the result does not depend on the platform's
# default text encoding (JSON is UTF-8 by specification).
with open('./chicago_neighborhoods.geojson', encoding='utf-8') as f:
    neighborhoods_geo = json.load(f)

In [7]:
# Pair each GeoJSON feature's properties with a shapely geometry built from
# its 'geometry' member, so containment checks can be run against it directly.
neighborhood_polygons = [
    {
        'geometry': shape(neighborhood['geometry']),
        'properties': neighborhood['properties'],
    }
    for neighborhood in neighborhoods_geo['features']
]

In [8]:
def get_neighborhood(row):
    """Return the neighborhood name for one listing, or None if there is none.

    Parameters
    ----------
    row : mapping
        Must provide ``row['lon']`` and ``row['lat']``.

    Returns
    -------
    The ``'pri_neigh'`` property of the first entry of
    ``neighborhood_polygons`` whose geometry contains the listing's point,
    or ``None`` when no entry contains it.
    """
    # Build the point once instead of once per polygon tested.
    location = Point(row['lon'], row['lat'])
    # Only the first match is ever used, so stop scanning as soon as it is found.
    return next(
        (
            polygon['properties']['pri_neigh']
            for polygon in neighborhood_polygons
            if polygon['geometry'].contains(location)
        ),
        None,
    )

# Label every listing with the neighborhood returned by get_neighborhood
# (applied row by row).
data['neighborhood'] = data.apply(func=get_neighborhood, axis=1)

In [9]:
# Listings with no neighborhood are dropped; they were probably labeled as
# being in Chicago by mistake.
data = data[data['neighborhood'].notna()]
# Spot-check a random handful of the remaining rows.
data[['lat', 'lon', 'neighborhood']].sample(n=20)

Unnamed: 0,lat,lon,neighborhood
13518,42.00934,-87.660854,Rogers Park
2310,41.744926,-87.730642,Ashburn
11396,41.961521,-87.69667,Albany Park
10860,41.955418,-87.833368,Dunning
7218,41.883794,-87.652708,West Loop
11232,41.960189,-87.7201,Irving Park
8776,41.913144,-87.687682,Wicker Park
7605,41.894355,-87.627675,River North
2072,41.732006,-87.590111,Calumet Heights
11081,41.958872,-87.770691,Portage Park


In [10]:
# Convert prices to numbers.
# The '$' and ',' characters are stripped as literals (regex=False matters:
# '$' is an anchor when treated as a pattern). astype(str) lets the cell cope
# with missing or already-numeric prices, so it can be re-run safely.
# errors='coerce' turns empty or otherwise unparseable prices into NaN so
# they are dropped below instead of aborting the whole cell.
data['price'] = pd.to_numeric(
    data['price']
    .astype(str)
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False),
    errors='coerce',
)

print(f'Records before drop: {data.shape}')
# NOTE: dropna() removes rows with a missing value in ANY column, not only
# rows whose price is missing.
data = data.dropna()
# Keep only rows with a positive build year.
data = data[data['yearBuilt'] > 0]
print(f'Records after drop: {data.shape}')
data.sample(20)

Records before drop: (10277, 9)
Records after drop: (7436, 9)


Unnamed: 0,price,beds,baths,area,lat,lon,yearBuilt,homeType,neighborhood
12696,970000.0,4.0,4.0,3604.0,41.98739,-87.741593,1923,SINGLE_FAMILY,"Sauganash,Forest Glen"
3442,239900.0,4.0,2.0,1181.0,41.798627,-87.719985,1944,SINGLE_FAMILY,West Elsdon
10168,565000.0,3.0,2.0,1850.0,41.939774,-87.684171,2005,CONDO,North Center
8442,289900.0,3.0,2.0,858.0,41.920919,-87.749551,1911,SINGLE_FAMILY,Belmont Cragin
9144,1249000.0,2.0,4.0,3826.0,41.906923,-87.630967,2018,TOWNHOUSE,Gold Coast
10556,1490000.0,2.0,2.0,1569.0,41.929235,-87.639921,2010,CONDO,Lincoln Park
538,134900.0,4.0,2.0,1400.0,41.681386,-87.656288,1961,SINGLE_FAMILY,West Pullman
7199,339000.0,1.0,1.0,818.0,41.892082,-87.628862,2003,CONDO,River North
8753,899000.0,5.0,5.0,1.0,41.920921,-87.681767,1892,MULTI_FAMILY,Bucktown
4218,595000.0,4.0,4.0,3000.0,41.817814,-87.615533,2007,SINGLE_FAMILY,Grand Boulevard


In [11]:
def make_point(point):
    """Turn one station GeoJSON feature into a ``(key, info)`` pair.

    Parameters
    ----------
    point : mapping
        Read as ``point['geometry']['coordinates']`` (items 0 and 1 are taken
        as lon and lat) and ``point['properties']`` (``'LINES'`` and
        ``'Name'``).

    Returns
    -------
    tuple
        ``((lon, lat), info)`` where ``info`` has the keys ``'coords'`` (a
        ``Point(lon, lat)``), ``'line'`` and ``'station'``, in that order.
    """
    coordinates = point['geometry']['coordinates']
    lon, lat = coordinates[0], coordinates[1]
    station_info = dict(
        coords=Point(lon, lat),
        line=point['properties']['LINES'],
        station=point['properties']['Name'],
    )
    return (lon, lat), station_info
        

# Index station details by their (lon, lat) pair. make_point already returns
# (key, value) tuples, so dict() can consume them directly; this avoids the
# assignment expression that would otherwise leave a stray `p` in the
# notebook namespace.
stations = dict(make_point(feature) for feature in stations_geo['features'])
# Station geometries, in the same order as the entries of `stations`.
station_points = [info['coords'] for info in stations.values()]

In [12]:
# Get nearest CTA stations
# All station points gathered into a single MultiPoint geometry.
stations_multipoint = MultiPoint(
    station_points
)
def get_nearest_station(row):
    """Return the `stations` entry for the station point nearest to a listing.

    Parameters
    ----------
    row : mapping
        Must provide ``row['lon']`` and ``row['lat']``.

    Returns
    -------
    The value stored in ``stations`` under the ``(x, y)`` of the nearest
    point in ``stations_multipoint``.

    Raises
    ------
    KeyError
        If the nearest point's ``(x, y)`` is not a key of ``stations``.
    """
    listing = Point(row['lon'], row['lat'])
    # Reuse the MultiPoint built once at module level instead of constructing
    # a new one for every row.
    # nearest_points returns one point per input geometry, in input order:
    # index 0 belongs to the listing, index 1 is the nearest station.
    # NOTE(review): this presumably measures "nearest" in raw lon/lat degrees
    # rather than ground distance; confirm that is acceptable.
    station_point = nearest_points(listing, stations_multipoint)[1]
    # `stations` is keyed by (lon, lat); this lookup relies on the coordinates
    # coming back from shapely exactly as they went in.
    return stations[(station_point.x, station_point.y)]

# Append the nearest-station fields as new columns; result_type='expand'
# spreads each returned dict across one column per key.
data = pd.concat(
    [
        data,
        data.apply(get_nearest_station, axis=1, result_type='expand'),
    ],
    axis=1,
)

# Spot-check a random handful of rows.
data[['coords', 'line', 'station', 'neighborhood']].sample(n=20)

Unnamed: 0,coords,line,station,neighborhood
10318,POINT (-87.6528658377562 41.92505136765153),"Red, Brown, Purple (Express)",Fullerton,Lincoln Park
9167,POINT (-87.63141229300709 41.90392031189563),Red Line,Clark/Division,Gold Coast
1960,POINT (-87.62441474538349 41.72237598330533),Red Line,95/Dan Ryan,Washington Heights
9206,POINT (-87.63141229300709 41.90392031189563),Red Line,Clark/Division,Gold Coast
6651,POINT (-87.77413525552275 41.88729278376865),Green Line (Lake),Austin,Austin
7768,POINT (-87.62802125739587 41.89166520044598),Red Line,Grand,Streeterville
9329,POINT (-87.65264430600405 41.91821656715077),"Brown, Purple (Express)",Armitage,Old Town
7450,POINT (-87.65521427917493 41.89607523646576),Blue Line,Chicago,West Town
2122,POINT (-87.62441474538349 41.72237598330533),Red Line,95/Dan Ryan,Chatham
10019,POINT (-87.69688979878789 41.92193917056461),Blue Line (O'Hare),California,Logan Square


In [13]:
def manhattan_distance(row):
    """Return the Manhattan-style distance in miles from a home to its station.

    The distance is the sum of two legs that both start at the home:
    one along the home's longitude to the station's latitude, and one along
    the home's latitude to the station's longitude
    (https://xlinux.nist.gov/dads/HTML/manhattanDistance.html).

    Each leg is measured with ``geopy``'s ``distance.distance``.
    NOTE(review): geopy documents ``distance.distance`` as geodesic rather
    than Haversine - confirm for the installed version.

    Parameters
    ----------
    row : mapping
        Must provide ``row['lon']``, ``row['lat']`` and ``row['coords']``
        (an object with ``.x`` = station longitude and ``.y`` = station
        latitude).

    Returns
    -------
    The two leg lengths added together, taken from each leg's ``.miles``.
    """
    home = (row['lat'], row['lon'])
    station = row['coords']
    # Corner points reached by moving along one axis only.
    same_longitude_corner = (station.y, row['lon'])
    same_latitude_corner = (row['lat'], station.x)
    return sum(
        distance.distance(home, corner).miles
        for corner in (same_longitude_corner, same_latitude_corner)
    )

# Distance in miles from each listing to its nearest station, computed row by row.
data['stationDistance'] = data.apply(func=manhattan_distance, axis=1)
# Spot-check a random handful of rows.
data[['lon', 'lat', 'coords', 'stationDistance', 'station']].sample(n=20)

Unnamed: 0,lon,lat,coords,stationDistance,station
2383,-87.674322,41.761691,POINT (-87.66384484844021 41.77895340470035),1.732747,Ashland
11630,-87.645652,41.950426,POINT (-87.65362593077103 41.94742799653081),0.617739,Addison
13501,-87.683035,42.013449,POINT (-87.6728924507902 42.01906321923584),0.909521,Howard
7357,-87.655811,41.882335,POINT (-87.65212799206752 41.88557678778864),0.41369,Morgan
8174,-87.625121,41.899641,POINT (-87.62817635256233 41.89667121189094),0.362504,Chicago
10931,-87.800746,41.951898,POINT (-87.80889493503285 41.98232336946857),2.51971,Harlem
7229,-87.626931,41.889047,POINT (-87.62802125739587 41.89166520044598),0.236924,Grand
10187,-87.661911,41.929841,POINT (-87.65313085718887 41.93273150006873),0.652001,Diversey
7835,-87.62159,41.88656,POINT (-87.62618878473235 41.88322001199881),0.467684,Washington/Wabash
10103,-87.668546,41.928106,POINT (-87.67088394357924 41.94370914759255),1.197377,Paulina


In [14]:
# One-hot encode the neighborhood and home type columns and append the
# indicator columns to the frame. drop_first=True omits the first category of
# each, so that category is represented by all of its indicators being zero.
data = pd.concat(
    [
        data,
        pd.get_dummies(data['neighborhood'], drop_first=True),
        pd.get_dummies(data['homeType'], drop_first=True),
    ],
    axis=1,
)

In [15]:
# Write the cleaned frame to disk. The row index is written as well, as the
# first (unnamed) CSV column.
data.to_csv('results-11-30-2019-cleaned.csv', index=True)