# (prototype) Preprocess school locations

Besides dropping the columns we do not use, we derive one new piece of information for every school: the borough it is located in.

Each school's borough is determined from its coordinates, by finding the borough polygon on the map that contains them.

In [1]:
import json
import re

import pandas as pd
import shapely.geometry as geo

# Display every column of a DataFrame instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

In [17]:
# Columns kept from the raw school locations file; all others are dropped.
columns = [
    'ATS SYSTEM CODE',
    'LOCATION_CODE',
    'LOCATION_NAME',
    'MANAGED_BY_NAME',
    'PRIMARY_BUILDING_CODE',
    'COMMUNITY_SCHOOL_SUP_NAME',
    'Location 1',
]

df = (
    pd.read_csv('../data/raw/school_locations_20172018.csv')[columns]
    # The code column is whitespace-stripped before it becomes the index.
    .assign(**{'ATS SYSTEM CODE': lambda frame: frame['ATS SYSTEM CODE'].str.strip()})
    .set_index('ATS SYSTEM CODE')
    .rename_axis('DBN')
    # Rows without a 'Location 1' value are removed (the original note
    # describes these as schools not present in NYC).
    .dropna(subset=['Location 1'])
)

df.head()

Unnamed: 0_level_0,LOCATION_CODE,LOCATION_NAME,MANAGED_BY_NAME,PRIMARY_BUILDING_CODE,COMMUNITY_SCHOOL_SUP_NAME,Location 1
DBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
01M015,M015,P.S. 015 Roberto Clemente,DOE,M015,"PHILLIPS, DANIELLA","333 EAST 4 STREET\nMANHATTAN, NY 10009\n(40.72..."
01M019,M019,P.S. 019 Asher Levy,DOE,M019,"PHILLIPS, DANIELLA","185 1 AVENUE\nMANHATTAN, NY 10003\n(40.730009,..."
01M020,M020,P.S. 020 Anna Silver,DOE,M020,"PHILLIPS, DANIELLA","166 ESSEX STREET\nMANHATTAN, NY 10002\n(40.721..."
01M034,M034,P.S. 034 Franklin D. Roosevelt,DOE,M034,"PHILLIPS, DANIELLA","730 EAST 12 STREET\nMANHATTAN, NY 10009\n(40.7..."
01M063,M063,The STAR Academy - P.S.63,DOE,M063,"PHILLIPS, DANIELLA","121 EAST 3 STREET\nMANHATTAN, NY 10009\n(40.72..."


## Getting coordinates

In [22]:
def extract_coordinates(x):
    r"""Extract the trailing ``(a, b)`` pair of numbers from a location string.

    Parameters
    ----------
    x : str
        Text that ends with a parenthesised, comma-separated pair of numbers,
        e.g. ``"185 1 AVENUE\nMANHATTAN, NY 10003\n(40.730009, -73.984496)"``.
        Whitespace after the comma and after the closing parenthesis is
        tolerated.

    Returns
    -------
    tuple of float
        The two numbers, in the order they appear in the text.

    Raises
    ------
    IndexError
        If ``x`` does not end with a ``(a, b)`` pair.
    ValueError
        If either captured value cannot be converted to ``float``.
    """
    # Each group excludes parentheses, commas and newlines, so earlier
    # parenthesised text on the same line cannot be swallowed into the pair.
    match = re.search(r'\(([^(),\n]+),\s*([^(),\n]+)\)\s*$', x)
    if match is None:
        # IndexError is kept (rather than a new exception type) because the
        # previous ``re.findall(...)[0]`` raised it for the same condition.
        raise IndexError('no trailing "(a, b)" coordinate pair found in %r' % (x,))
    return tuple(float(value) for value in match.groups())

# Parse every 'Location 1' string into a numeric pair, one row per school,
# reusing the index of `df`.
coordinates = pd.DataFrame(
    list(map(extract_coordinates, df['Location 1'])),
    index=df.index,
    columns=['Latitude', 'Longitude'],
)
# Manual override: the source entry for this school is bad.
coordinates.loc['84X497', ['Latitude', 'Longitude']] = [40.816698, -73.918099]  # bad entry
coordinates.head()

Unnamed: 0_level_0,Latitude,Longitude
DBN,Unnamed: 1_level_1,Unnamed: 2_level_1
01M015,40.722075,-73.978747
01M019,40.730009,-73.984496
01M020,40.721305,-73.986312
01M034,40.726008,-73.975058
01M063,40.72444,-73.986214


## Getting Boroughs

In [31]:
# Load the borough boundaries. GeoJSON is specified as UTF-8 (RFC 7946), so the
# encoding is given explicitly instead of relying on the platform default.
with open('../data/raw/boroughs.geojson', encoding='utf-8') as f:
    geojson = json.load(f)

In [32]:
# Build a shapely geometry once per GeoJSON feature and store it on the feature
# under the 'polygon' key, so later lookups can reuse it.
for feature in geojson['features']:
    feature.update(polygon=geo.shape(feature['geometry']))

In [33]:
def get_borough(school):
    """Return the name of the borough whose polygon contains the school.

    ``school`` must provide 'Longitude' and 'Latitude' entries; the point is
    built as (Longitude, Latitude). The features of the module-level
    ``geojson`` are checked in order and the ``BoroName`` property of the
    first one whose ``polygon`` contains the point is returned. ``None`` is
    returned when no polygon contains it.
    """
    location = geo.Point(school['Longitude'], school['Latitude'])
    matching_names = (
        feature['properties']['BoroName']
        for feature in geojson['features']
        if feature['polygon'].contains(location)
    )
    return next(matching_names, None)

# Look up the borough for every row of `coordinates`; `%time` reports how long
# the row-wise apply takes.
%time boroughs = coordinates.apply(get_borough, axis=1)

CPU times: user 18.3 s, sys: 9.94 ms, total: 18.3 s
Wall time: 18.3 s


In [35]:
# Distinct borough labels that were assigned to the schools.
boroughs.unique()

array(['Manhattan', 'Bronx', 'Brooklyn', 'Queens', 'Staten Island'],
      dtype=object)

In [34]:
# Number of schools assigned to each borough.
boroughs.value_counts()

Brooklyn         566
Bronx            434
Queens           374
Manhattan        369
Staten Island     79
dtype: int64