# (prototype) Extract data from Schools Directory

The data is from Nov 2018, just after the SHSAT. This is not a problem, since we are collecting structural data about the schools rather than student-related data.

In [31]:
import numpy as np
import pandas as pd

# Notebook display settings: show up to 100 rows and never truncate columns.
pd.options.display.max_rows = 100
pd.options.display.max_columns = None

In [54]:
# Load the raw middle school directory and keep it untouched under its own
# name, so re-running this cell always starts from the same input.
directory_raw = pd.read_csv('../data/raw/middle_school_directory.csv')
columns = ['schooldbn', 'communityschool', 'Borough', 'Latitude', 'Longitude']

# Only these columns are needed; work on an explicit copy of them.
df = directory_raw[columns].copy()

# convert communityschool to integer (binary): 1 when the text starts with 'Yes'
df['communityschool'] = df['communityschool'].str.startswith('Yes').astype(int)

# standardize boroughs
# The .str accessor keeps missing values as NaN (instead of turning them into
# ''), so a school that still lacks a borough remains detectable with isnull().
df['Borough'] = (
    df['Borough']
    .str.lower()
    .str.strip()
    .replace('staten is', 'staten_island')
)

# add missing info
# coordinates are not perfect, but they are good enough
missing_info = {
    '21K098': [ 'brooklyn', 40.583477, -73.953932],
    '05M046': ['manhattan', 40.831713, -73.936023],
    '10X308': [    'bronx', 40.885453, -73.878126],
}
for dbn, (borough, latitude, longitude) in missing_info.items():
    # select the school's row(s) once and overwrite all three fields
    is_school = df['schooldbn'] == dbn
    df.loc[is_school, 'Borough'] = borough
    df.loc[is_school, 'Latitude'] = latitude
    df.loc[is_school, 'Longitude'] = longitude

# use friendlier column names and index the table by DBN
df = df.rename(columns={
    'schooldbn': 'DBN',
    'communityschool': 'Community School?',
}).set_index('DBN')

df.head()

Unnamed: 0_level_0,Community School?,Borough,Latitude,Longitude
DBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
06M018,0,manhattan,40.871278,-73.911516
06M052,0,manhattan,40.866053,-73.924835
06M143,0,manhattan,40.848908,-73.930948
06M187,0,manhattan,40.856693,-73.937093
06M209,0,manhattan,40.820953,-73.951145


In [57]:
# sanity check: list the distinct values of the binary flag
df.loc[:, 'Community School?'].unique()

array([0, 1])

In [55]:
# sanity check: list the distinct borough labels after standardization
df['Borough'].unique()

array(['manhattan', 'bronx', 'brooklyn', 'queens', 'staten_island'],
      dtype=object)

In [56]:
# sanity check: count missing values per column
df.isna().sum()

Community School?    0
Borough              0
Latitude             0
Longitude            0
dtype: int64

All perfect!