# (prototype) Visual Schools 2017

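This notebook transforms the `schools2017` DataFrame into a form that is easier to visualize: mean scale scores are standardized, proficiency-level percentages are rescaled to fractions, and numeric columns are formatted as display strings.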

In [None]:
# imports


import numpy as np
import pandas as pd

# show every column when a DataFrame is displayed (None removes the column limit)
pd.set_option('display.max_columns', None)

In [None]:
# functions


def standardize(values):
    """Convert values to z-scores, ignoring missing entries in the statistics.

    The mean and the population standard deviation (NumPy default, ddof=0)
    are computed over the non-NaN entries only; NaN entries stay NaN in the
    result.

    Parameters
    ----------
    values : pandas.Series, pandas.DataFrame, numpy.ndarray or sequence
        Numeric data. ``None`` entries are treated as NaN.

    Returns
    -------
    Same pandas type as the input (index preserved) when a Series or
    DataFrame is given, otherwise a float ``numpy.ndarray``.

    Raises
    ------
    AssertionError
        If there is no non-NaN value to compute the statistics from.

    Notes
    -----
    If all non-NaN values are identical the standard deviation is 0 and the
    division yields NaN; that case is not special-cased here.
    """
    # coerce to float so lists, tuples and object-dtype data (None -> NaN) work
    numeric = np.asarray(values, dtype=float)

    # extract non-na values
    observed = numeric[~np.isnan(numeric)]
    assert observed.size > 0, 'standardize() needs at least one non-NaN value'

    # compute statistics
    mean = observed.mean()
    std = observed.std()

    if isinstance(values, (pd.Series, pd.DataFrame)):
        # pandas arithmetic keeps the index (and name/columns) of the input
        return (values.astype(float) - mean) / std

    return (numeric - mean) / std

def f_0(f):
    """Format a number as a string with no decimal places."""
    return '{:.0f}'.format(f)


def f_2(f):
    """Format a number as a string with two decimal places."""
    return '{:.2f}'.format(f)


def f_pct(f):
    """Format a fraction as a whole-number percentage string (0.26 -> '26%')."""
    return '{:.0%}'.format(f)

In [None]:
# groups of columns


# mean scores that are replaced by their z-scores
standardize_columns = ['Mean Scale Score - ELA', 'Mean Scale Score - Math']

# proficiency-level shares that are divided by 100 before being formatted
scale_pct_columns = [
    '% Level 2 - ELA', '% Level 3 - ELA', '% Level 4 - ELA',
    '% Level 2 - Math', '% Level 3 - Math', '% Level 4 - Math',
]

# formatted with no decimal places
f0_columns = ['# Students in HS Admissions']

# formatted with two decimal places: coordinates plus the standardized scores
f2_columns = ['Latitude', 'Longitude'] + standardize_columns

# formatted as whole-number percentages: demographics plus the rescaled shares
pct_columns = [
    'Percent Asian',
    'Percent Black',
    'Percent Hispanic',
    'Percent White',
    'Percent Other',
    'Percent English Language Learners',
    'Percent Students with Disabilities',
    'Percent of Students Chronically Absent',
    'Economic Need Index',
] + scale_pct_columns

In [None]:
# core


# NOTE: unpickling can execute arbitrary code; only load pickles produced by
# this project's own processing step.
df = pd.read_pickle('../data/process/schools2017.pkl')

# unique operations
# NOTE(review): a missing (NaN) flag is truthy and would be shown as 'Yes' --
# confirm that 'Charter School?' has no missing values.
df['Charter School?'] = df['Charter School?'].apply(lambda x: 'Yes' if x else 'No')
# capitalize() lower-cases everything after the first letter, so the
# two-word borough needs an explicit fix-up
df['Borough'] = df['Borough'].str.capitalize().replace('Staten_island', 'Staten Island')

# keep the columns up to and including '# Students in HS Admissions';
# .copy() makes the result an independent frame so the column assignments
# below do not raise SettingWithCopyWarning
df = df.loc[:, :'# Students in HS Admissions'].copy()

# operations on multiple columns
# these shares are divided by 100 so they can be formatted as percentages below
for c in scale_pct_columns:
    df[c] = df[c] / 100.0

for c in standardize_columns:
    df[c] = standardize(df[c])

# turn numbers into display strings; na_action='ignore' leaves missing values
# as NaN instead of producing the literal strings 'nan' / 'nan%'
# NOTE(review): assumes the 'Percent ...' and 'Economic Need Index' columns
# are already stored as fractions -- TODO confirm.
formatters = [
    (f0_columns, f_0),
    (f2_columns, f_2),
    (pct_columns, f_pct),
]
for columns, formatter in formatters:
    for c in columns:
        df[c] = df[c].map(formatter, na_action='ignore')

In [None]:
# preview the first rows of the transformed frame
df.head()