In [None]:
import numpy as np
import pandas as pd

# Lift pandas' display truncation for both columns and rows.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Read the raw CSV (relative path) into the working frame.
df = pd.read_csv('../data/raw/2016 School Explorer.csv')

In [None]:
# Preview the first rows to see the columns and how the raw values are formatted.
df.head()

In [None]:
# (rows, columns) of the frame as loaded.
df.shape

In [None]:
# Missing-value count per column, laid out as one wide row.
df.isna().sum().to_frame().T

## Cleaning data

In [None]:
# Keep only the rows in which all three flag columns are empty, then drop
# those columns (after the filter they contain nothing but nulls).

flag_columns = ['Adjusted Grade', 'New?', 'Other Location Code in LCGMS']
all_flags_empty = df[flag_columns].isnull().all(axis=1)
df = df[all_flags_empty].drop(columns=flag_columns)

# Drop rows with a missing value in any column other than
# 'School Income Estimate'. The income column is set aside, the remaining
# columns are filtered, and the income is re-attached by index alignment,
# so it becomes the last column and may still contain NaN.

sie = df['School Income Estimate']
df = df.drop(columns=['School Income Estimate']).dropna()
df['School Income Estimate'] = sie

In [None]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

In [None]:
# Missing-value count per column, laid out as one wide row.
df.isna().sum().to_frame().T

## Formatting columns

Let's convert the numerical values that are stored as text (percentages, ratings, grades, dollar amounts) to actual numbers, and so on.

In [None]:
# 'Yes' becomes True; every other value becomes False.
df['Community School?'] = df['Community School?'] == 'Yes'


def p_to_f(p):
    """Convert a percentage to a float, e.g. ``'45%'`` -> ``45.0``.

    Parameters
    ----------
    p : str or number
        A percentage string with an optional trailing ``'%'`` and optional
        surrounding whitespace, or a value that is already numeric.

    Returns
    -------
    float
        The numeric value, still expressed in percent (not divided by 100).

    Raises
    ------
    ValueError
        If a string does not contain a parsable number.
    TypeError
        If ``p`` is neither a string nor convertible by ``float`` (e.g. None).
    """
    if isinstance(p, str):
        text = p.strip()
        # Strip the sign only when it is really there; slicing off the last
        # character unconditionally would turn '45' into 4.0.
        if text.endswith('%'):
            text = text[:-1]
        return float(text)
    # Already numeric (for instance when the cell is re-run on converted
    # data): pass the value through unchanged.
    return float(p)

def r_to_v(r):
    """Map a rating label to its ordinal score (1 = lowest, 4 = highest).

    Raises KeyError when ``r`` is not one of the four known labels.
    """
    # Labels listed from worst to best; the 1-based position is the score.
    ordered_labels = (
        'Not Meeting Target',
        'Approaching Target',
        'Meeting Target',
        'Exceeding Target',
    )
    score_by_label = {
        label: score for score, label in enumerate(ordered_labels, start=1)
    }
    return score_by_label[r]

def g_to_v(g):
    """Convert a grade code to an integer.

    The codes 'PK' and '0K' (presumably pre-kindergarten and kindergarten;
    confirm against the dataset's documentation) map to -1 and 0. Any other
    value is passed to ``int``.
    """
    special_grades = {'PK': -1, '0K': 0}
    if g in special_grades:
        return special_grades[g]
    return int(g)

def d_to_f(d):
    """Convert a dollar amount such as ``'$1,234.50'`` to a float.

    Parameters
    ----------
    d : str, number or missing value
        A string that may contain ``'$'`` and thousands separators, a value
        that is already numeric, or a missing value (NaN / None).

    Returns
    -------
    float
        The parsed amount, or NaN for missing values and blank strings.

    Raises
    ------
    ValueError
        If a non-blank string does not contain a parsable number.
    """
    if isinstance(d, str):
        cleaned = d.replace('$', '').replace(',', '').strip()
        # A blank string carries no amount; treat it as missing.
        return float(cleaned) if cleaned else np.nan
    # Non-string input: keep numbers as they are, so that re-running the
    # conversion on an already numeric column does not wipe it out to NaN.
    # NaN stays NaN; anything float() rejects (e.g. None) counts as missing.
    try:
        return float(d)
    except (TypeError, ValueError):
        return np.nan

# Columns grouped by the converter they need; within each group the order
# matches the order in which the columns are converted.
percent_columns = [
    'Percent ELL',
    'Percent Asian',
    'Percent Black',
    'Percent Hispanic',
    'Percent Black / Hispanic',
    'Percent White',
    'Student Attendance Rate',
    'Percent of Students Chronically Absent',
    'Rigorous Instruction %',
    'Collaborative Teachers %',
    'Supportive Environment %',
    'Effective School Leadership %',
    'Strong Family-Community Ties %',
    'Trust %',
]
rating_columns = [
    'Rigorous Instruction Rating',
    'Collaborative Teachers Rating',
    'Supportive Environment Rating',
    'Effective School Leadership Rating',
    'Strong Family-Community Ties Rating',
    'Trust Rating',
    'Student Achievement Rating',
]
grade_columns = ['Grade Low', 'Grade High']

# Percentage strings -> floats.
for column in percent_columns:
    df[column] = df[column].apply(p_to_f)

# Rating labels -> ordinal values.
for column in rating_columns:
    df[column] = df[column].apply(r_to_v)

# Dollar strings -> floats.
df['School Income Estimate'] = df['School Income Estimate'].apply(d_to_f)

# Grade codes -> integers.
for column in grade_columns:
    df[column] = df[column].apply(g_to_v)

In [None]:
# Print the full per-column summary (verbose=True keeps every column listed).
df.info(verbose=True)

In [None]:
# Preview of the boolean columns.
df.select_dtypes(include=['bool']).head()

In [None]:
# Preview of the float64 columns.
df.select_dtypes(include=['float64']).head()

In [None]:
# Preview of the int64 columns.
df.select_dtypes(include=['int64']).head()

In [None]:
# Preview of the columns still stored with the object dtype.
df.select_dtypes(include=['O']).head()