## Clean Up : School Demographic Info (schools.csv)


In [1]:
%matplotlib inline
# Numerical / scientific libraries.
import numpy as np
import scipy as sp
# Plotting libraries: matplotlib core, its colormap module and the pyplot API.
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
# pandas display options: allow wide output (500 characters), show up to
# 100 columns, and render frames as HTML tables in the notebook.
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
# seaborn look and feel: "whitegrid" style with the "poster" scaling context.
sns.set_style("whitegrid")
sns.set_context("poster")

In [3]:
# Load schools.csv (raw data) into sdf_v0, then show its (rows, columns) shape.
# sdf_v0 is kept as the untouched raw frame; later cells work on copies of it.
# NOTE(review): the dtype listing printed further down appears to show the
# first header ("School Name") carrying a leading byte-order mark, and it is
# the only column left without a snake_case name -- TODO confirm whether the
# file starts with a BOM that should be stripped when reading.
sdf_v0  = pd.read_csv("tempdata/schools.csv")
sdf_v0.shape


In [4]:
# Load the column instruction table for schools.csv. The next cell reads its
# 'Raw Column Name' and 'New Column Name' columns; a new name of "drop" marks
# the raw column for removal.
sc_drdf = pd.read_csv("tempdata/schools_rc.csv") # schools.csv column [d]rop [r]ename
# Work on an independent deep copy so the raw frame sdf_v0 stays unmodified.
sdf_v1  = sdf_v0.copy(deep=True)


In [5]:
# Apply the drop / rename instructions from sc_drdf to the raw schools frame.
#
# The instructions are collected first and applied with a single drop() and a
# single rename(), instead of mutating the full frame in place once per
# instruction row. Building sdf_v1 from the untouched raw frame (sdf_v0) also
# makes this cell safe to re-run: a second run no longer fails on columns that
# the first run already dropped.
#
# 'Raw Column Name' values are interpreted as headers of the raw schools.csv.
# A 'New Column Name' of "drop" removes the column; any other value renames it.
columns_to_drop = []
rename_map = {}
for raw_name, new_name in zip(sc_drdf['Raw Column Name'], sc_drdf['New Column Name']):
    raw_name = str(raw_name)
    new_name = str(new_name)
    if new_name == "drop":
        columns_to_drop.append(raw_name)
    else:
        # Keep the first instruction if a raw name is listed more than once,
        # matching what a row-by-row rename would have done.
        rename_map.setdefault(raw_name, new_name)

# drop() raises if a listed column is missing; rename() silently ignores
# raw names that are not present in the frame.
sdf_v1 = sdf_v0.drop(columns_to_drop, axis=1).rename(columns=rename_map)

sdf_v1.shape

(99074, 333)

##### School Demographics : Column Notations 

| Notation       | Description                              |
| :------------- | ---------------------------------------- |
| pk             | School Year : Prekindergarten            |
| k              | School Year : Kindergarten               |
| g (g1,g2… g12) | School Year : Grade (Grade1, Grade2… Grade12) |
| ug             | School Year : Ungraded                   |
| an             | Race : American Indian/Alaska Native     |
| a              | Race : Asian/Pacific Islander            |
| h              | Race : Hispanic                          |
| b              | Race : Black                             |
| w              | Race : White                             |
| hp             | Race : Hawaiian Native/Pacific Islander  |
| mr             | Race : Two or more races                 |
| m              | Sex : Male                               |
| f              | Sex : Female                             |



In [63]:
#sdf_v1.to_csv("tempdata/schools_v1.csv")
#sdf_v1 = pd.read_csv("tempdata/schools_v1.csv")

In [6]:
# Strip the quoting artefacts from every object-dtype (text) column:
#   ="abc"  ->  abc
#    "abc"  ->  abc
# Every '="' pair is removed first, then any remaining '"' character.
# NOTE(review): the .str accessor yields NaN for non-string values, so this
# assumes object columns hold only strings / NaN -- TODO confirm.
for column_name in sdf_v1.columns:
    column = sdf_v1[column_name]
    if column.dtype != object:
        continue
    sdf_v1[column_name] = column.str.replace('="', '').str.replace('"', '')
   

In [8]:
# Turn the source's placeholder symbols (missing / not applicable / bad
# quality) into NaN across the whole frame.
# Ref : http://pandas.pydata.org/pandas-docs/version/0.15.2/missing_data.html

missing_markers = (
    '\xe2\x80\x93',  # UTF-8 bytes of the en dash      : Missing Data
    '\xe2\x80\xa0',  # UTF-8 bytes of the dagger       : Not Applicable
    '\xe2\x80\xa1',  # UTF-8 bytes of the double dagger: Bad Quality
)
for marker in missing_markers:
    sdf_v1 = sdf_v1.replace(marker, np.nan)


In [9]:
# Recode yes/no style labels as booleans.
# NOTE: each replace() below is applied to the whole frame, so any cell in any
# column that exactly equals one of these labels is recoded.

# Generic "1-Yes" / "2-No" flags.
tfmap = dict([('1-Yes', True), ('2-No', False)])

# virtual_school labels.
vtfmap = dict([('A virtual school', True), ('Not a virtual school', False)])

# Lunch-program labels: every "Yes ..." variant maps to True, 'No' to False.
lunch_yes_labels = [
    'Yes under Community Eligibility Option (CEO)',
    'Yes participating without using any Provision or the CEO',
    'Yes under Provision 3',
    'Yes under Provision 2',
    'Yes under Provision 1',
]
ltfmap = dict((label, True) for label in lunch_yes_labels)
ltfmap['No'] = False

# Apply the three maps one after another, in the same order as before.
for label_map in (tfmap, vtfmap, ltfmap):
    sdf_v1 = sdf_v1.replace(label_map)


In [11]:
# Persist the cleaned frame and read it straight back into sdf_v2.
# Presumably the round trip is there so read_csv re-infers column dtypes from
# the cleaned values -- TODO confirm.
# NOTE(review): to_csv() writes the row index by default, and the dtype listing
# below shows it coming back as an extra "Unnamed: 0" column. Consider
# index=False here (or index_col=0 on the read) once it is confirmed that no
# later cell relies on that column.
sdf_v1.to_csv("tempdata/schools_v2.csv")
sdf_v2 = pd.read_csv("tempdata/schools_v2.csv")


In [12]:
# List every column of sdf_v2 as "<position>  :  <name>  :  <dtype>".
# The line is formatted into one string and passed to print(...) in
# parentheses, so the cell produces the same text under the Python 2 print
# statement and the Python 3 print function. The previous bare
# `print a, b, c` form is a SyntaxError on Python 3.
for position, column_name in enumerate(sdf_v2.columns):
    print("%s  :  %s  :  %s" % (position, column_name, sdf_v2[column_name].dtype))

0  :  Unnamed: 0  :  int64
1  :  ﻿School Name  :  object
2  :  state  :  object
3  :  state_abbr  :  object
4  :  school_id_nces  :  int64
5  :  agency_name  :  object
6  :  agency_id_nces  :  int64
7  :  county  :  object
8  :  data_available  :  object
9  :  data_na  :  object
10  :  city  :  object
11  :  zipcode  :  int64
12  :  school_type  :  object
13  :  agency_type  :  object
14  :  operational  :  object
15  :  charter_school  :  object
16  :  magnet_school  :  object
17  :  shared_time_school  :  object
18  :  location_type  :  object
19  :  title1_school_wide  :  object
20  :  title1_eligible  :  object
21  :  title1_status  :  object
22  :  latitude  :  float64
23  :  longitude  :  float64
24  :  school_id_state  :  object
25  :  agency_id_state  :  object
26  :  congressional_code  :  float64
27  :  virtual_school  :  object
28  :  lunch_program  :  object
29  :  offered_g_lowest  :  object
30  :  offered_g_highest  :  object
31  :  offered_pk  :  bool
32  :  offered_k  :