Link: https://www.kaggle.com/passnyc/data-science-for-good

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Load the 2016 School Explorer dataset. The path is relative, so the CSV must be
# in the notebook's working directory.
df = pd.read_csv('2016 School Explorer.csv')

In [3]:
# Preview the first rows of the raw data before any cleaning.
df.head()

Unnamed: 0,Adjusted Grade,New?,Other Location Code in LCGMS,School Name,SED Code,Location Code,District,Latitude,Longitude,Address (Full),...,Grade 8 Math - All Students Tested,Grade 8 Math 4s - All Students,Grade 8 Math 4s - American Indian or Alaska Native,Grade 8 Math 4s - Black or African American,Grade 8 Math 4s - Hispanic or Latino,Grade 8 Math 4s - Asian or Pacific Islander,Grade 8 Math 4s - White,Grade 8 Math 4s - Multiracial,Grade 8 Math 4s - Limited English Proficient,Grade 8 Math 4s - Economically Disadvantaged
0,,,,P.S. 015 ROBERTO CLEMENTE,310100010015,01M015,1,40.721834,-73.978766,"333 E 4TH ST NEW YORK, NY 10009",...,0,0,0,0,0,0,0,0,0,0
1,,,,P.S. 019 ASHER LEVY,310100010019,01M019,1,40.729892,-73.984231,"185 1ST AVE NEW YORK, NY 10003",...,0,0,0,0,0,0,0,0,0,0
2,,,,P.S. 020 ANNA SILVER,310100010020,01M020,1,40.721274,-73.986315,"166 ESSEX ST NEW YORK, NY 10002",...,0,0,0,0,0,0,0,0,0,0
3,,,,P.S. 034 FRANKLIN D. ROOSEVELT,310100010034,01M034,1,40.726147,-73.975043,"730 E 12TH ST NEW YORK, NY 10009",...,48,1,0,0,0,0,0,0,0,0
4,,,,THE STAR ACADEMY - P.S.63,310100010063,01M063,1,40.724404,-73.98636,"121 E 3RD ST NEW YORK, NY 10009",...,0,0,0,0,0,0,0,0,0,0


# Data Cleaning

In [4]:
sns.set(style="darkgrid")

# Sort values by District
df = df.sort_values(by='District')

# Clean "School Income Estimate" by removing dollar signs and thousands separators.
# regex=False forces a literal match: '$' is a regex end-of-string anchor, and the
# default value of `regex` differs between pandas versions.
df['School Income Estimate'] = (
    df['School Income Estimate']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
)

# Remove every row that has a NaN in the income estimate, the Economic Need Index,
# or any of the seven rating columns used below.
required_columns = [
    'School Income Estimate',
    'Economic Need Index',
    'Rigorous Instruction Rating',
    'Collaborative Teachers Rating',
    'Supportive Environment Rating',
    'Effective School Leadership Rating',
    'Strong Family-Community Ties Rating',
    'Trust Rating',
    'Student Achievement Rating',
]
df = df.dropna(subset=required_columns)

# Change all "School Income Estimate" values to float
df['School Income Estimate'] = pd.to_numeric(df['School Income Estimate'])

# Change all Percents to Decimals and Floats

def p2f(x):
    """Convert a percent string such as '45%' to its fraction (0.45).

    Parameters
    ----------
    x : str or missing value
        Text containing a number with an optional '%' sign. Whitespace
        around the value is ignored.

    Returns
    -------
    float
        The percentage divided by 100, or NaN when `x` is missing (None/NaN).

    Raises
    ------
    AttributeError
        If `x` is a non-missing, non-string value. This is kept deliberately so
        that applying the conversion twice to an already-converted numeric
        column fails loudly instead of silently dividing by 100 again.
    ValueError
        If the text is not a valid number.
    """
    # Missing cells arrive as float NaN (or None), which have no .strip().
    if pd.isna(x):
        return float('nan')
    return float(x.strip().strip('%')) / 100
# Columns stored as percent strings; each is converted to a float fraction with p2f.
percent_columns = [
    'Percent ELL',
    'Percent Asian',
    'Percent Black',
    'Percent Hispanic',
    'Percent Black / Hispanic',
    'Percent White',
    'Student Attendance Rate',
    'Percent of Students Chronically Absent',
    'Rigorous Instruction %',
    'Collaborative Teachers %',
    'Supportive Environment %',
    'Effective School Leadership %',
    'Strong Family-Community Ties %',
    'Trust %',
]
for column in percent_columns:
    df[column] = df[column].apply(p2f)

# Change the grading scale from not meeting, approaching, meeting and exceeding
# target to 0, 1, 2, 3 respectively.
mapping = {'Not Meeting Target': 0, 'Approaching Target': 1, 'Meeting Target': 2, 'Exceeding Target': 3}
rating_columns = [
    'Rigorous Instruction Rating',
    'Collaborative Teachers Rating',
    'Supportive Environment Rating',
    'Effective School Leadership Rating',
    'Strong Family-Community Ties Rating',
    'Trust Rating',
    'Student Achievement Rating',
]
# Per-column replacement: only the listed rating columns are touched.
df = df.replace({column: mapping for column in rating_columns})

In [5]:
# Preview the cleaned frame. Rows are now ordered by District, so the original
# row labels shown on the left are no longer consecutive.
df.head()

Unnamed: 0,Adjusted Grade,New?,Other Location Code in LCGMS,School Name,SED Code,Location Code,District,Latitude,Longitude,Address (Full),...,Grade 8 Math - All Students Tested,Grade 8 Math 4s - All Students,Grade 8 Math 4s - American Indian or Alaska Native,Grade 8 Math 4s - Black or African American,Grade 8 Math 4s - Hispanic or Latino,Grade 8 Math 4s - Asian or Pacific Islander,Grade 8 Math 4s - White,Grade 8 Math 4s - Multiracial,Grade 8 Math 4s - Limited English Proficient,Grade 8 Math 4s - Economically Disadvantaged
0,,,,P.S. 015 ROBERTO CLEMENTE,310100010015,01M015,1,40.721834,-73.978766,"333 E 4TH ST NEW YORK, NY 10009",...,0,0,0,0,0,0,0,0,0,0
1188,,,,GIRLS PREPARATORY CHARTER SCHOOL OF NEW YORK,310100860866,84M330,1,40.71987,-73.977376,"442 E HOUSTON ST-RM 312 NEW YORK, NY 10002",...,44,3,0,1,0,0,0,0,0,2
1186,,,,MANHATTAN CHARTER SCHOOL,310100860873,84M320,1,40.71823,-73.984082,"100 ATTORNEY ST NEW YORK, NY 10002",...,0,0,0,0,0,0,0,0,0,0
21,,,,"NEW EXPLORATIONS INTO SCIENCE, TECHNOLOGY AND ...",310100011539,01M539,1,40.7195,-73.979239,"111 COLUMBIA ST NEW YORK, NY 10002",...,0,0,0,0,0,0,0,0,0,0
17,,,,NEIGHBORHOOD SCHOOL,310100010363,01M363,1,40.724404,-73.98636,"121 E 3RD ST NEW YORK, NY 10009",...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Show the last rows, i.e. the end of the District-sorted frame.
df.tail()

Unnamed: 0,Adjusted Grade,New?,Other Location Code in LCGMS,School Name,SED Code,Location Code,District,Latitude,Longitude,Address (Full),...,Grade 8 Math - All Students Tested,Grade 8 Math 4s - All Students,Grade 8 Math 4s - American Indian or Alaska Native,Grade 8 Math 4s - Black or African American,Grade 8 Math 4s - Hispanic or Latino,Grade 8 Math 4s - Asian or Pacific Islander,Grade 8 Math 4s - White,Grade 8 Math 4s - Multiracial,Grade 8 Math 4s - Limited English Proficient,Grade 8 Math 4s - Economically Disadvantaged
1171,,,,BUSHWICK ASCEND CHARTER SCHOOL,333200860987,84K793,32,40.693686,-73.90903,"751 KNICKERBOCKER AVE BROOKLYN, NY 11221",...,0,0,0,0,0,0,0,0,0,0
1092,,,,P.S. 086 THE IRVINGTON,333200010086,32K086,32,40.700748,-73.91766,"220 IRVING AVE BROOKLYN, NY 11237",...,0,0,0,0,0,0,0,0,0,0
1091,,,,P.S. 075 MAYDA CORTIELLA,333200010075,32K075,32,40.693254,-73.921321,"95 GROVE ST BROOKLYN, NY 11221",...,0,0,0,0,0,0,0,0,0,0
1099,,,,P.S. 274 KOSCIUSKO,333200010274,32K274,32,40.694186,-73.928157,"800 BUSHWICK AVE BROOKLYN, NY 11221",...,0,0,0,0,0,0,0,0,0,0
1096,,,,P.S. 145 ANDREW JACKSON,333200010145,32K145,32,40.701446,-73.932301,"100 NOLL ST BROOKLYN, NY 11206",...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Summarise the remaining row count, column count and column dtypes after cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 795 entries, 0 to 1096
Columns: 161 entries, Adjusted Grade to Grade 8 Math 4s - Economically Disadvantaged
dtypes: float64(20), int64(130), object(11)
memory usage: 1006.2+ KB


In [8]:
# Drop the first three columns by name, then preview the narrower frame.
leading_columns = ['Adjusted Grade', 'New?', 'Other Location Code in LCGMS']
df = df.drop(columns=leading_columns)
df.head()

Unnamed: 0,School Name,SED Code,Location Code,District,Latitude,Longitude,Address (Full),City,Zip,Grades,...,Grade 8 Math - All Students Tested,Grade 8 Math 4s - All Students,Grade 8 Math 4s - American Indian or Alaska Native,Grade 8 Math 4s - Black or African American,Grade 8 Math 4s - Hispanic or Latino,Grade 8 Math 4s - Asian or Pacific Islander,Grade 8 Math 4s - White,Grade 8 Math 4s - Multiracial,Grade 8 Math 4s - Limited English Proficient,Grade 8 Math 4s - Economically Disadvantaged
0,P.S. 015 ROBERTO CLEMENTE,310100010015,01M015,1,40.721834,-73.978766,"333 E 4TH ST NEW YORK, NY 10009",NEW YORK,10009,"PK,0K,01,02,03,04,05",...,0,0,0,0,0,0,0,0,0,0
1188,GIRLS PREPARATORY CHARTER SCHOOL OF NEW YORK,310100860866,84M330,1,40.71987,-73.977376,"442 E HOUSTON ST-RM 312 NEW YORK, NY 10002",NEW YORK,10002,"0K,01,02,03,04,05",...,44,3,0,1,0,0,0,0,0,2
1186,MANHATTAN CHARTER SCHOOL,310100860873,84M320,1,40.71823,-73.984082,"100 ATTORNEY ST NEW YORK, NY 10002",NEW YORK,10002,"0K,01,02,03,04,05",...,0,0,0,0,0,0,0,0,0,0
21,"NEW EXPLORATIONS INTO SCIENCE, TECHNOLOGY AND ...",310100011539,01M539,1,40.7195,-73.979239,"111 COLUMBIA ST NEW YORK, NY 10002",NEW YORK,10002,"0K,01,02,03,04,05,06,07,08,09,10,11,12",...,0,0,0,0,0,0,0,0,0,0
17,NEIGHBORHOOD SCHOOL,310100010363,01M363,1,40.724404,-73.98636,"121 E 3RD ST NEW YORK, NY 10009",NEW YORK,10009,"PK,0K,01,02,03,04,05",...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Reindex
# After sorting and row filtering the index labels are non-sequential, so rebuild
# a 0..n-1 index. reset_index() without drop=True keeps the old labels as a new
# 'index' column, which the following cell removes.
df = df.reset_index()

In [10]:
# Discard the 'index' column of old row labels left by reset_index(), then preview.
df = df.drop('index', axis=1)
df.head()

Unnamed: 0,School Name,SED Code,Location Code,District,Latitude,Longitude,Address (Full),City,Zip,Grades,...,Grade 8 Math - All Students Tested,Grade 8 Math 4s - All Students,Grade 8 Math 4s - American Indian or Alaska Native,Grade 8 Math 4s - Black or African American,Grade 8 Math 4s - Hispanic or Latino,Grade 8 Math 4s - Asian or Pacific Islander,Grade 8 Math 4s - White,Grade 8 Math 4s - Multiracial,Grade 8 Math 4s - Limited English Proficient,Grade 8 Math 4s - Economically Disadvantaged
0,P.S. 015 ROBERTO CLEMENTE,310100010015,01M015,1,40.721834,-73.978766,"333 E 4TH ST NEW YORK, NY 10009",NEW YORK,10009,"PK,0K,01,02,03,04,05",...,0,0,0,0,0,0,0,0,0,0
1,GIRLS PREPARATORY CHARTER SCHOOL OF NEW YORK,310100860866,84M330,1,40.71987,-73.977376,"442 E HOUSTON ST-RM 312 NEW YORK, NY 10002",NEW YORK,10002,"0K,01,02,03,04,05",...,44,3,0,1,0,0,0,0,0,2
2,MANHATTAN CHARTER SCHOOL,310100860873,84M320,1,40.71823,-73.984082,"100 ATTORNEY ST NEW YORK, NY 10002",NEW YORK,10002,"0K,01,02,03,04,05",...,0,0,0,0,0,0,0,0,0,0
3,"NEW EXPLORATIONS INTO SCIENCE, TECHNOLOGY AND ...",310100011539,01M539,1,40.7195,-73.979239,"111 COLUMBIA ST NEW YORK, NY 10002",NEW YORK,10002,"0K,01,02,03,04,05,06,07,08,09,10,11,12",...,0,0,0,0,0,0,0,0,0,0
4,NEIGHBORHOOD SCHOOL,310100010363,01M363,1,40.724404,-73.98636,"121 E 3RD ST NEW YORK, NY 10009",NEW YORK,10009,"PK,0K,01,02,03,04,05",...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Remove all columns 