In [1]:
import pandas as pd
import numpy as np
import time
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut

## School Progress Report (SPR) Data

In [2]:
# Load the year-over-year SPR workbook. The first nine rows of the sheet are
# skipped and the next row is used as the header.
spr_yoy = pd.read_excel(
    'data/SPR_SY1617_Data/SPR_YearoverYear_20180129.xlsx',
    sheet_name="Sheet1",
    skiprows=9,
    header=0,
)

# School ID is the merge key further down, so store it as text.
spr_yoy['School ID'] = spr_yoy['School ID'].astype(str)

# Drop schools whose 2017-18 learning network is "Closed". The explicit
# .copy() detaches the filtered frame from the one it was sliced from, so
# adding columns to it later does not raise pandas' SettingWithCopyWarning.
spr_yoy = spr_yoy[spr_yoy["2017-18 Learning Network"] != "Closed"].copy()

print(spr_yoy.shape)
spr_yoy.head(3)

(321, 31)


Unnamed: 0.1,Unnamed: 0,School ID,School Name,2017-18 Learning Network,Report Type (School Level),YOY Change (2015-16 to 2016-17),2016-2017 Achievement Score,2016-2017 Progress Score,2016-2017 Climate Score,2016-2017 College&Career Score,...,2013-2014 Achievement Score,2013-2014 Progress Score,2013-2014 Climate Score,2013-2014 College&Career Score,2013-2014 Overall Score,2012-2013 Achievement Score,2012-2013 Progress Score,2012-2013 Climate Score,2012-2013 College&Career Score,2012-2013 Overall Score
0,101HS,101,John Bartram High School,Neighborhood Network 1,HS,5.0,0.0,28.0,11.0,5.0,...,4.0,21.0,13.0,30.0,15,6,26,28,30,21
1,102HS,102,West Philadelphia High School,Turnaround Network,HS,8.0,0.0,44.0,14.0,12.0,...,3.0,25.0,15.0,32.0,17,3,10,3,19,7
2,103HS,103,High School of the Future,Neighborhood Network 2,HS,-1.0,3.0,27.0,46.0,17.0,...,8.0,4.0,38.0,41.0,16,12,20,67,59,32


In [3]:
def latestScores(row, score_type,
                 years=('2016-2017', '2015-2016', '2014-2015', '2013-2014', '2012-2013')):
    """Return the most recent non-null score of one type for a single row.

    Columns are looked up as ``"<year> <score_type>"`` (for example
    ``"2016-2017 Achievement Score"``) and checked in the order given by
    ``years``; the first value that is not null is returned.

    Parameters
    ----------
    row : pandas.Series or mapping
        One record holding the per-year score columns.
    score_type : str
        Column suffix identifying the score, e.g. ``'Progress Score'``.
    years : iterable of str, optional
        Year labels ordered from most to least preferred. Defaults to the
        five school years 2016-2017 back to 2012-2013.

    Returns
    -------
    The first non-null score found, or ``numpy.nan`` when every candidate
    column is null.

    Raises
    ------
    KeyError
        If a column that needs to be inspected is absent from ``row``.
    """
    for year in years:
        score = row[str(year) + ' ' + str(score_type)]
        if not pd.isnull(score):
            return score

    # Every candidate year was null.
    return np.nan

# Build one "*_Score_Final" column per score type by applying latestScores
# to every row of spr_yoy.
final_score_columns = {
    'Achievement_Score_Final': 'Achievement Score',
    'Progress_Score_Final': 'Progress Score',
    'Climate_Score_Final': 'Climate Score',
    'Overall_Score_Final': 'Overall Score',
}
for final_column, source_score_type in final_score_columns.items():
    spr_yoy[final_column] = spr_yoy.apply(latestScores, score_type=source_score_type, axis=1)

In [4]:
# Load the school metric scores from the sheet at position 1 of the workbook.
spr = pd.read_excel('data/SPR_SY1617_Data/SPR_SY1617_School_Metric_Scores_20180206.xlsx', sheet_name=1)

# Normalise the key column: call it 'School ID' and store it as text.
spr = spr.rename(columns={'SRC School ID': 'School ID'})
spr['School ID'] = spr['School ID'].astype(str)

# Combine the address parts into one "street, city, state zip" string, then
# discard the individual address and contact columns.
zip_code_text = spr['Zip Code'].astype(str)
spr['Full Address'] = spr['Street Address'] + ', ' + spr['City'] + ', ' + spr['State'] + ' ' + zip_code_text

address_and_contact_columns = ['Street Address', 'City', 'State', 'Zip Code', 'Phone Number', 'Fax Number']
spr = spr.drop(columns=address_and_contact_columns)
spr.head(3)

Unnamed: 0,School,School ID,Report,Rpt Type Long,Governance,Turnaround Model,Enrollment,Website,Grades Served,Admissions Type,...,Student Survey College & Career Score,Student Survey College & Career Pts Earn,Student Survey College & Career Pts Poss,Student Survey College & Career Pct Earn,Student Survey College & Career Tier,Teach Effect Distinguished Score,Teach Effect Instruction Score,Teacher Attendance Score,Student Survey Teaching Score,Full Address
0,John Bartram High School,101,HS,High School,District,,601,philasd.org/bartram/,'9-12',Neighborhood,...,19,0.19,1,19,INTERVENE,15,21,56,47,"2401 S 67th St, Philadelphia, PA 19142"
1,West Philadelphia High School,102,HS,High School,District,Turnaround,483,webgui.phila.k12.pa.us/schools/w/westphila,'9-12',Neighborhood,...,27,0.27,1,27,WATCH,11,29,56,42,"4901 Chestnut St, Philadelphia, PA 19139"
2,High School of the Future,103,HS,High School,District,,476,webgui.phila.k12.pa.us/schools/h/high-school-o...,'9-12',Citywide,...,0,0.0,1,0,INTERVENE,16,13,50,0,"4021 Parkside Ave, Philadelphia, PA 19104"


# Get GPS Coordinates
For each school, geocode its full address to obtain latitude and longitude coordinates. These let us place the schools on a map.

In [5]:
# Nominatim requires callers to identify themselves; geopy >= 2.0 raises a
# ConfigurationError when no user_agent is supplied.
geolocator = Nominatim(user_agent="philly_school_data")
cell_start = time.time()

# Number of times a single address is sent to the geocoder before giving up.
MAX_GEOCODE_ATTEMPTS = 3

# Coordinates are collected in plain dicts keyed by School ID and converted
# to Series once at the end, instead of re-concatenating a Series per row.
latitudes = {}
longitudes = {}

for _, row in spr.iterrows():
    school_id = row['School ID']

    # A school can appear on several rows; geocode each ID only once.
    if school_id in latitudes:
        continue

    addr = row['Full Address']

    # Reset for every school so that a lookup which times out on all attempts
    # cannot silently reuse the previous school's location.
    location = None
    for _attempt in range(MAX_GEOCODE_ATTEMPTS):
        try:
            location = geolocator.geocode(addr)
            break
        except GeocoderTimedOut:
            print("Geocoder Timed Out. Address =", addr)

    # geocode() returns None when the address cannot be resolved.
    if location is None:
        latitudes[school_id] = np.nan
        longitudes[school_id] = np.nan
    else:
        latitudes[school_id] = location.latitude
        longitudes[school_id] = location.longitude

# Series indexed by School ID, as the following cells expect.
lat = pd.Series(latitudes, dtype=float)
lon = pd.Series(longitudes, dtype=float)

print('Cell Runtime:', round(time.time() - cell_start), 'seconds')

Cell Runtime: 133 seconds


In [6]:
# Show how many schools were looked up and how many have no latitude.
missing_latitudes = sum(lat.isnull())
print(lat.shape, missing_latitudes)

# Name both Series; the names become the column headers when they are
# converted to frames for merging.
lat, lon = lat.rename("Latitude"), lon.rename("Longitude")

(294,) 44


In [8]:
# Columns carried forward into the combined school table.
selected_columns = [
    'School', 'School ID', 'Full Address', 'Rpt Type Long', 'Governance',
    'Achievement_Score_Final', 'Progress_Score_Final', 'Climate_Score_Final',
    'Overall_Score_Final', 'Latitude', 'Longitude',
]

# Keep only schools present in both SPR tables, then attach the coordinates
# by looking each School ID up in the index of lat / lon.
# NOTE(review): the join key is 'School ID' alone; if either table holds more
# than one row per school this produces every pairing of those rows - confirm
# that this is intended.
merged = spr.merge(spr_yoy, how='inner', on='School ID')
merged = merged.merge(lat.to_frame(), how='left', left_on='School ID', right_index=True)
merged = merged.merge(lon.to_frame(), how='left', left_on='School ID', right_index=True)

df = merged[selected_columns]
df.head()

Unnamed: 0,School,School ID,Full Address,Rpt Type Long,Governance,Achievement_Score_Final,Progress_Score_Final,Climate_Score_Final,Overall_Score_Final,Latitude,Longitude
0,John Bartram High School,101,"2401 S 67th St, Philadelphia, PA 19142",High School,District,0.0,28.0,11.0,14,39.92147,-75.234018
1,West Philadelphia High School,102,"4901 Chestnut St, Philadelphia, PA 19139",High School,District,0.0,44.0,14.0,22,39.958587,-75.219431
2,High School of the Future,103,"4021 Parkside Ave, Philadelphia, PA 19104",High School,District,3.0,27.0,46.0,23,39.974678,-75.204826
3,Paul Robeson High School for Human Services,105,"4125 Ludlow St, Philadelphia, PA 19104",High School,District,0.0,60.0,61.0,41,39.956797,-75.204833
4,William L. Sayre High School,110,"5800 Walnut St, Philadelphia, PA 19139",High School,District,0.0,13.0,12.0,8,39.958371,-75.237623


## Serious Incidents (aka Safety)

In [9]:
# Load one table of serious-incident counts per school year.
si_1314 = pd.read_csv('data/Serious_Incidents/Serious_Incidents_2013-14.TXT')
si_1415 = pd.read_excel('data/Serious_Incidents/Serious_Incidents_2014-15.xlsx', sheet_name=0)
# Rename the 2014-15 headers to the upper-case names used in the rest of
# this cell.
si_1415 = si_1415.rename(columns={'ULCS Code':'ULCS_NO',
                                  'School Year':'SCHOOL_YEAR',
                                  'Incident Type':'INCIDENT_TYPE',
                                  'Incident Count':'INCIDENT_COUNT',
                                  'School ID':'SCHOOL_ID'})

si_1516 = pd.read_excel('data/Serious_Incidents/Serious_Incidents_2015-16.xls', sheet_name=0)
si_1617 = pd.read_excel('data/Serious_Incidents/Serious_Incidents_2016-17.xlsx', sheet_name=0)

# Stack all years into one long table and drop the unused ULCS code.
si = pd.concat([si_1314, si_1415, si_1516, si_1617]).reset_index(drop=True)
si = si.drop(columns='ULCS_NO')

# Variant spellings of the same incident type, mapped to a single label.
# The mapping is applied in one pass and assigned back to the column:
# calling .replace(..., inplace=True) on si['INCIDENT_TYPE'] is chained
# assignment, which pandas warns about and which does not update `si` when
# Copy-on-Write is enabled.
INCIDENT_TYPE_ALIASES = {
    'SUICIDE - ATTEMPTS & THREATS': 'SUICIDE - ATTEMPTS AND THREATS',
    'DRUGS & ALCOHOL OFFENSE': 'DRUG AND ALCOHOL OFFENSES',
    'DRUG & ALCOHOL OFFENSES': 'DRUG AND ALCOHOL OFFENSES',
    'THREATS': 'THREAT',
    'THEFTS': 'THEFT',
    'BOMB SCARES': 'BOMB SCARE',
    'FIRES AND FALSE ALARMS': 'FIRE - INCENDIARY',
    'THREATS - VERBAL AND WRITTEN': 'THREAT',
    'WEAPONS OFFENSES': 'WEAPONS',
}
si['INCIDENT_TYPE'] = si['INCIDENT_TYPE'].str.upper().replace(INCIDENT_TYPE_ALIASES)

# convert School ID to string
si['SCHOOL_ID'] = si['SCHOOL_ID'].astype(str)

si.head()

Unnamed: 0,INCIDENT_COUNT,INCIDENT_TYPE,SCHOOL_ID,SCHOOL_YEAR
0,1,HARASSMENT,526,2013-2014
1,5,ASSAULTS,534,2013-2014
2,8,THREAT,535,2013-2014
3,0,TAKE OFF,537,2013-2014
4,0,DRUG AND ALCOHOL OFFENSES,545,2013-2014


In [10]:
# Total incident count per school and incident type, summed over every row
# of `si`: one row per school, one column per incident type. Combinations
# that never occur are filled with 0.
# aggfunc is given as the string 'sum'; passing np.sum is deprecated in
# recent pandas releases and emits a FutureWarning.
si2 = (
    pd.pivot_table(
        si,
        values='INCIDENT_COUNT',
        index='SCHOOL_ID',
        columns=['INCIDENT_TYPE'],
        aggfunc='sum',
    )
    .fillna(0)
    .reset_index()
)

# convert School ID to string
si2['SCHOOL_ID'] = si2['SCHOOL_ID'].astype(str)


pd.set_option('display.max_columns', 500)

# Attach the incident totals to the school table. This is a left join, so
# schools without any incident rows are kept with NaN in the incident columns.
df2 = pd.merge(df, si2, how='left', left_on='School ID', right_on='SCHOOL_ID')
df2 = df2.drop(columns='SCHOOL_ID')

In [11]:
from sklearn import preprocessing

# Rescale the per-school incident total to the range 0-100; the safety score
# is 100 minus that value, so the school with the most incidents scores 0
# and the school with the fewest scores 100.
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0,100))

# Incident types that count towards the safety total. skipna=False keeps the
# total NaN whenever any of the component columns is NaN.
safety_incident_types = ['ASSAULTS', 'BULLYING', 'HARASSMENT', 'BURGLARY', 'ROBBERY', 'THEFT']
df2['Safety_incidents'] = df2[safety_incident_types].sum(axis=1, skipna=False)

null_index = df2['Safety_incidents'].isnull()
has_counts = ~null_index

# Schools without incident data keep a NaN score. The scaler is only fitted
# when at least one school has data, because it raises on an empty input.
df2['SafetyScore'] = np.nan
if has_counts.any():
    incident_totals = np.array(df2.loc[has_counts, 'Safety_incidents']).reshape(-1, 1)
    scaled = min_max_scaler.fit_transform(incident_totals)
    # fit_transform returns an (n, 1) array; flatten it to match the column.
    df2.loc[has_counts, 'SafetyScore'] = 100 - scaled.ravel()

df2['SafetyScore'] = df2['SafetyScore'].round()


df2.head()

Unnamed: 0,School,School ID,Full Address,Rpt Type Long,Governance,Achievement_Score_Final,Progress_Score_Final,Climate_Score_Final,Overall_Score_Final,Latitude,Longitude,ASSAULTS,BOMB SCARE,BULLYING,BURGLARY,DISORDERLY CONDUCT,DRUG AND ALCOHOL OFFENSES,FIRE - INCENDIARY,GRAFFITI,HARASSMENT,MORALS OFFENSES,ROBBERY,SUICIDE,SUICIDE - ATTEMPTS AND THREATS,TAKE OFF,THEFT,THREAT,TRESPASSING,VANDALISM,WEAPONS,Safety_incidents,SafetyScore
0,John Bartram High School,101,"2401 S 67th St, Philadelphia, PA 19142",High School,District,0.0,28.0,11.0,14,39.92147,-75.234018,71.0,0.0,2.0,0.0,85.0,30.0,8.0,0.0,3.0,10.0,10.0,0.0,0.0,0.0,21.0,16.0,4.0,38.0,29.0,107.0,46.0
1,West Philadelphia High School,102,"4901 Chestnut St, Philadelphia, PA 19139",High School,District,0.0,44.0,14.0,22,39.958587,-75.219431,40.0,3.0,3.0,0.0,62.0,30.0,1.0,1.0,2.0,3.0,5.0,0.0,0.0,0.0,19.0,12.0,0.0,8.0,18.0,69.0,65.0
2,High School of the Future,103,"4021 Parkside Ave, Philadelphia, PA 19104",High School,District,3.0,27.0,46.0,23,39.974678,-75.204826,52.0,0.0,0.0,0.0,84.0,16.0,0.0,0.0,1.0,5.0,3.0,0.0,0.0,0.0,34.0,9.0,1.0,17.0,5.0,90.0,54.0
3,Paul Robeson High School for Human Services,105,"4125 Ludlow St, Philadelphia, PA 19104",High School,District,0.0,60.0,61.0,41,39.956797,-75.204833,1.0,0.0,0.0,0.0,10.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0,1.0,0.0,1.0,5.0,97.0
4,William L. Sayre High School,110,"5800 Walnut St, Philadelphia, PA 19139",High School,District,0.0,13.0,12.0,8,39.958371,-75.237623,70.0,0.0,0.0,4.0,100.0,34.0,4.0,0.0,0.0,5.0,7.0,1.0,0.0,0.0,12.0,14.0,17.0,8.0,17.0,93.0,53.0


# Export Data

In [12]:
# Write the combined school table to disk as pipe-delimited text without the
# row index (the 'Full Address' values themselves contain commas).
df2.to_csv(path_or_buf="philly_school_data_v2.csv", index=False, sep='|')

In [13]:
# print(min(lat), max(lat))
# print(min(lon), max(lon))