# Checking coverage of web scraped data collection.

In this notebook, we check that our webscraper captured all available data. Specifically, we compare the list of DBNs by year from the Demographics and Accountability snapshot to the DBN/years in our webscraped datasets.

In [1]:
import pandas as pd

# Load the scraped expenditure tables, one CSV per school year (2006-2012).
# The first CSV column becomes the row index of each frame.
yearly_csv_paths = [
    'raw_school_expenditures_by_year/year_2006.csv',
    'raw_school_expenditures_by_year/year_2007.csv',
    'raw_school_expenditures_by_year/year_2008.csv',
    'raw_school_expenditures_by_year/year_2009.csv',
    'raw_school_expenditures_by_year/year_2010.csv',
    'raw_school_expenditures_by_year/year_2011.csv',
    'raw_school_expenditures_by_year/year_2012.csv',
]
(data_2006, data_2007, data_2008, data_2009,
 data_2010, data_2011, data_2012) = [pd.read_csv(csv_path, index_col=0)
                                     for csv_path in yearly_csv_paths]

In [2]:
# Map each school year to the scraped expenditure table loaded for that year.
datasets = dict(zip(
    range(2006, 2013),
    [data_2006, data_2007, data_2008, data_2009, data_2010, data_2011, data_2012]
))

In [3]:
def school_dbn_list():
    '''Download the Demographics and Accountability snapshot and return its DBN / school-year pairs.

    Returns a pandas DataFrame holding only the 'DBN' and 'schoolyear' columns of the
    downloaded CSV. The frame is an independent copy, so callers may assign to its
    columns without touching (or getting copy-vs-view warnings about) the full download.
    '''
    query = 'https://nycopendata.socrata.com/api/views/ihfw-zy9j/rows.csv?accessType=DOWNLOAD'
    Demo_and_Account = pd.read_csv(query)
    ## .copy() because the caller overwrites both columns in place; assigning into a bare
    ## column selection raises pandas' SettingWithCopyWarning.
    all_DBNs = Demo_and_Account[['DBN','schoolyear']].copy()
    return all_DBNs

In [4]:
# Fetch the DBN / school-year pairs from the Demographics and Accountability snapshot
# (this performs a network download).
all_DBNs = school_dbn_list()

In [5]:
# Normalise the snapshot's keys so they line up with the scraped data:
#   - 'schoolyear': drop the first four characters (presumably 20112012 -> '2012'; TODO confirm raw format)
#   - 'DBN': drop the first two characters (presumably a district prefix, 01M015 -> 'M015'; TODO confirm)
# NOTE: this overwrites the columns in place, so re-running the cell strips characters again.
trimmed_years = all_DBNs['schoolyear'].apply(lambda raw_year: str(raw_year)[4:])
trimmed_dbns = all_DBNs['DBN'].apply(lambda raw_dbn: str(raw_dbn)[2:])
all_DBNs['schoolyear'] = trimmed_years
all_DBNs['DBN'] = trimmed_dbns

# Group rows by (string) school year; grouped.groups maps each year to its row labels.
grouped = all_DBNs.groupby(by='schoolyear')

In [6]:
for year in range(2006,2013):
    print 'Year {} has {} missing school.'.format(year, (all_DBNs.loc[grouped.groups[str(year)],'DBN'].count()-datasets[year].shape[0]))

Year 2006 has 9 missing school.
Year 2007 has 20 missing school.
Year 2008 has 16 missing school.
Year 2009 has 4 missing school.
Year 2010 has 3 missing school.
Year 2011 has 1 missing school.
Year 2012 has 1 missing school.


In [7]:
missed_schools = {}
for year in range(2006,2013):
    DBN_targets = all_DBNs.loc[grouped.groups[str(year)],'DBN']
    missing = list(set(DBN_targets) - set(datasets[year].index))
    missed_schools[year] = missing
print missed_schools

{2006: ['X191', 'X512', 'X234', 'M099', 'M277', 'K564', 'K418', 'M551', 'K378'], 2007: ['M090', 'K480', 'X222', 'X512', 'M445', 'K470', 'X143', 'M277', 'M164', 'K659', 'K564', 'Q180', 'K440', 'K418', 'M275', 'K391', 'X435', 'M551', 'M535', 'X344'], 2008: ['Q420', 'X512', 'X158', 'K469', 'K640', 'X184', 'M277', 'K659', 'K564', 'X113', 'K435', 'K418', 'K033', 'K390', 'M551', 'K479'], 2009: ['M551', 'X512', 'K564', 'K418'], 2010: ['M551', 'K564', 'K418'], 2011: ['K564', 'K418'], 2012: ['K564', 'K418']}


In [8]:
from urllib2 import urlopen, HTTPError, URLError
from lxml.html import parse
import re
import pandas as pd
import numpy as np
from retrying import retry

## define helper function hexencode to handle special characters in school name
def hexencode(matchobj):
    '''Percent-encode the text matched by a regex, for use as the `repl` callback of re.sub.

    matchobj: a regex match object; only matchobj.group(0) is read.

    Returns a string with one '%xx' (lowercase hex) escape per byte of the match,
    e.g. '&' -> '%26'.
    '''
    matched = matchobj.group(0)
    ## Text (as opposed to byte) strings are converted to bytes first so that non-ASCII
    ## characters can be encoded too. NOTE(review): UTF-8 is an assumption -- confirm which
    ## encoding the expenditure-report server expects for non-ASCII school names.
    if not isinstance(matched, bytes):
        matched = matched.encode('utf-8')
    ## One escape per byte, so multi-byte input still yields valid percent-encoding.
    return ''.join('%{:02x}'.format(byte) for byte in bytearray(matched))

def retry_if_URL_error_not_HTTP_error(exception):
    '''Retry predicate: True only for a URLError that is not an HTTPError.

    A plain URLError means the request could not be completed (e.g. the network is down),
    so trying again may help; an HTTPError means the page doesn't exist, so it won't.
    Any other exception type gives False.
    '''
    if isinstance(exception, HTTPError):
        return False
    return isinstance(exception, URLError)
            
##Retry decorator will retry every two seconds, for up to 10 seconds, if server side error
 
@retry(retry_on_exception=retry_if_URL_error_not_HTTP_error, wait_fixed=2000, stop_max_delay=10000)
def get_school_name_from_year_and_DBN(year, DBN):

    ##build initial query using year and DBN
    years = str(year-1) + '_' + str(year)
    query1 = ("https://www.nycenet.edu/offices/d_chanc_oper/budget/exp01/y" + years +
             "/function.asp?district=All&search=" + DBN +
             "&searchgo=Search&LCMS=**&GRANT=NO&cr1=All&cr2=All&cr3=All&cr4=All&R=1&prior=search")
    
    ##get school name by searching html page returned from query- if not found, return None.
    try:
        parsed1 = parse(urlopen(query1))
        doc=parsed1.getroot()
        options = doc.findall('.//option')
        
        print_next=False
        for option in options:
            if 'No Schools Found' in option.text_content():
                print option.text_content()
                return None
                
            elif 'School List' in option.text_content():
                print_next=True ##Then the next option includes the school name
                continue
                
            if print_next==True: ##So print it
                print option.text_content()
                print_next=False
            
            ##Schools in districts 1-32
            if re.match('District', option.text_content()):
                school_name = option.text_content()
                school_name = re.sub('--','',school_name)
                school_name = re.sub('District:\s','', school_name)
                school_name = re.sub('\s','+', school_name)
                school_name = re.sub('[^A-Za-z0-9\s+.]', hexencode, school_name)
                school_name = str(DBN) + school_name
                return school_name
            
            ##Schools in district 75- citywide special education
            elif re.match('Citywide', option.text_content()):
                school_name = option.text_content()
                school_name = re.split('--',school_name)[1]
                school_name = re.sub('\s','+',school_name)
                school_name = re.sub('\.\+?','+',school_name)
                ##Note even though it's district 75, it's coded as 97 in the url
                school_name = str(DBN) + str(97) + str(school_name)
                school_name = re.sub('[^A-Za-z0-9\s+.]', hexencode, school_name)
                return school_name 
            
            ##Schools in district 79- alternative HS's
            elif re.match('Alternative HS', option.text_content()):
                school_name = option.text_content()
                school_name = re.split('--',school_name)[1]
                school_name = re.sub('\s+','+',school_name)
                school_name = str(DBN) + str(79) + str(school_name)
                school_name = re.sub('[^A-Za-z0-9\s+.]', hexencode, school_name)
                return school_name
            
    except HTTPError as e:
        print e
        return None
    
    except URLError as y:
        print y
        return None
    
    except URLError as y:
        print y
        return None

In [9]:
results = {}
for year in missed_schools:
    print '\n'
    print 'Year = ', year
    for school in missed_schools[year]:
        get_school_name_from_year_and_DBN(year, school)



Year =  2006
Search Results -- No Schools Found.
Search Results -- No Schools Found.
Search Results -- No Schools Found.
Search Results -- No Schools Found.
Search Results -- No Schools Found.
District: 32--BUSHWICK COMMUNITY HIGH SCHOOL                    
Search Results -- No Schools Found.
Search Results -- No Schools Found.
Search Results -- No Schools Found.


Year =  2007
Search Results -- No Schools Found.
Search Results -- No Schools Found.
Search Results -- No Schools Found.
Search Results -- No Schools Found.
Search Results -- No Schools Found.
Search Results -- No Schools Found.
Search Results -- No Schools Found.
Search Results -- No Schools Found.
Search Results -- No Schools Found.
Search Results -- No Schools Found.
District: 32--BUSHWICK COMMUNITY HIGH SCHOOL                    
Search Results -- No Schools Found.
Search Results -- No Schools Found.
Search Results -- No Schools Found.
Search Results -- No Schools Found.
Search Results -- No Schools Found.
Search Resul

Thus, we are apparently only missing Bushwick Community High School- all other missing schools (missing in the sense that they are present in the Demographic and Accountability Data but not in our scraped data) are in fact not available through the expenditure report tool.

Let's take a look at what's happening with the Bushwick Community High School. This is the url for the expenditure report for 2012:

https://www.nycenet.edu/offices/d_chanc_oper/budget/exp01/y2011_2012/function.asp?district=All&search=k564&LCMS=K56432Bushwick+Community+High+School&schoolgo=Go&GRANT=NO&cr1=All&cr2=All&cr3=All&cr4=All&R=1&prior=search

Let's see what our method would produce:

In [10]:
def _add_table_expenditures(table, expenditures):
    '''Read one expenditure table and store its label -> amount pairs in `expenditures` (in place).

    For every row, the children of the first cell (labels) are paired with the children of
    the fourth cell (amounts). Labels are reduced to letters and underscores; amounts are
    reduced to their digits and kept as strings. Raises IndexError if a row has fewer than
    four cells.
    '''
    for row in table.findall('.//tr'):
        elts = row.findall('.//td')
        for pair in zip(elts[0],elts[3]):
            key = re.sub('[\xa0]+','_', pair[0].text_content())
            key = re.sub('.*?\._','',key)
            key = re.sub('[^A-Za-z_]','',key)
            expenditures[key] = re.sub('[^0-9]','', pair[1].text_content())


@retry(retry_on_exception=retry_if_URL_error_not_HTTP_error, wait_fixed=2000, stop_max_delay=10000)
def get_all_school_data(year,DBN):
    '''Scrape one school's expenditure report for one year.

    year: int, the ending year of the school year (the URL uses y<year-1>_<year>).
    DBN:  str, the school code (e.g. 'K564').

    Returns a pandas Series named DBN that combines the school features (Table 5 of the
    report) with the expenditure amounts (Tables 7 and 10). Returns None when the school
    name cannot be found, when the report page gives an HTTPError, or when the page is not
    in the expected layout (IndexError).

    A plain URLError (network problem) is not caught here: the @retry decorator re-runs the
    whole function every two seconds for up to 10 seconds.
    '''

    school_name = get_school_name_from_year_and_DBN(year, DBN)
    if school_name is None:
        return None

    ##if found, use school name to build new query to get expenditure report for year:
    years = str(year-1) + '_' + str(year)
    query2 = ("https://www.nycenet.edu/offices/d_chanc_oper/budget/exp01/y" + years +
             "/function.asp?district=All&search=" + DBN + "&LCMS=" + school_name +
             "&schoolgo=Go&GRANT=NO&cr1=All&cr2=All&cr3=All&cr4=All&R=1&prior=search")

    try:
        parsed = parse(urlopen(query2))
        doc = parsed.getroot()
        tables = doc.findall('.//table')

        ##Table 5 data
        rows = tables[4].findall('.//tr')
        elts = rows[2].findall('.//td')

        ## Characters 4-5 of school_name are the district code that follows the DBN.
        ## This parsing works for Districts 1-32, but not 75 or 79:
        if int(school_name[4:6]) in range(1,33):
            for val in elts[0]:
                table_5_dat = val.text_content()
                table_5_dat = re.split('[\xa0]+[\s]?',table_5_dat)
                school_features = {}
                for i in table_5_dat:
                    school_features[str(re.sub('\s','_', re.split(':\s',i)[0]))]=str(re.split(':[\s]+',i)[1])

        ## Districts 75 and 79 fall into this else branch: their first field is the
        ## district name itself rather than a 'label: value' pair.
        else:
            for val in elts[0]:
                table_5_dat = val.text_content()
                table_5_dat = re.split('[\s]?[\xa0]+[\s]?',table_5_dat)
                school_features = {}
                school_features['District'] = str(re.sub('\s', '_', table_5_dat[0]))
                for i in table_5_dat[1:]:
                    school_features[str(re.sub('\s','_', re.split(':\s',i)[0]))]=str(re.split(':[\s]+',i)[1])

        ## Now we need to convert the text district names to numeric district codes:
        if school_features['District'] == 'Citywide_Sp_Ed_(75)':
            school_features['District'] = 75
        elif  school_features['District'] == 'Alternative_HS':
            school_features['District'] = 79

        ##Make expenditure dict:
        expenditures = {}

        ##Table 7 data
        _add_table_expenditures(tables[6], expenditures)

        ##Table 10 data
        _add_table_expenditures(tables[9], expenditures)

    ## Note- @retry wrapper will try if URLerror but not HTTPError (so bad network connection handled)
    ## If HTTPError, we want to just return- the page doesn't exist

    except HTTPError:
        return

    ## If data not available (or not in expected format), we'll throw an IndexError:

    except IndexError:
        return

    ##clean up expenditures- the scrape normally reads in one null value (key '_') that
    ##should be deleted; pop with a default so a page without it doesn't raise KeyError.

    expenditures.pop('_', None)

    ##join two dictionaries and return as series
    school_features.update(expenditures)
    school_features = pd.Series(school_features.values(), index=school_features.keys(), name=DBN)
    return school_features

In [11]:
# Scrape the 2012 report for K564 directly, to check whether the scraper itself can read this school.
get_all_school_data(2012, 'K564')

District: 32--Bushwick Community High School


Instructional_Support_Srcs_All_Funds                   3163
Central_Instructional_Support_All_Funds                  81
Attendance__Outreach_Services                           370
District                                                 32
Counseling_Services                                    1707
Retiree_Health_and_Welfare                              466
Related_Services                                         81
Field_Support_All_Funds                                 117
Instructional_Offices                                    89
Debt_Service                                            671
Drug_Prevention_Programs                                  0
Principals                                              618
Librarians_and_Library_Books                             18
Contracted_Instructional_Services                       703
Text_Books                                               50
Central_Administration_All_Funds                        388
Assistant_Principals                    

Since we are able to read in the data for K564, we know the error is not in how we scraped the data, but instead must be in how we aggregated and returned the data. One hypothesis is that 'K564' is either the first or last DBN in the set. Let's see if that's true:

In [17]:
for year in range(2006,2013):
    print all_DBNs.loc[grouped.groups[str(2012)]]

        DBN schoolyear
6      M015       2012
13     M019       2012
20     M020       2012
27     M034       2012
35     M063       2012
42     M064       2012
49     M110       2012
56     M134       2012
63     M137       2012
70     M140       2012
77     M142       2012
84     M184       2012
91     M188       2012
98     M292       2012
105    M301       2012
112    M315       2012
119    M332       2012
125    M345       2012
132    M361       2012
139    M363       2012
146    M364       2012
150    M378       2012
157    M448       2012
164    M450       2012
166    M458       2012
173    M509       2012
180    M515       2012
187    M539       2012
194    M650       2012
201    M696       2012
...     ...        ...
9880   R600       2012
9887   R605       2012
9890   R861       2012
9897   K045       2012
9904   K075       2012
9911   K086       2012
9918   K106       2012
9925   K116       2012
9932   K123       2012
9939   K145       2012
9946   K151       2012
9953   K162

So, in fact, 'K564' is the last DBN. Hence, we're missing this school because of how we're building the data frame. Let's look at that code- note we're modifying the code to test its performance on a reduced list of schools:

In [28]:
def test_build_expenditure_dataframe(year):
    unique_DBNs = ['K554', 'K556', 'K564']
    results_for_year = get_all_school_data(year, unique_DBNs[0])
    ## previously for DBN in unique_DBNs[1:-1]
    for DBN in unique_DBNs[1:]:
        print DBN
        school_results = get_all_school_data(year, DBN)
        results_for_year = pd.concat([results_for_year, school_results], axis=1)
    results_for_year = results_for_year.T
    return results_for_year

In [29]:
# Run the reduced three-school build for 2012 (this scrapes the live site).
results = test_build_expenditure_dataframe(2012)

District: 32--All City Leadership Secondary School
K556
District: 32--Bushwick Leaders High School for Academic Excellen
K564
District: 32--Bushwick Community High School


In [30]:
# Display the frame: K564 should now appear as the last row.
results

Unnamed: 0,Instructional_Support_Srcs_All_Funds,Central_Instructional_Support_All_Funds,Attendance__Outreach_Services,District,Counseling_Services,Retiree_Health_and_Welfare,Related_Services,Field_Support_All_Funds,Instructional_Offices,Debt_Service,...,Professional_Development,Other_SystemWide_Obligations_All_Funds,Computer_System_Support_School_Level,Secretaries_School_Aides__Other_Support_Staff,Instructional_Support_and_Administration_All_Funds,Summer_and_Evening_School,Other_Field_Support_Costs_All_Funds,Total,Title_1,Food_Services
K554,1326,74,35,32,428,466,141,90,86,671,...,620,1145,65,825,318,40,48,14875,Yes,670
K556,2139,89,112,32,291,466,1144,118,91,671,...,318,1145,65,748,342,9,26,18445,Yes,313
K564,3163,81,370,32,1707,466,81,117,89,671,...,2126,1145,65,659,245,82,20,19759,Yes,176


The issue was that we specified unique_DBNs[1:-1], and thus missed the last school. We (instead) need to specify [1:] to get the rest of the data.