# Checking coverage of the web-scraped data collection

In this notebook, we check that our web scraper captured all available data. Specifically, we compare the list of DBNs by year from the Demographics and Accountability Snapshot against the DBN/year pairs in our web-scraped datasets.

In [1]:
import pandas as pd

# Load one raw expenditure table per school year (2006-2012). The first CSV column
# becomes the index of each frame. A generator is unpacked straight into the seven
# per-year names so no loop variable is left behind in the notebook namespace.
(data_2006, data_2007, data_2008, data_2009,
 data_2010, data_2011, data_2012) = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'raw_school_expenditures_by_year/year_2006.csv',
        'raw_school_expenditures_by_year/year_2007.csv',
        'raw_school_expenditures_by_year/year_2008.csv',
        'raw_school_expenditures_by_year/year_2009.csv',
        'raw_school_expenditures_by_year/year_2010.csv',
        'raw_school_expenditures_by_year/year_2011.csv',
        'raw_school_expenditures_by_year/year_2012.csv',
    )
)

IOError: File all_school_expenditures_by_year/year_2006.csv does not exist

In [48]:
# Map each school year (int) to the expenditure frame loaded for that year.
datasets = dict(zip(
    range(2006, 2013),
    [data_2006, data_2007, data_2008, data_2009, data_2010, data_2011, data_2012],
))

In [49]:
def school_dbn_list(url='https://nycopendata.socrata.com/api/views/ihfw-zy9j/rows.csv?accessType=DOWNLOAD'):
    '''Return the DBN / schoolyear pairs from the Demographics and Accountability snapshot.

    Parameters
    ----------
    url : anything pd.read_csv accepts (URL, local path or file-like object).
        Defaults to the NYC Open Data CSV export, so existing no-argument calls
        behave as before.

    Returns
    -------
    A new DataFrame holding only the 'DBN' and 'schoolyear' columns. It is an
    explicit copy, so callers can overwrite its columns without pandas raising
    SettingWithCopyWarning.

    Raises
    ------
    KeyError if the CSV lacks a 'DBN' or 'schoolyear' column.
    '''
    Demo_and_Account = pd.read_csv(url)
    # .copy() detaches the two-column selection from the full snapshot frame.
    return Demo_and_Account[['DBN', 'schoolyear']].copy()

In [50]:
# Fetch the DBN / schoolyear pairs from the snapshot (school_dbn_list reads the CSV
# over the network, so this cell needs connectivity).
all_DBNs = school_dbn_list()

In [51]:
# Work on an explicit copy: the frame returned by school_dbn_list() is a column
# selection of a larger frame, and assigning columns on such a selection triggers
# pandas' SettingWithCopyWarning.
all_DBNs = all_DBNs.copy()

# Drop the first four characters of 'schoolyear' and the first two of 'DBN'.
# Presumably schoolyear looks like 20052006 and DBN like 01M015, leaving '2006' and
# 'M015' -- verify against the snapshot.
# NOTE: this cell is not idempotent. Running it twice strips the values again, so
# re-run the school_dbn_list() cell before re-running this one.
all_DBNs['schoolyear'] = all_DBNs['schoolyear'].map(lambda x: str(x)[4:])
all_DBNs['DBN'] = all_DBNs['DBN'].map(lambda x: str(x)[2:])

# Group rows by the stripped schoolyear string; later cells index .groups by str(year).
grouped = all_DBNs.groupby('schoolyear')

In [55]:
# For each year, report (number of snapshot DBNs) - (number of scraped rows).
for year in range(2006, 2013):
    snapshot_count = all_DBNs.loc[grouped.groups[str(year)], 'DBN'].count()
    scraped_count = datasets[year].shape[0]
    print('Year {} has {} missing school.'.format(year, snapshot_count - scraped_count))

Year 2006 has 9 missing school.
Year 2007 has 20 missing school.
Year 2008 has 19 missing school.
Year 2009 has 4 missing school.
Year 2010 has 3 missing school.
Year 2011 has 1 missing school.
Year 2012 has 1 missing school.


In [56]:
# For every year, collect the snapshot DBNs that do not appear in the index of the
# scraped frame for that year.
missed_schools = {}
for year in range(2006, 2013):
    year_rows = grouped.groups[str(year)]
    DBN_targets = all_DBNs.loc[year_rows, 'DBN']
    scraped = set(datasets[year].index)
    missing = list(set(DBN_targets) - scraped)
    missed_schools[year] = missing
print(missed_schools)

{2006: ['X191', 'X512', 'X234', 'M099', 'M277', 'K564', 'K418', 'M551', 'K378'], 2007: ['M090', 'K480', 'X222', 'X512', 'M445', 'K470', 'X143', 'M277', 'M164', 'K659', 'K564', 'Q180', 'K440', 'K418', 'M275', 'K391', 'X435', 'M551', 'M535', 'X344'], 2008: ['Q420', 'M277', 'X512', 'X158', 'K469', 'K640', 'X184', 'X239', 'K033', 'K659', 'K564', 'X113', 'K435', 'K418', 'M839', 'K390', 'M462', 'M551', 'K479'], 2009: ['M551', 'X512', 'K564', 'K418'], 2010: ['M551', 'K564', 'K418'], 2011: ['K564', 'K418'], 2012: ['K564', 'K418']}


In [57]:
from urllib2 import urlopen, HTTPError, URLError
from lxml.html import parse
import re
import pandas as pd
import numpy as np
from retrying import retry

## define helper function hexencode to handle special characters in school name
def hexencode(matchobj):
    '''Percent-encode the text matched by `matchobj` so it can be placed in a URL.

    Meant to be used as the replacement callback of re.sub. Every byte of the
    match becomes '%' followed by two lowercase hex digits ('&' -> '%26').

    Unicode text is encoded to bytes first, so a non-ASCII character yields one
    '%xx' group per byte instead of raising UnicodeEncodeError.
    NOTE(review): UTF-8 is assumed for that encoding step -- confirm it is what
    the target server expects.
    '''
    import binascii  # local import keeps this helper self-contained

    matched = matchobj.group(0)
    if not isinstance(matched, bytes):
        matched = matched.encode('utf-8')

    hex_digits = binascii.hexlify(matched)
    if not isinstance(hex_digits, str):
        # hexlify returns bytes on Python 3; normalise to the native str type.
        hex_digits = hex_digits.decode('ascii')

    # One '%xx' group per byte (two hex digits each).
    return ''.join('%' + hex_digits[i:i + 2] for i in range(0, len(hex_digits), 2))

def retry_if_URL_error_not_HTTP_error(exception):
    '''Retry predicate: should the failed call be attempted again?

    Returns True only for a URLError that is not an HTTPError (e.g. the network
    is down). Returns False for an HTTPError (e.g. the page doesn't exist) and
    for any other exception type.
    '''
    if isinstance(exception, HTTPError):
        return False
    return isinstance(exception, URLError)
            
##Retry decorator will retry every two seconds, for up to 10 seconds, on network-level
##URLErrors (HTTPErrors are not retried). It wraps only the download step, and that step
##must let exceptions propagate: the decorator can only retry errors that escape the
##decorated function.
@retry(retry_on_exception=retry_if_URL_error_not_HTTP_error, wait_fixed=2000, stop_max_delay=10000)
def _fetch_search_options(query):
    '''Download the search page at `query` and return its <option> elements.

    Raises HTTPError / URLError from urlopen; they are handled by the caller
    once the retry policy has given up.
    '''
    response = urlopen(query)
    try:
        doc = parse(response).getroot()
    finally:
        response.close()  # release the connection even if parsing fails
    return doc.findall('.//option')


def _school_key_from_option(DBN, text):
    '''Turn one option's text into the URL-ready school identifier.

    Returns None when the text does not start with one of the three recognised
    prefixes ('District', 'Citywide', 'Alternative HS').
    '''
    ##Schools in districts 1-32
    if re.match('District', text):
        school_name = re.sub('--', '', text)
        school_name = re.sub(r'District:\s', '', school_name)
        school_name = re.sub(r'\s', '+', school_name)
        school_name = re.sub(r'[^A-Za-z0-9\s+.]', hexencode, school_name)
        return str(DBN) + school_name

    ##Schools in district 75- citywide special education
    if re.match('Citywide', text):
        school_name = re.split('--', text)[1]
        school_name = re.sub(r'\s', '+', school_name)
        school_name = re.sub(r'\.\+?', '+', school_name)
        ##Note even though it's district 75, it's coded as 97 in the url
        school_name = str(DBN) + str(97) + str(school_name)
        return re.sub(r'[^A-Za-z0-9\s+.]', hexencode, school_name)

    ##Schools in district 79- alternative HS's
    if re.match('Alternative HS', text):
        school_name = re.split('--', text)[1]
        school_name = re.sub(r'\s+', '+', school_name)
        school_name = str(DBN) + str(79) + str(school_name)
        return re.sub(r'[^A-Za-z0-9\s+.]', hexencode, school_name)

    return None


def get_school_name_from_year_and_DBN(year, DBN):
    '''Look up a school on the expenditure site and return its URL-ready identifier.

    Parameters
    ----------
    year : int; the query targets the folder named '<year-1>_<year>'.
    DBN : str; the school code used as the search term.

    Returns
    -------
    The identifier string built from the first matching <option> on the results
    page, or None when the page reports 'No Schools Found', when no option
    matches, or when the page could not be downloaded.

    Side effects: prints the 'No Schools Found' message, the option that follows
    the 'School List' entry, and any download error. Download errors are printed
    rather than raised; network-level errors are retried first (every 2 s, for
    up to 10 s).
    '''
    ##build initial query using year and DBN
    years = str(year-1) + '_' + str(year)
    query1 = ("https://www.nycenet.edu/offices/d_chanc_oper/budget/exp01/y" + years +
             "/function.asp?district=All&search=" + DBN +
             "&searchgo=Search&LCMS=**&GRANT=NO&cr1=All&cr2=All&cr3=All&cr4=All&R=1&prior=search")

    ##get school name by searching html page returned from query- if not found, return None.
    try:
        options = _fetch_search_options(query1)
    except URLError as e:
        # HTTPError is a subclass of URLError, so this one handler covers both
        # "page doesn't exist" and "network still down after retrying".
        print(e)
        return None

    print_next = False
    for option in options:
        text = option.text_content()

        if 'No Schools Found' in text:
            print(text)
            return None

        if 'School List' in text:
            print_next = True ##Then the next option includes the school name
            continue

        if print_next: ##So print it
            print(text)
            print_next = False

        school_name = _school_key_from_option(DBN, text)
        if school_name is not None:
            return school_name

    return None

In [59]:
# Query the site for every school missing from the scraped data, year by year.
# The lookup function prints what it finds; its return value is not stored here.
results = {}
for year, schools in missed_schools.items():
    print('\n')
    print(' '.join(['Year = ', str(year)]))
    for school in schools:
        get_school_name_from_year_and_DBN(year, school)



Year =  2006
Search Results -- No Schools Found.
Search Results -- No Schools Found.
Search Results -- No Schools Found.
Search Results -- No Schools Found.
Search Results -- No Schools Found.
District: 32--BUSHWICK COMMUNITY HIGH SCHOOL                    
Search Results -- No Schools Found.
Search Results -- No Schools Found.
Search Results -- No Schools Found.


Year =  2007
Search Results -- No Schools Found.
Search Results -- No Schools Found.
Search Results -- No Schools Found.
Search Results -- No Schools Found.
Search Results -- No Schools Found.
Search Results -- No Schools Found.
Search Results -- No Schools Found.
Search Results -- No Schools Found.
Search Results -- No Schools Found.
Search Results -- No Schools Found.
District: 32--BUSHWICK COMMUNITY HIGH SCHOOL                    
Search Results -- No Schools Found.
Search Results -- No Schools Found.
Search Results -- No Schools Found.
Search Results -- No Schools Found.
Search Results -- No Schools Found.
Search Resul