In [2]:
from urllib.request import urlopen
from json import loads
import pandas as pd
import pickle
import numpy as np

This notebook loads all of the data and saves the resulting DataFrames to pickle files. Because the data is historical, it should not change, so the pickle files can be loaded instead of running this code again.

A note on data:

Originally I tried to collect all of the data from Urban Education; however, I ran into issues: the 2019 graduation rates were missing a significant number of schools (for instance, there were no schools listed in Seattle), and the 2018 data had all the schools but was missing graduation rates for a significant portion of them. I initially attempted to create a merged file using 2019 rates when available and 2018 rates when they were not, but multiple schools were redistricted between 2018 and 2019, which meant their NCESSCH numbers changed and I was unable to match them properly. Ultimately I went to the source of the Urban Education data (EDFacts) and downloaded the original graduation rate dataset instead.

In [3]:
# Graduation rates: read the EDFacts CSV straight from its download URL
#
# Source: https://www2.ed.gov/about/inits/ed/edfacts/data-files/index.html

url = 'https://www2.ed.gov/about/inits/ed/edfacts/data-files/acgr-sch-sy2018-19-long.csv'
grad_df = pd.read_csv(filepath_or_buffer=url)

In [4]:
# Load school status data

# Source: https://educationdata.urban.org/documentation/schools.html#ccd_directory

url = "https://educationdata.urban.org/api/v1/schools/ccd/directory/2019/"

# The endpoint is paginated: each response holds one page of rows under
# 'results' and the URL of the following page under 'next' (None on the last
# page). Pages are gathered in a list and concatenated once at the end, so the
# accumulated rows are not re-copied on every iteration.
status_pages = []
rows_loaded = 0
next_link = url

while next_link is not None:
    # The context manager closes the HTTP connection once the page is read
    with urlopen(next_link) as response:
        status_data = loads(response.read())
    temp_df = pd.DataFrame.from_dict(status_data['results'])
    status_pages.append(temp_df)
    rows_loaded += len(temp_df)
    next_link = status_data['next']
    # Print the running row count after each page to ensure code is running properly
    print(rows_loaded)

# status_data still holds the last page's payload, whose 'count' field is
# compared against len(status_df) in the following cell.
status_df = pd.concat(status_pages)

20000
30000
40000
50000
60000
70000
80000
90000
100000
101688


In [5]:
# Sanity check: the number of rows collected should equal the 'count' field
# reported in the last API response
len(status_df) == status_data['count']

True

In [6]:
# Load census tract data - Urban Education

# Source: https://educationdata.urban.org/documentation/schools.html#nhgis-geographic-variables-2010-census-geographies

url = "https://educationdata.urban.org/api/v1/schools/nhgis/census-2010/2019/"

# The endpoint is paginated: each response holds one page of rows under
# 'results' and the URL of the following page under 'next' (None on the last
# page). Pages are gathered in a list and concatenated once at the end, so the
# accumulated rows are not re-copied on every iteration.
census_pages = []
rows_loaded = 0
next_link = url

while next_link is not None:
    # The context manager closes the HTTP connection once the page is read
    with urlopen(next_link) as response:
        census_data = loads(response.read())
    temp_df = pd.DataFrame.from_dict(census_data['results'])
    census_pages.append(temp_df)
    rows_loaded += len(temp_df)
    next_link = census_data['next']
    # Print the running row count after each page to confirm progress
    print(rows_loaded)

# census_data still holds the last page's payload, whose 'count' field is
# compared against len(census_df) in the following cell.
census_df = pd.concat(census_pages)

20000
30000
40000
50000
60000
70000
80000
90000
100000
101688


In [7]:
# Sanity check: the number of rows collected should equal the 'count' field
# reported in the last API response
len(census_df) == census_data['count']

True

In [8]:
# Food access data: read from a CSV file in the working directory
food_df = pd.read_csv(filepath_or_buffer='food_access_2019.csv')

In [9]:
# Save all DataFrames to pickle files
#
# Each DataFrame is written by passing the file path to to_pickle, so pandas
# opens, writes, and closes the file itself. Passing an open() handle that is
# never closed leaks the handle and leaves flushing to garbage collection.
frames_to_save = {
    'census_df.pkl': census_df,
    'status_df.pkl': status_df,
    'grad_df.pkl': grad_df,
    'food_df.pkl': food_df,
}

for pickle_path, frame in frames_to_save.items():
    frame.to_pickle(pickle_path)