# This notebook is used to setup data collection for schooldigger.com API
It also performs other tasks using Google maps API to get schools latitude and longitude

In [24]:
# Import the required libraries
import pandas as pd
import requests
import json
import time

# Get required API key related data'Priv
from config import appID, appKey, gkey
# Define base URL
url = 'https://api.schooldigger.com/v1.2/schools'

In [25]:
# Define User Parameters before calling the endpoint above to fetch json school data
st = 'MN'
districtID = '2721240'
level = 'Elementary'
city = 'Minneapolis'
perPage = '50'

school_levels = ['Elementary', 'Middle', 'High', 'Alt', 'Private']
for level in school_levels:
    params = dict(st=st, districtID=districtID, level=level, city=city, perPage=perPage, appID=appID, appKey=appKey)
    res = requests.get(url, params=params)
    json_resp = res.json()
    Wait 2 minutes due to API limitations
    time.sleep(120)
    try:
        with open(f"./data/json_files/jsonFile{level}.json", 'w') as file:
            json.dump(json_resp, file, ensure_ascii=False)
            print(f"Data written to jsonFile{level}.json")            
    except:
        print("Something went wrong with saving file")
    print(json.dumps(json_resp, indent=4, sort_keys=True))

Data written to jsonFilePrivate.json
{
    "_comment": "NOTICE: API limit for Dev/Test is 1 call per minute, up to 20 calls per day. This limit has been reached. You may continue to make calls, but this result is bogus data and should not be used in a production environment. To change your API plan, go to https://developer.schooldigger.com/admin/applications/",
    "numberOfPages": 0,
    "numberOfSchools": 0,
    "schoolList": []
}


In [4]:
# Initialize series used to collect school data for distinct columns
schoolid = []
schoolName = []
latitude = []
longitude = []
latlng = []
year = []
numberOfStudents = []
percentFreeDiscLunch = []
pupilTeacherRatio = []
rankStars = []
rankLevel = []
rankStatewidePercentage = []
averageStandardScore = []
neighborhood = []

In [23]:
for level in school_levels:
    # Load JSON
    jsonFileName = f"./data/json_files/jsonFile{level}.json"
    with open(jsonFileName) as jsonfile:
        json_resp = json.load(jsonfile)
        #print(json.dumps(json_resp, indent=4, sort_keys=True))    
        # Create Json object to extract the relevant data for each school level
        schoolList = json_resp["schoolList"]
        # Figure out the number of schools in that particular school level category
        schoolNum = json_resp["numberOfSchools"]
        print(schoolNum)    
        for i in range (0, schoolNum):
            # Generic school info
            schoolid.append(schoolList[i]["schoolid"])        
            schoolName.append(schoolList[i]["schoolName"])
            latt = schoolList[i]["address"]["latLong"]["latitude"]
            latitude.append(schoolList[i]["address"]["latLong"]["latitude"])
            lont = schoolList[i]["address"]["latLong"]["longitude"]          
            longitude.append(schoolList[i]["address"]["latLong"]["longitude"])
            latlngt = f"{latt}, {lont}"
            latlng.append(latlngt)
            # School details - these fields were present in all datasets
            year.append(schoolList[i]["schoolYearlyDetails"][0]["year"])        
            numberOfStudents.append(schoolList[i]["schoolYearlyDetails"][0]["numberOfStudents"])
            percentFreeDiscLunch.append(schoolList[i]["schoolYearlyDetails"][0]["percentFreeDiscLunch"])
            pupilTeacherRatio.append(schoolList[i]["schoolYearlyDetails"][0]["pupilTeacherRatio"])
            # Rank History data was missing from various datasets - it needs to be handled separarely
            # Look for an empty section for a rankHistory section that is normally a dictionary
            if schoolList[i]["rankHistory"] is None:
                rankStars.append("NA")
                rankLevel.append("NA")
                rankStatewidePercentage.append("NA")
                averageStandardScore.append("NA")
            else:
                rankStars.append(schoolList[i]["rankHistory"][0]["rankStars"])
                rankLevel.append(schoolList[i]["rankHistory"][0]["rankLevel"])
                rankStatewidePercentage.append(schoolList[i]["rankHistory"][0]["rankStatewidePercentage"])
                averageStandardScore.append(schoolList[i]["rankHistory"][0]["averageStandardScore"])

0


In [None]:
print(f"{schoolid} {schoolName} {latitude} {longitude} {latlng} {year} {rankStars} {rankLevel} {rankStatewidePercentage} \
    {averageStandardScore} {numberOfStudents} {percentFreeDiscLunch} {pupilTeacherRatio}")


In [None]:
# Define base URL
url = 'https://maps.googleapis.com/maps/api/geocode/json'url

# User params specific to all calls to this endpoint above
for lat_lng in latlng:
    params = dict(latlng=lat_lng, key=gkey)
    res = requests.get(url, params=params)
    # Delay processing by five seconds due to API provider restrictions on usage
    time.sleep(5)
    json_resp = res.json()
    json_resp_tmp = json_resp['results'][0]['address_components'][2]['long_name']
    neighborhood.append(json_resp_tmp)

In [None]:
print(f"{schoolid} {schoolName} {latitude} {longitude} {latlng} {year} {rankStars} {rankLevel} {rankStatewidePercentage} \
    {averageStandardScore} {numberOfStudents} {percentFreeDiscLunch} {pupilTeacherRatio} {neighborhood}")

In [None]:
neighborhood


In [None]:
# Combine individual series above to create a dataframe consisting of all of them
minneapolis_schools = {'School ID': schoolid, 'School Name': schoolName, 'Latitude Longitude': latlng, 'Student Population': numberOfStudents,
                  'Free Discount Lunch': percentFreeDiscLunch, 'Students Per Teacher': pupilTeacherRatio, 'Rank Stars': rankStars, 
                  'Rank Level': rankLevel, 'Rank Statewide': rankStatewidePercentage, 'Average Standard Score': averageStandardScore,
                  'Neighborhood': neighborhood}
# Combine individual series above to create a dataframe consisting of all of them
minneapolis_schools = {'School ID': schoolid, 'School Name': schoolName, 'Latitude Longitude': latlng, 'Student Population': numberOfStudents,
                  'Free Discount Lunch': percentFreeDiscLunch, 'Students Per Teacher': pupilTeacherRatio, 'Rank Stars': rankStars, 
                  'Rank Level': rankLevel, 'Rank Statewide': rankStatewidePercentage, 'Average Standard Score': averageStandardScore,
                  'Neighborhood': neighborhood}

minneapolis_schools_df = pd.DataFrame(minneapolis_schools)  
minneapolis_schools_df


In [None]:
# Setup CSV file
output_data_file = "data/csv_files/minneapolis_schools.csv"
# Output data to an Excel CSV file
minneapolis_schools_df.to_csv(output_data_file)

This concluded data collection efforts from two API services. Each had specific restriction relating to the 
number of call per minutes and the total daily usage.