In [1]:
import pandas as pd
from nycschools import schools
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from collections import Counter
from itertools import chain


import os

import dotenv
dotenv.load_dotenv()
os.environ.get("CHROME_PATH", None)


Functions
=========

In [5]:
def open_webdriver():

    # don't launch the browser GUI
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')

    # if the environment variables are set, use them
    chrome_path = os.environ.get("CHROME_PATH", None)
    chromedriver_path = os.environ.get("CHROMEDRIVER_PATH", None)
    
    if chrome_path:
        chrome_options.binary_location = chrome_path
    if chromedriver_path:
        print("using driver")
        s = Service(chromedriver_path)
        return webdriver.Chrome(service=s, options=chrome_options)

    return webdriver.Chrome(options=chrome_options)

def fix_cols(data, sections, dbn):


    col_map = {
        'title': 'item',
        'assignment': 'item',
        'organizational category': 'item',
        'total': 'item',
        'total.1': 'item',
        'grand total': 'item',
        'type of class/service': 'service'
    }


    data = data.copy()
    
    for i, section in enumerate(sections):
        data[i]["category"] = section
        data[i].columns = [c.lower() for c in data[i].columns]
        data[i].rename(columns=col_map, inplace=True)
        school_col = [c for c in data[i].columns if c.lower().startswith(dbn.lower())]
        if len(school_col) > 0:
            data[i].rename(columns={school_col[0]: 'item'}, inplace=True)
    return data


def get_budgets(dbn, ay, driver):

    url = f"https://www.nycenet.edu/offices/d_chanc_oper/budget/dbor/galaxy/galaxybudgetsummaryto/default.aspx?DDBSSS_INPUT={dbn[2:]}"

    driver.get(url)
    html = driver.page_source

    soup = BeautifulSoup(html, 'html.parser')

    sections = [section.get_text().strip() for section in soup.select('.TO_Section')]
    data = pd.read_html(html)
    data = fix_cols(data, sections, dbn)

    for i in range(len(data)):
        data[i] = data[i].loc[:, ~data[i].columns.duplicated()]

    data = pd.concat(data, ignore_index=True)
    data["dbn"] = dbn
    data["ay"] = ay

    return data

def batch_budgets(dbns, ay, driver):
    budgets = []
    not_found = []
    for dbn in dbns:
        try:
            budgets.append(get_budgets(dbn, ay, driver))
        except ValueError:
            not_found.append(dbn)

    data = pd.concat(budgets)
    data.item = data.item.str.lower()

    return data, not_found


Single School Test
==================

In [6]:
driver = open_webdriver()
data = get_budgets("13K001", 2022, driver)
driver.quit()
data


Unnamed: 0,item,positions,budget,category,grade,service,subject,15k001 - ps 001 the bergen school,dbn,ay
0,HEAD OF SCHOOL,1.0,"$ 181,271",Leadership,,,,,13K001,2022
1,ASST HEAD OF SCHOOL,3.0,"$ 351,441",Leadership,,,,,13K001,2022
2,Leadership Total,4.0,"$ 532,712",Leadership,,,,,13K001,2022
3,PARENT COORDINATOR,1.0,"$ 50,520",Coordinator/Supervisor/Dean,,,,,13K001,2022
4,School Secretary,2.0,"$ 126,423",Secretary,,,,,13K001,2022
...,...,...,...,...,...,...,...,...,...,...
102,School Funded Copier,,"$ 8,940",Setasides,,,,,13K001,2022
103,Not Available,,"$ 29,840",Sixth period coverage,,,,,13K001,2022
104,People Working Partial Year,,"$ 143,389",People Working Partial Year,,,,,13K001,2022
105,,93.0,"$ 10,606,323",School Site,,,,Main School,13K001,2022


In [8]:
# cols = [d.columns for d in data]
# cols = list(chain(*cols))
# counts = pd.DataFrame(Counter(cols).items(), columns=['header', 'count'])
# counts.sort_values('count', ascending=False)

cols = []
for d in data:
    if isinstance(d, pd.DataFrame):
        cols.append(d.columns)
cols = list(chain(*cols))
counts = pd.DataFrame(Counter(cols).items(), columns=['header', 'count'])



Batch Budgets
==============

In [9]:
df = schools.load_school_demographics()
df = df[(df.ay == df.ay.max()) & (df.district < 33)]
dbns = df.dbn.values
driver = open_webdriver()
budgets, not_found = batch_budgets(dbns, 2022, driver)

driver.quit()

KeyboardInterrupt: 

In [None]:
budgets.columns
data = budgets.copy()

# update slice of data to set item to the value of "grand total" where grand total is not null
# data.loc[data["grand total"].notnull(), "item"] = data["grand total"]
x = data[data["grand total"].notnull()]
x["item"] = x["grand total"]
data[data["grand total"].notnull()]


In [None]:

# items = data.item.value_counts()
# x = pd.DataFrame(items)
# x.sort_values("item").head(50)
budgets.to_csv("/opt/nycschools/galaxy.csv", index=False)


In [None]:

# cols = [d.columns for d in budgets]
# cols = list(chain(*cols))
# counts = pd.DataFrame(Counter(cols).items(), columns=['column', 'count'])
# counts.sort_values('count', ascending=False)
budgets.columns

In [1]:
# set autoreload
%load_ext autoreload
from nycschools import budgets


In [2]:


data, not_found = budgets.get_galaxy_budgets()
len(data)


Error scraping 15K001


111745

In [3]:
budgets.load_galaxy_budgets()

Unnamed: 0,item,positions,budget,category,subject,service,dbn,ay,grade
0,head of school,1.0,"$ 183,162",Leadership,,,27Q302,2022,
1,ap - special ed,1.0,"$ 144,323",Leadership,,,27Q302,2022,
2,ap - supervision,2.0,"$ 289,124",Leadership,,,27Q302,2022,
3,leadership total,4.0,"$ 616,609",Leadership,,,27Q302,2022,
4,dean,1.0,"$ 98,459",Coordinator/Supervisor/Dean,,,27Q302,2022,
...,...,...,...,...,...,...,...,...,...
3757,office temp services - contractual,,"$ 165,000",OTPS,,,02M150,2022,
3758,supplies - general,,"$ 43,820",OTPS,,,02M150,2022,
3759,textbooks,,"$ 24,599",OTPS,,,02M150,2022,
3760,otps total,,"$ 276,645",OTPS,,,02M150,2022,
