# Todo
* More cleanup
* Need to create a dataset with just basic district info so I can join on things like lea_type

# Notes
* This document has all the budget codes -> https://www.education.pa.gov/Documents/Teachers-Administrators/School%20Finances/Comptrollers%20Office/Chart%20of%20Accounts.pdf

In [58]:
import ssl
import re
import pandas as pd
import numpy as np
#import plotly.express as px
#import plotly.io as pio
import urllib.parse
import os
import requests
import sqlite3 as db


# gets rid of ssl errors by swapping the standard library's default HTTPS context
# factory for one that does NOT verify server certificates.
# NOTE(review): this turns off certificate checking for every default-context HTTPS
# request made through the stdlib in this process — confirm that is acceptable.
ssl._create_default_https_context = ssl._create_unverified_context

In [59]:
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import re


# PDE page that lists the yearly General Fund Budget (GFB) data workbooks
url = "https://www.education.pa.gov/Teachers%20-%20Administrators/School%20Finances/Finances/GFBData/Pages/default.aspx"
# scheme + host portion of the page URL (everything before the first path slash)
host = re.sub(r'(https*://[^/]+).*','\\1',url)
req = Request(url)

# the context manager closes the HTTP response as soon as the page has been parsed
with urlopen(req) as html_page:
    # NOTE(review): "html" presumably lets BeautifulSoup pick whichever HTML parser
    # is installed, so results may differ between environments — confirm
    soup = BeautifulSoup(html_page, "html")

# href of every anchor on the page; an anchor without an href contributes None
links = [link.get('href') for link in soup.find_all('a')]

In [60]:
# keep only the hrefs whose text ends in a yearly workbook name such as 2021-22GFBData.xlsx
# (str() lets non-string entries such as None pass through the regex without raising)
links = [href for href in links if re.search(r'20\d\d-\d\dGFBData\.xlsx$', str(href))]

In [61]:
# turn every href into an absolute download URL by resolving it against the page URL.
# urljoin gives the same result as prefixing the host for root-relative hrefs
# ("/Documents/..."), and also copes with hrefs that are already absolute or
# relative to the page, which plain string concatenation would corrupt.
links = [urllib.parse.urljoin(url, href) for href in links]

In [62]:
# this function will download a file if local cache doesn't exist and then return local file name
def cache_url(file, directory='../data/raw/finance_expenses/'):
    """Return the local cache path for ``file``, downloading it first if needed.

    The cache file name is the last ``/``-separated segment of the
    percent-decoded URL, lower-cased, with spaces replaced by underscores.

    Args:
        file: URL of the file to fetch.
        directory: Folder that holds the cached copies; created if missing.

    Returns:
        Path of the cached file inside ``directory``.

    Raises:
        requests.HTTPError: the server answered with an error status. Nothing
            is written to the cache in that case, so a later run retries.
        requests.RequestException: any other network failure, including a timeout.
    """
    # exist_ok only tolerates "already there"; any other failure still surfaces
    os.makedirs(directory, exist_ok=True)

    # decode the URL name, then take the last path segment as the file name
    file_name = urllib.parse.unquote(file).split('/')[-1].lower().replace(' ', '_')
    cache_file = os.path.join(directory, file_name)

    # see if file exists. if so use that. otherwise download it
    if not os.path.exists(cache_file):
        print('caching file: ' + cache_file)
        response = requests.get(file, timeout=60)
        # refuse to cache an error page under the workbook's name
        response.raise_for_status()
        with open(cache_file, 'wb') as f:
            f.write(response.content)

    return cache_file

# this will do the download, just pass it a sheet and URL or file name
def download_excel(file, sheet, skiprows):
    """Read one sheet of a cached Excel workbook into a DataFrame.

    Args:
        file: Location of the workbook; handed to ``cache_url`` to obtain the
            local copy.
        sheet: Sheet name forwarded to ``pandas.read_excel`` as ``sheet_name``.
        skiprows: Forwarded to ``pandas.read_excel`` as ``skiprows``.

    Returns:
        The sheet as a DataFrame, or an empty DataFrame when the workbook
        could not be read (a message is printed in that case).
    """
    # download, cache file, and return cached name
    file = cache_url(file)

    try:
        return pd.read_excel(file, sheet_name=sheet, skiprows=skiprows)
    except Exception:
        # best effort: report and hand back an empty frame, but let
        # KeyboardInterrupt / SystemExit propagate instead of swallowing them
        print( 'Error: not able to download: ' + file )
        return pd.DataFrame()

In [63]:
# builds one giant dataframe
frames = []

# only the first five links are processed
for l in links[:5]:
    # find the start year from the URL
    school_year = re.sub(r'^https.+(\d\d\d\d)-\d\dGFBData.xlsx$', '\\1', l)
    print( school_year + ": " + l )

    # workbooks for years before 2011 are not read; skip them outright so a
    # previous iteration's frame is never melted again under the wrong year
    if int(school_year) < 2011:
        continue

    # download and read in the sheet
    df1 = download_excel(l, 'ExpDetail', 0)
    if df1.empty:
        # download_excel hands back an empty frame when it could not read the
        # workbook; melting it would fail on the missing id columns
        continue

    # make long form
    df1 = pd.melt(df1, id_vars=['InstCat', 'AUN', 'InstName', 'CountyName'],
                  var_name='expense_category', value_name='expense_value')

    # add in school year
    df1['school_year'] = school_year

    frames.append(df1)

# concatenate once at the end instead of re-copying the growing frame every pass
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# convert header names to strings
df.columns = df.columns.map(str)

2021: https://www.education.pa.gov/Documents/Teachers-Administrators/School%20Finances/Finances/GFBData/2021-22GFBData.xlsx
2020: https://www.education.pa.gov/Documents/Teachers-Administrators/School%20Finances/Finances/GFBData/2020-21GFBData.xlsx
2019: https://www.education.pa.gov/Documents/Teachers-Administrators/School%20Finances/Finances/GFBData/2019-20GFBData.xlsx
2018: https://www.education.pa.gov/Documents/Teachers-Administrators/School%20Finances/Finances/GFBData/2018-19GFBData.xlsx
2017: https://www.education.pa.gov/Documents/Teachers-Administrators/School%20Finances/Finances/GFBData/2017-18GFBData.xlsx


In [64]:
# Normalise the headers: trim surrounding whitespace, turn spaces into underscores, lower-case
df.columns = df.columns.map(lambda name: name.strip().replace(' ', '_').lower())

In [65]:
# load the expense-category lookup and attach its columns to the main dataframe:
# each row is matched where df.expense_category equals the lookup's raw_category
expense_category = pd.read_csv('../lookup_data/expense_lookups.csv')
category_lookup = expense_category.set_index('raw_category')
df = df.join(category_lookup, on='expense_category')

In [66]:
# remove, in place, the descriptive columns that are not stored in the database
drop_columns = ['instname', 'countyname', 'instcat']
df.drop(columns=drop_columns, inplace=True)

In [67]:
# persist the dataframe as a pickle for another script to consume;
# 'infer' derives the compression from the file extension
df.to_pickle(
    path="../data/budget_expense.pkl.bz2",
    compression='infer',
)

## The cells below write summarized tables into the sqlite3 database

In [68]:
# Connect to DB file (sqlite3 creates the file if it does not exist yet)
# NOTE(review): no matching cnx.close() is visible in this part of the notebook —
# confirm the connection is closed once all tables have been written.
cnx = db.connect('../data/pde.db')

In [None]:
# write the expense rows to the finance_expense table (replacing any existing table).
# rows are ordered newest school_year first and keyed by (aun, school_year,
# expense_category); index=True stores those key columns alongside the data.
expense_info = (
    df.sort_values(by='school_year', ascending=False)
      .set_index(['aun', 'school_year', 'expense_category'])
)
expense_info.to_sql(
    name='finance_expense',
    con=cnx,
    if_exists='replace',
    index=True,
)