# Automate the Excel workbook linked to PMP Dashboard

In [None]:
import geopandas as gpd
import numpy as np
import pandas as pd
from calitp import *
from shared_utils import utils

# Notebook display settings: show up to 100 columns, and never truncate
# the number of rows or the width of a cell when a frame is rendered.
pd.set_option("display.max_columns", 100)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [None]:
# Base GCS folder for this analysis. File names are appended to it directly
# (f"{GCS_FILE_PATH}{file_name}"), so the trailing slash must stay.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/pmp_dashboard/"

## Load in data
### Load manually cleaned df so I can reference it

In [None]:
# Workbook that William cleaned by hand; read from GCS_FILE_PATH and used as
# the reference to compare the automated output against.
FILE_NAME_1 = "PMP Summary Report Data.xlsx"

In [None]:
# Sheets to pull out of William's workbook.
sheets_list = [
    "Fund by Division Data",
    "TPSOE Data",
    "Timeline Data",
    "PSOE Timeline",
]

In [None]:
# sheet_name is a list here, so read_excel returns a dict of DataFrames
# keyed by sheet name rather than a single DataFrame.
dict_df1 = pd.read_excel(f"{GCS_FILE_PATH}{FILE_NAME_1}", sheet_name=sheets_list)

In [None]:
# Take each reference sheet out of the dict and run it through to_snakecase,
# binding the results to one name per sheet (same order as listed below).
division_df, tpsoe_df, timeline_df, psoe_df = (
    to_snakecase(dict_df1.get(sheet))
    for sheet in (
        "Fund by Division Data",
        "TPSOE Data",
        "Timeline Data",
        "PSOE Timeline",
    )
)

### Load in raw data for each accounting period

In [None]:
def import_raw_data(
    file_name: str, name_of_sheet: str, n_preamble_rows: int = 13
) -> pd.DataFrame:
    """Load one accounting-period download and normalize its columns.

    The sheet is read from ``GCS_FILE_PATH``. The report preamble is cut off,
    the row right after it becomes the header, rows missing any of the key
    identifier columns are dropped, and the columns are passed through
    ``to_snakecase`` and renamed to the names William used.

    Args:
        file_name: Workbook file name, appended directly to ``GCS_FILE_PATH``.
        name_of_sheet: Name of the sheet to read from that workbook.
        n_preamble_rows: Number of leading rows of the frame returned by
            ``read_excel`` to discard before the row holding the real column
            names (lines such as "Enterprise Datalink Production download as
            of 05/23/2022"). Defaults to 13, the offset that used to be
            hard-coded, so existing calls behave exactly as before.

    Returns:
        The cleaned DataFrame with renamed, snake-cased columns.
    """
    df = pd.read_excel(f"{GCS_FILE_PATH}{file_name}", sheet_name=name_of_sheet)

    # Cut off the report preamble so the real header row sits at position 0.
    df = df.iloc[n_preamble_rows:].reset_index(drop=True)

    # Promote that row to the column names, then remove it from the data.
    df.columns = df.iloc[0]
    df = df.drop(df.index[0]).reset_index(drop=True)

    # Rows missing any of these identifiers are probably the grand totals.
    df = df.dropna(subset=["Appr Catg", "Appr", "PEC Class", "PEC Class Description"])

    df = to_snakecase(df)

    # Rename columns to what William named them as.
    df = df.rename(
        columns={
            "pec_class_description": "division",
            "ps_alloc": "ps_allocation",
            "ps_exp": "ps_expenditure",
            "ps_bal": "ps_balance",
            "total_projected_%": "total_%_expended",
            "oe_alloc": "oe_allocation",
            "oe_enc": "oe_encumbrance",
            "oe_exp": "oe_expenditure",
            "appr": "appropriation",
            "total_expended___encumbrance": "total_expenditure",
            "oe_bal_excl_pre_enc": "oe_balance",
            "oe__enc_+_oe_exp_projection": "oe_enc_+_oe_exp_projection",
        }
    )
    return df

In [None]:
# AP1-AP9 all follow the same file pattern ("AP<n>.xls") and sheet name, so
# load them in one pass and unpack the results into the per-period names.
ap1, ap2, ap3, ap4, ap5, ap6, ap7, ap8, ap9 = (
    import_raw_data(f"AP{period}.xls", "Download") for period in range(1, 10)
)

In [None]:
# AP10-AP12 are read from the closed-period summary workbooks, each of which
# keeps its raw data on its own named sheet.
ap10 = import_raw_data(
    file_name="FY 2122 AP10_Closed_PMP Summary Report.022822_Andrew Updated.xlsx",
    name_of_sheet="Raw Data AP10 Closed",
)
ap11 = import_raw_data(
    file_name="FY 2122 AP11_Closed_PMP Summary Report.022822_Updated.xlsx",
    name_of_sheet="Raw Data AP11 Closed",
)
ap12 = import_raw_data(
    file_name="FY 2122 AP12_Closed_PMP Summary Report.022822_Updated.xlsx",
    name_of_sheet="Raw Data AP12 Closed",
)

## Division DF 
* Attempt to mimic William's processed sheet "Fund by Division Data" (loaded above as `division_df`)
* This dataframe is just the most recent accounting period (AP 11, I think) with some columns and rows filtered out

In [None]:
# Start from AP11 and remove the columns that are not wanted in the
# division comparison.
excluded_cols = ["appr_catg", "act__hours", "py_pos_alloc"]
division_test = ap11.drop(excluded_cols, axis="columns")

In [None]:
# Add a blank column for notes: the scalar NaN is broadcast to every row.
# Re-running this cell simply overwrites the column with NaN again.
division_test["notes"] = np.nan

In [None]:
# Appropriation codes to leave out of the division comparison.
# NOTE(review): these are strings; isin() will not match them if the
# appropriation column holds integers - confirm the column dtype.
unwanted_appropriations = [
    "1850522",
    "22102",
    "22105",
]

In [None]:
# Keep only the rows whose appropriation is NOT on the unwanted list.
division_test = division_test.loc[
    ~division_test["appropriation"].isin(unwanted_appropriations)
]

In [None]:
# Distinct appropriation codes in William's sheet (actual) and in the
# rebuilt frame (test), as sets so they can be diffed.
actual = set(division_df["appropriation"].unique().tolist())
test = set(division_test["appropriation"].unique().tolist())

In [None]:
# Codes present in William's sheet but missing from the rebuilt frame.
actual.difference(test)

In [None]:
# Codes present in the rebuilt frame but absent from William's sheet.
test.difference(actual)

In [None]:
# (rows, columns) of the rebuilt frame at this point in the notebook.
division_test.shape

In [None]:
# The rebuilt frame must expose exactly the same column names as William's
# sheet (compared as sets, so order is ignored). DataFrame has `.columns`,
# not `.column`. On failure the message lists the names found in only one
# of the two frames.
assert set(division_df.columns) == set(division_test.columns), (
    set(division_df.columns) ^ set(division_test.columns)
)

In [None]:
# Columns whose totals are compared between the two frames, grouped by
# prefix (ps_ / oe_ / total_); the resulting order is unchanged.
testing_cols = (
    ["ps_allocation", "ps_expenditure", "ps_balance", "ps_projection"]
    + ["oe_allocation", "oe_encumbrance", "oe_expenditure"]
    + ["total_allocation", "total_expenditure", "total_balance"]
)

In [None]:
# Column totals from William's sheet, one per line, in testing_cols order.
for col in testing_cols:
    print(division_df[col].sum())

In [None]:
# The same totals from the rebuilt frame, for a side-by-side comparison.
for col in testing_cols:
    print(division_test[col].sum())