# Automate the Excel workbook linked to PMP Dashboard

In [1]:
import geopandas as gpd
import numpy as np
import pandas as pd
from calitp import *
from shared_utils import utils

# Formatting the nb
pd.options.display.max_columns = 100
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)



In [2]:
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/pmp_dashboard/"

## Load in data
### Load the manually cleaned workbook to use as a reference

In [None]:
# Workbook that William prepared; kept as the reference to compare against
FILE_NAME_1 = "PMP Summary Report Data.xlsx"

# The only sheets of that workbook that are needed here
sheets_list = ["Fund by Division Data", "TPSOE Data", "Timeline Data", "PSOE Timeline"]

# Passing a list of sheet names yields a dict of dataframes keyed by sheet name
dict_df1 = pd.read_excel(f"{GCS_FILE_PATH}{FILE_NAME_1}", sheet_name=sheets_list)

# Snake-case each sheet; the unpacking order follows sheets_list
division_df, tpsoe_df, timeline_df, psoe_df = (
    to_snakecase(dict_df1.get(sheet)) for sheet in sheets_list
)

### Load in & clean data for each accounting period

In [None]:
# PEC class descriptions grouped under the division value they are replaced with
_pec_classes_by_division = {
    "DRMT": ["State & Fed Mass Trans", "Rail"],
    "DOTP": [
        "Statewide Planning",
        "PSR/PSSR Development",
        "Planning Administration",
        "Regional Planning",
    ],
    "DRISI": ["Research"],
}

# Flattened lookup: PEC class description -> division
div_crosswalks = {
    pec_class: division
    for division, pec_classes in _pec_classes_by_division.items()
    for pec_class in pec_classes
}

In [None]:
# Columns that import_raw_data casts to int64, named as they are after the
# raw sheet has been snake-cased and renamed.
# NOTE: int_cols is rebound further down in this notebook to a longer list
# that also contains derived columns.
int_cols = [
    "ps_allocation",
    "ps_expenditure",
    "ps_balance",
    "py_pos_alloc",
    "act__hours",
    "oe_allocation",
    "oe_encumbrance",
    "oe_expenditure",
    "oe_balance",
]

#percent_cols = ["ps_%_expended", "oe_%_expended", "total_%_expended"]

In [None]:
"""
Each time I receive new data for each accounting period and 
load data into function import_raw_data, the dataframe will be 
appended onto this list
"""
my_clean_dataframes = []

In [None]:
unwanted_appropriations = ["1850522", "22102", "22105"]

In [None]:
def import_raw_data(
    file_name: str,
    name_of_sheet: str,
    appropriations_to_filter: list,
    accounting_period: int,
):
    """Load one accounting period's raw Excel export and clean it up.

    Args:
        file_name: name of the Excel workbook, relative to GCS_FILE_PATH.
        name_of_sheet: name of the sheet that holds the raw download.
        appropriations_to_filter: appropriations whose rows are dropped.
        accounting_period: the accounting period the workbook covers. It is
            stored in the "ap" column and used to annualize expenditures.

    Returns:
        The cleaned dataframe. As a side effect the same dataframe is
        appended to the module-level list my_clean_dataframes.
    """
    df = pd.read_excel(f"{GCS_FILE_PATH}{file_name}", sheet_name=name_of_sheet)

    # Get rid of the unnecessary header info
    # Stuff like "Enterprise Datalink Production download as of 05/23/2022"
    df = df.iloc[13:].reset_index(drop=True)

    # The first remaining row contains the column names
    df.columns = df.iloc[0]

    # Drop that row now that it has been promoted to the header
    df = df.drop(df.index[0]).reset_index(drop=True)

    # Drop rows with NA in PEC Class
    # Since those are probably the grand totals tagged at the end of the Excel sheet
    df = df.dropna(subset=["PEC Class"])

    # Snakecase
    df = to_snakecase(df)

    # Rename columns to mimic dashboard
    df = df.rename(
        columns={
            "ps_alloc": "ps_allocation",
            "ps_exp": "ps_expenditure",
            "ps_bal": "ps_balance",
            "total_projected_%": "total_%_expended",
            "oe_alloc": "oe_allocation",
            "oe_enc": "oe_encumbrance",
            "oe_exp": "oe_expenditure",
            "appr": "appropriation",
            "oe_bal_excl_pre_enc": "oe_balance",
        }
    )

    # Certain appropriation(s) are filtered out. Copy so the column
    # assignments below write to an independent frame, not a slice.
    df = df[~df.appropriation.isin(appropriations_to_filter)].copy()

    # Change to the right data type. Missing values have to be filled BEFORE
    # the cast, because NaN cannot be converted to int64.
    # int_cols is rebound later in this notebook to a list that also holds
    # derived columns, so only cast the ones that exist at this point.
    cols_to_cast = [col for col in int_cols if col in df.columns]
    df[cols_to_cast] = df[cols_to_cast].fillna(0).astype("int64")

    # Fill in a column with the accounting period
    df["ap"] = accounting_period

    # PS projection: spend so far, scaled from periods elapsed to 12 periods
    df["ps_projection"] = (df["ps_expenditure"] / accounting_period) * 12
    # PS % Expended
    df["ps_%_expended"] = (df["ps_expenditure"] / df["ps_allocation"]).fillna(0)
    # Add the column of 'Year End Expended Pace'
    df["year_expended_pace"] = (df["ps_projection"] / df["ps_allocation"]).fillna(0)
    # OE projection: encumbrances are added as-is and expenditures are
    # annualized the same way as ps_projection.
    # NOTE(review): confirm this formula against the dashboard workbook.
    df["oe_projection"] = (
        df["oe_encumbrance"] + (df["oe_expenditure"] / accounting_period) * 12
    )
    # Create OE expended
    df["oe_%_expended"] = (df["oe_projection"] / df["oe_allocation"]).fillna(0)

    # Narrow down division names into a new column
    df["division"] = df["pec_class_description"].replace(div_crosswalks)

    # Add in totals
    df["total_allocation"] = df["oe_allocation"] + df["ps_allocation"]
    # Originally called total expended & encumbrance
    df["total_expenditure"] = (
        df["oe_encumbrance"] + df["oe_expenditure"] + df["ps_expenditure"]
    )
    df["total_balance"] = df["ps_balance"] + df["oe_balance"]
    df["total_projection"] = df["ps_projection"] + df["oe_projection"]
    # NOTE(review): the rename above maps a raw "total_projected_%" column to
    # "total_%_expended"; this line creates "total_projected_%" again.
    # Confirm which of the two names the dashboard expects.
    df["total_projected_%"] = (df["total_expenditure"] / df["total_allocation"]).fillna(
        0
    )

    # Adding dataframe to the module-level list my_clean_dataframes
    my_clean_dataframes.append(df)

    return df

In [None]:
ap4 = import_raw_data("AP4 October.xls", "Download", unwanted_appropriations, 4)

In [None]:
ap4.head()

In [None]:
# import_raw_data requires the accounting period as its fourth argument
ap11 = import_raw_data(
    "FY 2122 AP11_Closed_PMP Summary Report.022822_Updated.xlsx",
    "Raw Data AP11 Closed",
    unwanted_appropriations,
    11,
)

In [None]:
ap11.head()

In [None]:
ap4.dtypes

In [None]:
ap11.info()

In [None]:
ap11.columns

In [None]:
int_cols = [
    "ps_allocation",
    "ps_expenditure",
    "ps_balance",
    "ps_projection",
    "py_pos_alloc",
    "act__hours",
    "oe_allocation",
    "oe_encumbrance",
    "oe_expenditure",
    "oe_balance",
    "oe_projection",
    "total_allocation",
    "total_expenditure",
    "total_balance",
    "total_projection",
]

In [None]:
percent_cols = ["ps_%_expended", "oe_%_expended", "total_%_expended"]

In [None]:
ap11[int_cols] = ap11[int_cols].astype("int64")

In [None]:
ap11[percent_cols] = ap11[percent_cols].replace("%", "", regex=True).astype("float")

In [None]:
ap11["year_expended_pace"] = (ap11["ps_projection"] / ap11["ps_allocation"]).fillna(0)

In [None]:
ap11.head()

In [None]:
# ap11.to_csv('./testing.csv')

In [None]:
set(ap4.columns).difference(set(ap11.columns))

In [None]:
# ap7["total_balance"].sum()

In [None]:
# ap4["total_balance"].sum()

In [None]:
# ap12[['fund','appropriation','pec_class','ps_allocation']].sort_values(['ps_allocation',])

In [None]:
len(my_clean_dataframes)

## Fund by Division Data
* Attempt to mimic William's processed sheet called "division df"
* This dataframe is AP 11 with a few columns dropped and a blank notes column added

In [None]:
def create_fund_by_division(df):
    """Build the "Fund by Division" sheet from a cleaned dataframe.

    Args:
        df: cleaned dataframe containing the columns "appr_catg",
            "act__hours" and "py_pos_alloc".

    Returns:
        A new dataframe without those three columns and with an empty
        "notes" column appended. The input dataframe is not modified.
    """
    # Drop excluded cols
    excluded_cols = ["appr_catg", "act__hours", "py_pos_alloc"]
    # Use the dataframe that was passed in; drop() returns a new frame
    df = df.drop(columns=excluded_cols)

    # Add a blank column for notes
    df["notes"] = np.nan

    return df

In [None]:
division_test = create_fund_by_division(ap11)

### Double checking: William's vs mine

In [None]:
test = set(division_test.appropriation.unique().tolist())
actual = set(division_df.appropriation.unique().tolist())

In [None]:
actual - test

In [None]:
test - actual

In [None]:
# division_test[['appropriation','ps_allocation']].sort_values(['appropriation', 'ps_allocation'])

In [None]:
# division_df[['appropriation','ps_allocation']].sort_values(['appropriation', 'ps_allocation'])

In [None]:
# division_test.loc[division_test["appropriation"] == "22030"]

In [None]:
# division_df.loc[division_df["appropriation"] == "22030"]

In [None]:
testing_cols = [
    "ps_allocation",
    "ps_expenditure",
    "ps_balance",
    "ps_projection",
    "oe_allocation",
    "oe_encumbrance",
    "oe_expenditure",
    "total_allocation",
    "total_expenditure",
    "total_balance",
]

In [None]:
for i in testing_cols:
    print("\n" + i)
    print(division_df[i].sum())
    print(division_test[i].sum())

## TPSOE Data

In [None]:
# Reference point
# tpsoe_df.loc[tpsoe_df["division"] == "Aeronautics"]

In [None]:
# ap11.loc[ap11["division"] == "Aeronautics"]

### Function to clean

In [None]:
def cleaning_psoe_tpsoe(df, ps_or_oe: str):
    """Tag a PS or OE subset and strip the matching prefix from its columns.

    Args:
        df: dataframe whose columns may start with f"{ps_or_oe}_".
        ps_or_oe: "ps" for personal services or "oe" for operating expense.

    Returns:
        A copy of df with a "type" column set to ps_or_oe and the leading
        f"{ps_or_oe}_" removed from every column name that starts with it.
        The input dataframe is left untouched.
    """
    prefix = f"{ps_or_oe}_"

    def _strip_prefix(col):
        # Only a leading prefix is removed, so a name that merely contains
        # the same characters further in is kept intact.
        if isinstance(col, str) and col.startswith(prefix):
            return col[len(prefix):]
        return col

    # Work on a copy: callers pass in column subsets of a larger dataframe
    df = df.copy()
    df["type"] = ps_or_oe

    return df.rename(columns=_strip_prefix)

### Function to create sheet

In [None]:
# Cols for PS
# "year_expended_pace" is the name under which import_raw_data creates the
# pace column and the name order_of_cols asks for below.
tpsoe_ps_list = [
    "fund",
    "fund_description",
    "appropriation",
    "pec_class",
    "division",
    "ps_allocation",
    "ps_expenditure",
    "ps_balance",
    "ps_projection",
    "year_expended_pace",
    "ps_%_expended",
]

# Cols for OE
# NOTE(review): no OE percent-expended column is listed here, so
# "%_expended" will be empty for OE rows once PS and OE are stacked.
# Confirm that is intended.
tpsoe_oe_list = [
    "fund",
    "fund_description",
    "appropriation",
    "pec_class",
    "division",
    "oe_allocation",
    "oe_encumbrance",
    "oe_expenditure",
    "oe_balance",
    "oe_projection",
]

# Prefix-stripped columns that create_tpsoe casts to float64
monetary_cols = [
    "allocation",
    "expenditure",
    "balance",
    "encumbrance",
    "projection",
]

# Final column order of the TPSOE sheet (names after the ps_/oe_ prefix is stripped)
order_of_cols = [
    "pec_class",
    "division",
    "fund",
    "fund_description",
    "appropriation",
    "type",
    "allocation",
    "expenditure",
    "balance",
    "encumbrance",
    "projection",
    "year_expended_pace",
    "%_expended",
]

In [None]:
# chocolate

In [None]:
def create_tpsoe(df, ps_list: list, oe_list: list):
    """Build the TPSOE sheet by stacking the PS and OE views of df.

    Args:
        df: cleaned dataframe for one accounting period.
        ps_list: a list of all the ps related columns.
        oe_list: a list of all the oe related columns.

    Returns:
        A dataframe with the PS rows followed by the OE rows, columns
        arranged as in the module-level order_of_cols, an empty "notes"
        column, and the module-level monetary_cols cast to float64. A
        column that exists in only one of the two subsets is NaN for the
        rows of the other subset.
    """
    # Subset on copies so the cleaning step never writes into a slice of df
    tpsoe_oe = cleaning_psoe_tpsoe(df[oe_list].copy(), "oe")
    tpsoe_ps = cleaning_psoe_tpsoe(df[ps_list].copy(), "ps")

    # Stack the two dataframes; renumber the rows so that PS and OE rows
    # do not share index labels
    c1 = pd.concat([tpsoe_ps, tpsoe_oe], sort=False, ignore_index=True)

    # Rearrange the columns to the right order (copy before adding columns)
    c1 = c1[order_of_cols].copy()

    # Add a notes column
    c1["notes"] = np.nan

    # Correct data types of monetary columns from objects to float
    c1[monetary_cols] = c1[monetary_cols].astype("float64")

    return c1

In [None]:
c1 = create_tpsoe(ap11, tpsoe_ps_list, tpsoe_oe_list)

### Double checking

In [None]:
c1.shape

In [None]:
tpsoe_df.shape

In [None]:
# c1[['division', 'type','pec_class', 'allocation']].sort_values(['pec_class', 'allocation'])

In [None]:
# tpsoe_df[['division','type','pec_class', 'allocation']].sort_values(['pec_class', 'allocation'])

In [None]:
for i in monetary_cols:
    print("\n" + i)
    print(c1[i].sum())
    print(tpsoe_df[i].sum())

In [None]:
test = set(c1.appropriation.unique().tolist())
actual = set(tpsoe_df.appropriation.unique().tolist())

In [None]:
actual - test

In [None]:
test - actual

In [None]:
test = set(c1.pec_class.unique().tolist())
actual = set(tpsoe_df.pec_class.unique().tolist())

In [None]:
actual - test

In [None]:
test - actual

## Timeline 

In [None]:
def create_timeline(dataframes=None):
    """Stack the cleaned dataframes of all accounting periods.

    Each dataframe keeps the accounting period already stored in its "ap"
    column, so the stacked rows can be told apart by their real period
    instead of by the position of the dataframe in the list.

    Args:
        dataframes: the dataframes to stack. Defaults to the module-level
            list my_clean_dataframes.

    Returns:
        One dataframe with every input row, a fresh 0..n-1 index and "ap"
        as the first column.

    Raises:
        ValueError: if there are no dataframes to stack.
        KeyError: if the stacked dataframe has no "ap" column.
    """
    frames = my_clean_dataframes if dataframes is None else list(dataframes)
    if not frames:
        raise ValueError(
            "No cleaned dataframes to stack; load at least one accounting period first."
        )

    c1 = pd.concat(frames, ignore_index=True)

    # Keep "ap" as the leading column
    c1.insert(0, "ap", c1.pop("ap"))

    return c1

In [None]:
keys_list = []

In [None]:
# Key each dataframe by the accounting period stored in its "ap" column
# rather than by its position in the list, so the stacked "ap" values match
# the real periods (e.g. 4 and 11, not 1 and 2).
for item in my_clean_dataframes:
    keys_list.append(int(item["ap"].iloc[0]))

In [None]:
keys_list

In [None]:
# https://stackoverflow.com/questions/59267129/how-to-concatenate-multiple-dataframes-from-multiple-sources-in-pandas
time_test = (
    pd.concat(my_clean_dataframes, keys=keys_list)
    .rename_axis(("source", "tmp"))
    .reset_index(level=0)
    .reset_index(drop=True)
)

In [None]:
time_test = time_test.drop(
    columns=[
        "ap",
    ]
).rename(columns={"source": "ap"})

In [None]:
# Certain appropriation(s) are filtered out:
# time_test = time_test[~time_test.appropriation.isin(unwanted_appropriations)]

In [None]:
# time_test.loc[time_test['appropriation'] == '22030']

In [None]:
timeline_df = timeline_df[timeline_df["ap"].isin([4, 7, 11])]

In [None]:
timeline_df["ap"].value_counts()

In [None]:
timeline_df_cols = [
    "ps_alloc",
    "ps_exp",
    "ps_bal",
    "ps_projection",
    "oe_alloc",
    "oe_enc",
    "oe_exp",
    "oe_projection",
]

In [None]:
for i in timeline_df_cols:
    print("\n" + i)
    print(timeline_df[i].sum())

In [None]:
timeline_test_cols = [
    "ps_allocation",
    "ps_expenditure",
    "ps_balance",
    "ps_projection",
    "oe_allocation",
    "oe_encumbrance",
    "oe_expenditure",
    "oe_projection",
]

In [None]:
for i in timeline_test_cols:
    print("\n" + i)
    print(time_test[i].sum())

## PSOE Timeline

In [None]:
# PS columns passed to create_psoe_timeline as its ps_list
psoe_ps_cols = [
    "appr_catg",
    "fund",
    "fund_description",
    "appropriation",
    "pec_class",
    "division",
    "ps_allocation",
    "ps_expenditure",
    "ps_balance",
    "ps_projection",
    "ps_%_expended",
    "ap",
    "pec_class_description",
]

# OE columns passed to create_psoe_timeline as its oe_list
psoe_oe_cols = [
    "appr_catg",
    "fund",
    "fund_description",
    "appropriation",
    "pec_class",
    "division",
    "oe_allocation",
    "oe_encumbrance",
    "oe_expenditure",
    "oe_balance",
    "oe_projection",
    "oe_%_expended",
    "ap",
    "pec_class_description",
]

# Final column order used by create_psoe_timeline; names are the ones left
# after the ps_/oe_ prefix is stripped and "expenditure" is renamed to "expense"
psoe_right_col_order = [
    "appr_catg",
    "fund",
    "fund_description",
    "appropriation",
    "division",
    "pec_class",
    "pec_class_description",
    "allocation",
    "expense",
    "balance",
    "projection",
    "%_expended",
    "ap",
    "type",
    "encumbrance",
]

In [None]:
def create_psoe_timeline(df, ps_list: list, oe_list: list):
    """Build the PSOE timeline sheet by stacking the PS and OE views of df.

    Args:
        df: stacked dataframe covering the accounting periods.
        ps_list: a list of all the ps related columns.
        oe_list: a list of all the oe related columns.

    Returns:
        A dataframe with the PS rows followed by the OE rows, "expenditure"
        renamed to "expense", and columns arranged as in the module-level
        psoe_right_col_order. A column that exists in only one of the two
        subsets is NaN for the rows of the other subset.
    """
    # Subset on copies so the cleaning step never writes into a slice of df
    psoe_oe = cleaning_psoe_tpsoe(df[oe_list].copy(), "oe")
    psoe_ps = cleaning_psoe_tpsoe(df[ps_list].copy(), "ps")

    # Stack both dataframes on top of each other; renumber the rows so that
    # PS and OE rows do not share index labels
    c1 = pd.concat([psoe_ps, psoe_oe], sort=False, ignore_index=True)

    # Rename column to mimic William's
    c1 = c1.rename(columns={"expenditure": "expense"})

    # Rearrange the dataframe in the right order
    c1 = c1[psoe_right_col_order]

    return c1

In [None]:
pose_test = create_psoe_timeline(time_test, psoe_ps_cols, psoe_oe_cols)

In [None]:
pose_test = pose_test.loc[pose_test["appropriation"] != "22102F"]

In [None]:
pose_test.shape

In [None]:
pose_test.head(2)

In [None]:
# time_test[['ap','appr_catg','appropriation','pec_class_description','oe_allocation','oe_projection','ap']].loc[time_test["pec_class_description"] == "Aeronautics"]

In [None]:
# test_ps.loc[test_ps["pec_class_description"] == "Aeronautics"]

### Double checking

In [None]:
psoe_df = psoe_df[psoe_df["ap"].isin([4, 7, 11])]

In [None]:
aero_og = psoe_df.loc[psoe_df["division"] == "Aeronautics"]

In [None]:
aero_og

In [None]:
aero_test = pose_test.loc[pose_test["division"] == "Aeronautics"]

In [None]:
aero_test

In [None]:
# aero_test

In [None]:
test = set(pose_test.appropriation.unique().tolist())
actual = set(psoe_df.appr.unique().tolist())

In [None]:
test - actual

In [None]:
actual - test

In [None]:
cols_to_check = ["expense", "balance", "projection", "encumbrance"]

In [None]:
for i in cols_to_check:
    print("\n" + i)
    print(aero_test[i].sum())
    print(aero_og[i].sum())