# Automate the Excel workbook linked to PMP Dashboard

In [1]:
import geopandas as gpd
import numpy as np
import pandas as pd
from calitp import *
from shared_utils import utils

# Formatting the nb
pd.options.display.max_columns = 100
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)



In [2]:
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/pmp_dashboard/"

## Load in data
### Load manually cleaned df so I can reference 

In [3]:
# DF that William manipulated
FILE_NAME_1 = "PMP Summary Report Data.xlsx"

In [4]:
# Sheets in William's notebook that I want
sheets_list = ["Fund by Division Data", "TPSOE Data", "Timeline Data", "PSOE Timeline"]

In [5]:
dict_df1 = pd.read_excel(f"{GCS_FILE_PATH}{FILE_NAME_1}", sheet_name=sheets_list)

In [6]:
division_df = to_snakecase(dict_df1.get("Fund by Division Data"))
tpsoe_df = to_snakecase(dict_df1.get("TPSOE Data"))
timeline_df = to_snakecase(dict_df1.get("Timeline Data"))
psoe_df = to_snakecase(dict_df1.get("PSOE Timeline"))

### Load in & clean data for each accounting period

In [7]:
div_crosswalks = {
    "State & Fed Mass Trans": "DRMT",
    "Statewide Planning": "DOTP",
    "Research": "DRISI",
    "PSR/PSSR Development": "DOTP",
    "Rail": "DRMT",
    "Planning Administration": "DOTP",
    "Regional Planning": "DOTP",
}

In [8]:
"""
Each time I receive new data for each accounting period and 
load data into function import_raw_data, the dataframe will be 
appended onto this list
"""
my_clean_dataframes = []

In [9]:
unwanted_appropriations = ["1850522", "22102", "22105"]

In [10]:
def import_raw_data(file_name: str, name_of_sheet: str, appropriations_to_filter: list):

    """
    Name_of_sheet: name of Excel tab that contains data
    Appropriations_to_filter: certain appropriations are filtered out
    but this is subject to change. A list of what to filter allows for
    flexibility.
    """
    df = pd.read_excel(f"{GCS_FILE_PATH}{file_name}", sheet_name=name_of_sheet)

    # Get rid of the unnecessary header info
    # Stuff like "Enterprise Datalink Production download as of 05/23/2022"
    df = df.iloc[13:].reset_index(drop=True)

    # The first row contains column names
    df.columns = df.iloc[0]

    # Drop the first row as they are now column names
    df = df.drop(df.index[0]).reset_index(drop=True)

    # Drop rows with NA in the certain cols,
    # Since those are probably the grand totals tagged at the end of the Excel sheet
    df = df.dropna(subset=["PEC Class"])

    # Snakecase
    df = to_snakecase(df)

    # Rename columns to mimc dashboard
    df = df.rename(
        columns={
            "ps_alloc": "ps_allocation",
            "ps_exp": "ps_expenditure",
            "ps_bal": "ps_balance",
            "total_projected_%": "total_%_expended",
            "oe_alloc": "oe_allocation",
            "oe_enc": "oe_encumbrance",
            "oe_exp": "oe_expenditure",
            "appr": "appropriation",
            "total_expended___encumbrance": "total_expenditure",
            "oe_bal_excl_pre_enc": "oe_balance",
            "oe__enc_+_oe_exp_projection": "oe_enc_+_oe_exp_projection",
        }
    )

    # Certain appropriation(s) are filtered out:
    df = df[~df.appropriation.isin(appropriations_to_filter)]

    # Narrow down division names
    df["division"] = df["pec_class_description"].replace(div_crosswalks)

    # Adding dataframe to an empty list
    my_clean_dataframes.append(df)

    return df

In [11]:
ap4 = import_raw_data("AP4.xls", "Download", unwanted_appropriations)

In [12]:
ap7 = import_raw_data("AP7.xls", "Download", unwanted_appropriations)

In [13]:
ap11 = import_raw_data(
    "FY 2122 AP11_Closed_PMP Summary Report.022822_Updated.xlsx",
    "Raw Data AP11 Closed",
    unwanted_appropriations,
)

In [14]:
ap7.head(2)

Unnamed: 0,appr_catg,fund,fund_description,appropriation,pec_class,pec_class_description,ps_allocation,ps_expenditure,ps_balance,ps_projection,ps_%_expended,py_pos_alloc,act__hours,oe_allocation,oe_encumbrance,oe_expenditure,oe_balance,oe_projection,oe_%_expended,total_allocation,total_expenditure,total_balance,total_projection,total_%_expended,ap,division
0,2122,1,General Fund,22002,2030,Local Assistance,1500000,0.0,1500000.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0%,1500000,0.0,1500000.0,0.0,0.0,7,Local Assistance
1,2122,1,General Fund,22030,2030,Local Assistance,4001000,448244.24,3552755.76,768418.697143,0.112033,0,4746,237000,9857.31,152.84,226989.85,262.011429,0.000645,4238000,458254.39,3779745.61,768680.708571,0.10813,7,Local Assistance


In [15]:
ap7["total_balance"].sum()

43522270.84000003

In [16]:
ap4["total_balance"].sum()

196907374.42000002

In [17]:
ap4["appropriation"].unique()

array(['22002', '22030', '22001R', '22001', '22008', '22102F', '22001F',
       '22004'], dtype=object)

In [18]:
ap7["appropriation"].unique()

array(['22002', '22030', '22001R', '22001', '22008', '22102F', '22001F',
       '22004'], dtype=object)

In [19]:
# ap12[['fund','appropriation','pec_class','ps_allocation']].sort_values(['ps_allocation',])

In [20]:
len(my_clean_dataframes)

3

## Fund by Division Data
* Attempt to mimic William's processed sheet called "division df"
* This dataframe is AP 11 with some stuff filtered

In [21]:
def create_fund_by_division(df):
    # Drop excluded cols
    excluded_cols = ["appr_catg", "act__hours", "py_pos_alloc"]
    df = ap11.drop(columns=excluded_cols)

    # Add a blank column for notes
    df["notes"] = np.nan

    return df

In [22]:
# Start of script
# Drop excluded cols
excluded_cols = ["appr_catg", "act__hours", "py_pos_alloc"]
division_test = ap11.drop(columns=excluded_cols)

In [23]:
# Add a blank column for notes
division_test["notes"] = np.nan

### Double checking: William's vs mine

In [24]:
test = set(division_test.appropriation.unique().tolist())
actual = set(division_df.appropriation.unique().tolist())

In [25]:
actual - test

set()

In [26]:
test - actual

set()

In [27]:
# division_test[['appropriation','ps_allocation']].sort_values(['appropriation', 'ps_allocation'])

In [28]:
# division_df[['appropriation','ps_allocation']].sort_values(['appropriation', 'ps_allocation'])

In [29]:
division_df.loc[division_df["appropriation"] == "22030"]

Unnamed: 0,pec_class,division,fund,fund_description,appropriation,ps_allocation,ps_expenditure,ps_balance,ps_projection,year_end_expendded_pace,ps_%_expended,oe_allocation,oe_encumbrance,oe_expenditure,oe_balance,oe_enc_+_oe_exp_projection,oe_%_expended,total_allocation,total_expenditure,total_balance,total_projection,total_%_expended,notes
3,2030,Local Assistance,1,General Fund,22030,2625000,1265708.14,1359291.86,1380773.0,0.526009,0.482175,27000,1593.74,10119.4,15286.86,12633.085455,0.467892,2652000,1277421.28,1374578.72,1393406.0,0.481682,
14,2041,DRISI,1,General Fund,22030,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,
19,3010,DRMT,1,General Fund,22030,150000,145312.79,4687.21,158523.0,1.05682,0.968752,2000,0.0,0.0,2000.0,0.0,0.0,152000,145312.79,6687.21,158523.0,0.956005,
28,3020,DRMT,1,General Fund,22030,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,
36,4010,DOTP,1,General Fund,22030,150000,166142.55,-16142.55,181246.4,1.208309,1.107617,2000,0.0,0.0,2000.0,0.0,0.0,152000,166142.55,-14142.55,181246.4,1.093043,


In [30]:
division_test.loc[division_test["appropriation"] == "22030"]

Unnamed: 0,fund,fund_description,appropriation,pec_class,pec_class_description,ps_allocation,ps_expenditure,ps_balance,ps_projection,year_end_expendded_pace,ps_%_expended,oe_allocation,oe_encumbrance,oe_expenditure,oe_balance,oe_enc_+_oe_exp_projection,oe_%_expended,total_allocation,total_expenditure,total_balance,total_projection,total_%_expended,division,notes
1,1,General Fund,22030,2030,Local Assistance,2625000,1265708.14,1359291.86,1380772.516364,0.526009,0.482175,27000,1593.74,10119.4,15286.86,12633.085455,0.467892,2652000,1277421.28,1374578.72,1393405.601818,0.481682,Local Assistance,
2,1,General Fund,22030,3010,State & Fed Mass Trans,150000,145312.79,4687.21,158523.043636,1.05682,0.968752,2000,0.0,0.0,2000.0,0.0,0.0,152000,145312.79,6687.21,158523.043636,0.956005,DRMT,
3,1,General Fund,22030,4010,Statewide Planning,150000,166142.55,-16142.55,181246.418182,1.208309,1.107617,2000,0.0,0.0,2000.0,0.0,0.0,152000,166142.55,-14142.55,181246.418182,1.093043,DOTP,


In [31]:
testing_cols = [
    "ps_allocation",
    "ps_expenditure",
    "ps_balance",
    "ps_projection",
    "oe_allocation",
    "oe_encumbrance",
    "oe_expenditure",
    "total_allocation",
    "total_expenditure",
    "total_balance",
]

In [32]:
for i in testing_cols:
    print("\n" + i)
    print(division_df[i].sum())
    print(division_test[i].sum())


ps_allocation
230928000
230928000

ps_expenditure
183147623.15
183147623.15000004

ps_balance
47780376.85
47780376.85

ps_projection
199797407.0727273
199797407.07272723

oe_allocation
232735000
232735000

oe_encumbrance
104147405.69
104147405.69000001

oe_expenditure
33504730.379999995
33504730.38

total_allocation
463663000
463663000

total_expenditure
320799759.2200001
320799759.22

total_balance
142863240.78
142863240.77999997


In [33]:
division_df.info(), division_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 23 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   pec_class                   50 non-null     int64  
 1   division                    50 non-null     object 
 2   fund                        50 non-null     int64  
 3   fund_description            50 non-null     object 
 4   appropriation               50 non-null     object 
 5   ps_allocation               50 non-null     int64  
 6   ps_expenditure              50 non-null     float64
 7   ps_balance                  50 non-null     float64
 8   ps_projection               50 non-null     float64
 9   year_end_expendded_pace     50 non-null     float64
 10  ps_%_expended               50 non-null     float64
 11  oe_allocation               50 non-null     int64  
 12  oe_encumbrance              50 non-null     float64
 13  oe_expenditure              50 non-nu

(None, None)

## TPSOE Data

In [34]:
# Reference point
# tpsoe_df.loc[tpsoe_df["division"] == "Aeronautics"]

In [35]:
# ap11.loc[ap11["division"] == "Aeronautics"]

In [36]:
# Cols: for PS
tpsoe_ps_list = [
    "fund",
    "fund_description",
    "appropriation",
    "pec_class",
    "division",
    "ps_allocation",
    "ps_expenditure",
    "ps_balance",
    "ps_projection",
    "year_end_expendded_pace",
    "ps_%_expended",
]

In [37]:
# Cols for OE
tpsoe_oe_list = [
    "fund",
    "fund_description",
    "appropriation",
    "pec_class",
    "division",
    "oe_allocation",
    "oe_encumbrance",
    "oe_expenditure",
    "oe_balance",
    "oe_enc_+_oe_exp_projection",
]

### Function to clean

In [38]:
def cleaning_psoe_tpsoe(df, ps_or_oe: str):
    # Fill in the column type for either PS: personal services
    # or OE: operating expense
    df["type"] = ps_or_oe

    # Strip away the prefixes from column names
    # https://stackoverflow.com/questions/54097284/removing-suffix-from-dataframe-column-names-python

    # Create suffix
    suffix = f"{ps_or_oe}_"
    df.columns = df.columns.str.replace(suffix, "", regex=True)

    # There is a enc_+_exp_projection for OE: try and except to rename to projection
    # To match PS
    try:
        df = df.rename(columns={"enc_+_exp_projection": "projection"})
    except:
        pass

    return df

### Function to create sheet

In [42]:
def create_tpsoe(df, ps_list: list, oe_list: list):
    tpsoe_oe = cleaning_psoe_tpsoe(df[oe_list], "oe")
    tpsoe_ps = cleaning_psoe_tpsoe(df[ps_list], "ps")
    c1 = pd.concat([tpsoe_ps, tpsoe_oe], sort=False)
    order_of_cols = [
        "pec_class",
        "division",
        "fund",
        "fund_description",
        "appropriation",
        "type",
        "allocation",
        "expenditure",
        "balance",
        "encumbrance",
        "projection",
        "year_end_expendded_pace",
        "%_expended",
    ]
    c1 = c1[order_of_cols]

    # Add a notes column
    c1["notes"] = np.nan

    # Monetary cols to coerce into floats
    monetary_cols = [
        "allocation",
        "expenditure",
        "balance",
        "encumbrance",
        "projection",
        "year_end_expendded_pace",
        "%_expended",
    ]

    c1[monetary_cols] = c1[monetary_cols].astype("float64")

    return c1

In [43]:
c1 = create_tpsoe(ap11, tpsoe_ps_list, tpsoe_oe_list)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [49]:
monetary_cols = [
    "allocation",
    "expenditure",
    "balance",
    "encumbrance",
    "projection",
    "year_end_expendded_pace",
    "%_expended",
]

### Double checking

In [50]:
c1.shape

(96, 14)

In [51]:
tpsoe_df.shape

(100, 14)

In [52]:
# c1[['division', 'type','pec_class', 'allocation']].sort_values(['pec_class', 'allocation'])

In [53]:
# tpsoe_df[['division','type','pec_class', 'allocation']].sort_values(['pec_class', 'allocation'])

In [54]:
for i in monetary_cols:
    print("\n" + i)
    print(c1[i].sum())
    print(tpsoe_df[i].sum())


allocation
463663000.0
463663000

expenditure
216652353.53000003
216652353.52999997

balance
142863240.78
142863240.78

encumbrance
104147405.69
104147405.69

projection
340495427.72272724
340495427.72272724

year_end_expendded_pace
26.708515918639606
26.708515918639606

%_expended
24.482806258752973
37.17215507087656


In [55]:
test = set(c1.appropriation.unique().tolist())
actual = set(tpsoe_df.appropriation.unique().tolist())

In [56]:
actual - test

set()

In [57]:
test - actual

set()

## Timeline 

In [58]:
test_tuple = (4, 7, 11)

In [59]:
# How to assign keys automatically??
# https://stackoverflow.com/questions/59267129/how-to-concatenate-multiple-dataframes-from-multiple-sources-in-pandas
time_test = (
    pd.concat(my_clean_dataframes, keys=test_tuple)
    .rename_axis(("source", "tmp"))
    .reset_index(level=0)
    .reset_index(drop=True)
)

In [60]:
time_test = time_test.drop(
    columns=[
        "ap",
    ]
).rename(columns={"source": "ap"})

In [61]:
# Certain appropriation(s) are filtered out:
# time_test = time_test[~time_test.appropriation.isin(unwanted_appropriations)]

In [62]:
# time_test.loc[time_test['appropriation'] == '22030']

In [63]:
time_test["ap"].value_counts()

7     52
4     51
11    48
Name: ap, dtype: int64

In [64]:
timeline_df = timeline_df[timeline_df["ap"].isin([4, 7, 11])]

In [65]:
timeline_df["ap"].value_counts()

4     50
7     50
11    48
Name: ap, dtype: int64

In [66]:
timeline_df.shape

(148, 26)

In [67]:
time_test["ps_allocation"].sum()

694532000

In [68]:
time_test["total_balance"].sum()

383292886.0400002

In [69]:
timeline_df["ps_alloc"].sum()

694532000

In [70]:
timeline_df["total_balance"].sum()

628609959.72

In [71]:
time_test.columns

Index(['ap', 'appr_catg', 'fund', 'fund_description', 'appropriation',
       'pec_class', 'pec_class_description', 'ps_allocation', 'ps_expenditure',
       'ps_balance', 'ps_projection', 'ps_%_expended', 'py_pos_alloc',
       'act__hours', 'oe_allocation', 'oe_encumbrance', 'oe_expenditure',
       'oe_balance', 'oe_projection', 'oe_%_expended', 'total_allocation',
       'total_expenditure', 'total_balance', 'total_projection',
       'total_%_expended', 'division', 'year_end_expendded_pace',
       'oe_enc_+_oe_exp_projection'],
      dtype='object', name=0)

## PSOE Timeline

In [72]:
psoe_ps_cols = [
    "appr_catg",
    "fund",
    "fund_description",
    "appropriation",
    "pec_class",
    "division",
    "ps_allocation",
    "ps_expenditure",
    "ps_balance",
    "ps_projection",
    "ps_%_expended",
    "ap",
    "pec_class_description",
]

In [73]:
psoe_os_cols = [
    "appr_catg",
    "fund",
    "fund_description",
    "appropriation",
    "pec_class",
    "division",
    "oe_allocation",
    "oe_encumbrance",
    "oe_expenditure",
    "oe_balance",
    "oe_enc_+_oe_exp_projection",
    "oe_%_expended",
    "ap",
    "pec_class_description",
]

In [77]:
def create_psoe_timeline(df, ps_list: list, oe_list: list):
    psoe_oe = cleaning_psoe_tpsoe(df[oe_list], "oe")
    psoe_ps = cleaning_psoe_tpsoe(df[ps_list], "ps")

    c1 = pd.concat([psoe_ps, psoe_oe], sort=False)

    c1 = c1.rename(columns={"expenditure": "expense"})
    psoe_right_col_order = [
        "appr_catg",
        "fund",
        "fund_description",
        "appropriation",
        "division",
        "pec_class",
        "pec_class_description",
        "allocation",
        "expense",
        "balance",
        "projection",
        "%_expended",
        "ap",
        "type",
        "encumbrance",
    ]
    c1 = c1[psoe_right_col_order]

    # Add a notes column
    c1["notes"] = np.nan

    return c1

In [81]:
pose_test = create_psoe_timeline(time_test, psoe_ps_cols, psoe_os_cols)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [82]:
pose_test = pose_test.loc[pose_test["appropriation"] != "22102F"]

In [83]:
pose_test.shape

(296, 16)

### Double checking

In [84]:
psoe_df = psoe_df[psoe_df["ap"].isin([4, 7, 11])]

In [85]:
psoe_df.shape

(296, 15)

In [86]:
psoe_df.columns

Index(['appr_catg', 'fund', 'fund_description', 'appr', 'division',
       'pec_class', 'pec_class_description', 'allocation', 'expense',
       'balance', 'projection', '%_expended', 'ap', 'type', 'encumbrance'],
      dtype='object')

In [87]:
psoe_df.head(3)

Unnamed: 0,appr_catg,fund,fund_description,appr,division,pec_class,pec_class_description,allocation,expense,balance,projection,%_expended,ap,type,encumbrance
127,2122,1,General Fund,22002,Local Assistance,2030,Local Assistance,1500000,0.0,1500000.0,0.0,0.0,4,PS,
128,2122,1,General Fund,22030,Local Assistance,2030,Local Assistance,4001000,135511.93,3865488.07,406535.79,0.03387,4,PS,
129,2122,1,General Fund,22030,DRISI,2041,Research,210000,0.0,210000.0,0.0,0.0,4,PS,


In [88]:
aero_og = psoe_df.loc[psoe_df["division"] == "Aeronautics"]

In [89]:
aero_og.shape

(18, 15)

In [90]:
aero_test = pose_test.loc[pose_test["division"] == "Aeronautics"]

In [91]:
aero_test.shape

(18, 16)

In [92]:
test = set(pose_test.appropriation.unique().tolist())
actual = set(psoe_df.appr.unique().tolist())

In [93]:
test - actual

set()

In [94]:
actual - test

set()

In [95]:
cols_to_check = ["expense", "balance", "projection", "encumbrance"]

In [96]:
for i in cols_to_check:
    print("\n" + i)
    print(psoe_df[i].sum())
    print(pose_test[i].sum())


expense
426608887.54
426608887.5399998

balance
628609959.72
628609959.72

projection
801899611.3598702
731347906.9298706

encumbrance
340648152.74
340648152.74000007
