# Automate the Excel workbook linked to PMP Dashboard

In [1]:
import geopandas as gpd
import numpy as np
import pandas as pd
from calitp import *
from shared_utils import utils

# Formatting the nb
pd.options.display.max_columns = 100
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)



In [2]:
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/pmp_dashboard/"

## Load in data
### Load manually cleaned df so I can reference 

In [3]:
# DF that William manipulated
FILE_NAME_1 = "PMP Summary Report Data.xlsx"

In [4]:
# Sheets in William's notebook that I want
sheets_list = ["Fund by Division Data", "TPSOE Data", "Timeline Data", "PSOE Timeline"]

In [5]:
dict_df1 = pd.read_excel(f"{GCS_FILE_PATH}{FILE_NAME_1}", sheet_name=sheets_list)

In [6]:
division_df = to_snakecase(dict_df1.get("Fund by Division Data"))
tpsoe_df = to_snakecase(dict_df1.get("TPSOE Data"))
timeline_df = to_snakecase(dict_df1.get("Timeline Data"))
psoe_df = to_snakecase(dict_df1.get("PSOE Timeline"))

### Load in & clean data for each accounting period

In [7]:
div_crosswalks= {
            "State & Fed Mass Trans": "DRMT",
            "Statewide Planning": "DOTP",
            "Research": "DRISI",
            "PSR/PSSR Development": "DOTP",
            "Rail": "DRMT",
            "Planning Administration": "DOTP",
            "Regional Planning": "DOTP",
        }

In [8]:
'''
Each time I receive a new data for each accounting period and 
#load data into function import_raw_data, the dataframe will be 
#appended onto this list
'''
my_clean_dataframes = []

In [9]:
unwanted_appropriations = ["22105"]
    

In [10]:
def import_raw_data(file_name: str, name_of_sheet: str, appropriations_to_filter: list):
    
    '''
    Name_of_sheet: name of Excel tab that contains data
    Appropriations_to_filter: certain appropriations are filtered out
    but this is subject to change. A list of what to filter allows for 
    flexibility.
    '''
    
    # Read in file
    df = pd.read_excel(f"{GCS_FILE_PATH}{file_name}", sheet_name=name_of_sheet)

    # Get rid of the unnecessary header info
    # Stuff like "Enterprise Datalink Production download as of 05/23/2022"
    df = df.iloc[13:].reset_index(drop=True)

    # The first row contains column names
    df.columns = df.iloc[0]

    # Drop the first row as they are now column names
    df = df.drop(df.index[0]).reset_index(drop=True)

    # Drop rows with NA in the certain cols,
    # Since those are probably the grand totals tagged at the end of the Excel sheet
    df = df.dropna(subset=["Appr Catg", "Appr", "PEC Class", "PEC Class Description"])
    
    # Snakecase
    df = to_snakecase(df)
    
    # Rename columns to mimc dashboard
    df = df.rename(
        columns={
            "pec_class_description": "division",
            "ps_alloc": "ps_allocation",
            "ps_exp": "ps_expenditure",
            "ps_bal": "ps_balance",
            "total_projected_%": "total_%_expended",
            "oe_alloc": "oe_allocation",
            "oe_enc": "oe_encumbrance",
            "oe_exp": "oe_expenditure",
            "appr": "appropriation",
            "total_expended___encumbrance": "total_expenditure",
            "oe_bal_excl_pre_enc": "oe_balance",
            "oe__enc_+_oe_exp_projection": "oe_enc_+_oe_exp_projection",
        }
    )
    
    # Certain appropriation(s) are filtered out:
    df = df[
    ~df.appropriation.isin(appropriations_to_filter)]
    
    # Narrow down division names
    df["division"] = df["division"].replace(div_crosswalks)
    
    #Adding dataframe to an empty list
    my_clean_dataframes.append(df)
    
    return df

In [11]:
ap4 = import_raw_data("AP4.xls", "Download", unwanted_appropriations)

In [12]:
ap7 = import_raw_data("AP7.xls", "Download", unwanted_appropriations)

In [13]:
ap11 = import_raw_data( "FY 2122 AP12_Closed_PMP Summary Report.022822_Updated.xlsx", "Raw Data AP12 Closed", unwanted_appropriations)

In [14]:
len(my_clean_dataframes)

3

## Fund by Division Data
* Attempt to mimic William's processed sheet called "division df"
* This dataframe is AP 11 with some stuff filtered

In [16]:
def fund_by_division_data(df):
    # Drop excluded cols
    excluded_cols = ["appr_catg", "act__hours", "py_pos_alloc"]
    df = ap11.drop(columns=excluded_cols)
    
    # Add a blank column for notes
    df["notes"] = np.nan
    
    return df

In [17]:
# Start of script
# Drop excluded cols
excluded_cols = ["appr_catg", "act__hours", "py_pos_alloc"]
division_test = ap11.drop(columns=excluded_cols)

In [18]:
# Add a blank column for notes
division_test["notes"] = np.nan

In [19]:
# Filter out for the appropriations
unwanted_appropriations = ["1850522", "22102", "22105"]

In [20]:
# filter them all out at once.
division_test = division_test[
    ~division_test.appropriation.isin(unwanted_appropriations)
]

### Double checking: William's vs mine

In [21]:
test = set(division_test.appropriation.unique().tolist())
actual = set(division_df.appropriation.unique().tolist())

In [22]:
division_test.division.unique().tolist()

['Local Assistance', 'DRMT', 'DOTP', 'Aeronautics', 'DRISI']

In [23]:
division_df.division.unique().tolist()

['Aeronautics', 'Local Assistance', 'DRISI', 'DRMT', 'DOTP']

In [24]:
actual - test

set()

In [25]:
test - actual

set()

In [26]:
#division_test[['appropriation','ps_allocation']].sort_values(['appropriation', 'ps_allocation'])

In [27]:
#division_df[['appropriation','ps_allocation']].sort_values(['appropriation', 'ps_allocation'])

In [28]:
division_df.loc[division_df["appropriation"] == "22030"]

Unnamed: 0,pec_class,division,fund,fund_description,appropriation,ps_allocation,ps_expenditure,ps_balance,ps_projection,year_end_expendded_pace,ps_%_expended,oe_allocation,oe_encumbrance,oe_expenditure,oe_balance,oe_enc_+_oe_exp_projection,oe_%_expended,total_allocation,total_expenditure,total_balance,total_projection,total_%_expended,notes
3,2030,Local Assistance,1,General Fund,22030,2625000,1265708.14,1359291.86,1380773.0,0.526009,0.482175,27000,1593.74,10119.4,15286.86,12633.085455,0.467892,2652000,1277421.28,1374578.72,1393406.0,0.481682,
14,2041,DRISI,1,General Fund,22030,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,
19,3010,DRMT,1,General Fund,22030,150000,145312.79,4687.21,158523.0,1.05682,0.968752,2000,0.0,0.0,2000.0,0.0,0.0,152000,145312.79,6687.21,158523.0,0.956005,
28,3020,DRMT,1,General Fund,22030,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,
36,4010,DOTP,1,General Fund,22030,150000,166142.55,-16142.55,181246.4,1.208309,1.107617,2000,0.0,0.0,2000.0,0.0,0.0,152000,166142.55,-14142.55,181246.4,1.093043,


In [29]:
division_test.loc[division_test["appropriation"] == "22030"]

Unnamed: 0,fund,fund_description,appropriation,pec_class,division,ps_allocation,ps_expenditure,ps_balance,ps_projection,year_end_expendded_pace,ps_%_expended,oe_allocation,oe_encumbrance,oe_expenditure,oe_balance,oe_enc_+_oe_exp_projection,oe_%_expended,total_allocation,total_expenditure,total_balance,total_projection,total_%_expended,notes
1,1,General Fund,22030,2030,Local Assistance,2625000,1405981.81,1219018.19,1405981.81,0.535612,0.535612,27000,1593.74,10119.4,15286.86,11713.14,0.43382,2652000,1417694.95,1234305.05,1417694.95,0.534576,
2,1,General Fund,22030,3010,DRMT,150000,149442.4,557.6,149442.4,0.996283,0.996283,2000,0.0,0.0,2000.0,0.0,0.0,152000,149442.4,2557.6,149442.4,0.983174,
3,1,General Fund,22030,4010,DOTP,150000,153219.06,-3219.06,153219.06,1.02146,1.02146,2000,0.0,0.0,2000.0,0.0,0.0,152000,153219.06,-1219.06,153219.06,1.00802,


In [30]:
testing_cols = [
    "ps_allocation",
    "ps_expenditure",
    "ps_balance",
    "ps_projection",
    "oe_allocation",
    "oe_encumbrance",
    "oe_expenditure",
    "total_allocation",
    "total_expenditure",
    "total_balance",
]

In [31]:
for i in testing_cols:
    print("\n" + i)
    print(division_df[i].sum())
    print(division_test[i].sum())


ps_allocation
230928000
230928000

ps_expenditure
183147623.15
201928225.22999996

ps_balance
47780376.85
28999774.77

ps_projection
199797407.0727273
201928225.22999996

oe_allocation
232735000
232735000

oe_encumbrance
104147405.69
170029484.92000005

oe_expenditure
33504730.379999995
38693682.68

total_allocation
463663000
463663000

total_expenditure
320799759.2200001
410651392.8300001

total_balance
142863240.78
53011607.16999999


In [32]:
division_df.info(), division_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 23 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   pec_class                   50 non-null     int64  
 1   division                    50 non-null     object 
 2   fund                        50 non-null     int64  
 3   fund_description            50 non-null     object 
 4   appropriation               50 non-null     object 
 5   ps_allocation               50 non-null     int64  
 6   ps_expenditure              50 non-null     float64
 7   ps_balance                  50 non-null     float64
 8   ps_projection               50 non-null     float64
 9   year_end_expendded_pace     50 non-null     float64
 10  ps_%_expended               50 non-null     float64
 11  oe_allocation               50 non-null     int64  
 12  oe_encumbrance              50 non-null     float64
 13  oe_expenditure              50 non-nu

(None, None)

## TPSOE Data

In [33]:
# Reference point
tpsoe_df.loc[tpsoe_df["division"] == "Aeronautics"]

Unnamed: 0,pec_class,division,fund,fund_description,appropriation,type,allocation,expenditure,balance,encumbrance,projection,year_end_expendded_pace,%_expended,notes
0,1000,Aeronautics,41,Aeronautics Account STF,22001,PS,3742000,2668503.64,1073496.36,,2911095.0,0.777952,0.713122,
1,1000,Aeronautics,41,Aeronautics Account STF,22001R,PS,59000,26017.94,32982.06,,28383.21,0.481071,0.440982,
2,1000,Aeronautics,890,Federal Trust Fund,22001F,PS,89000,0.0,89000.0,,0.0,0.0,0.0,
50,1000,Aeronautics,41,Aeronautics Account STF,22001,OE,487000,119766.31,256770.32,110463.37,241117.5,,0.495108,
51,1000,Aeronautics,41,Aeronautics Account STF,22001R,OE,0,13866.74,-13866.74,0.0,15127.35,,0.0,
52,1000,Aeronautics,890,Federal Trust Fund,22001F,OE,370000,3000.0,367000.0,0.0,3272.727,,0.008845,


In [34]:
ap11.loc[ap11["division"] == "Aeronautics"]

Unnamed: 0,appr_catg,fund,fund_description,appropriation,pec_class,division,ps_allocation,ps_expenditure,ps_balance,ps_projection,year_end_expendded_pace,ps_%_expended,py_pos_alloc,act__hours,oe_allocation,oe_encumbrance,oe_expenditure,oe_balance,oe_enc_+_oe_exp_projection,oe_%_expended,total_allocation,total_expenditure,total_balance,total_projection,total_%_expended
4,2122,41,Aeronautics Account STF,22001R,1000,Aeronautics,59000,30208.7,28791.3,30208.7,0.512012,0.512012,0,346.25,0,0.0,17273.38,-17273.38,17273.38,0%,59000,47482.08,11517.92,47482.08,0.804781
5,2122,41,Aeronautics Account STF,22001,1000,Aeronautics,3742000,2929497.33,812502.67,2929497.33,0.782869,0.782869,0,32931.25,487000,110522.21,136086.94,240390.85,246609.15,0.506384,4229000,3176106.48,1052893.52,3176106.48,0.75103
31,2122,890,Federal Trust Fund,22001F,1000,Aeronautics,89000,0.0,89000.0,0.0,0.0,0.0,0,0.0,370000,0.0,3000.0,367000.0,3000.0,0.008108,459000,3000.0,456000.0,3000.0,0.006536


In [35]:
# Cols: for PS
tpsoe_ps_list = [
    "fund",
    "fund_description",
    "appropriation",
    "pec_class",
    "division",
    "ps_allocation",
    "ps_expenditure",
    "ps_balance",
    "ps_projection",
    "year_end_expendded_pace",
    "ps_%_expended",
]

In [36]:
# Cols for oE
tpsoe_oe_list = [
    "fund",
    "fund_description",
    "appropriation",
    "pec_class",
    "division",
    "oe_allocation",
    "oe_encumbrance",
    "oe_expenditure",
    "oe_balance",
    "oe_enc_+_oe_exp_projection",
]

In [37]:
# Subset df with PS only vars
tpsoe_ps = ap11[tpsoe_ps_list]

In [38]:
# Subset df with PS only vars
tpsoe_oe = ap11[tpsoe_oe_list]

In [39]:
def cleaning_tpsoe(df, ps_or_oe: str):
    # Fill in the column type for either PS: personal services
    # or OE: operating expense
    df["type"] = ps_or_oe

    # Strip away the prefixes from column names
    # https://stackoverflow.com/questions/54097284/removing-suffix-from-dataframe-column-names-python

    # Create suffix
    suffix = f"{ps_or_oe}_"
    df.columns = df.columns.str.replace(suffix, "", regex=True)

    # There is a enc_+_exp_projection for OE: try and except to rename to projection
    # To match PS
    try:
        df = df.rename(columns={"enc_+_exp_projection": "projection"})
    except:
        pass
    
    #Filter out the unwanted appropriations
    df = df[
    ~df.appropriation.isin(unwanted_appropriations)]
    return df

In [40]:
#Apply functions to subsets of OE and PS
tpsoe_oe = cleaning_tpsoe(tpsoe_oe, "oe")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [41]:
tpsoe_ps = cleaning_tpsoe(tpsoe_ps, "ps")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### Start of the 2nd function

In [42]:
#Concat the dataframe
c1 = pd.concat([tpsoe_ps, tpsoe_oe], sort=False)

In [43]:
order_of_cols = [
    "pec_class",
    "division",
    "fund",
    "fund_description",
    "appropriation",
    "type",
    "allocation",
    "expenditure",
    "balance",
    "encumbrance",
    "projection",
    "year_end_expendded_pace",
    "%_expended",
]

In [44]:
#Reorder df to the right order
c1 = c1[order_of_cols]

In [45]:
#Add a notes column
c1["notes"] = np.nan

In [46]:
#Monetary cols to coerce into floats
monetary_cols = [
    "allocation",
    "expenditure",
    "balance",
    "encumbrance",
    "projection",
    "year_end_expendded_pace",
    "%_expended",
]

In [47]:
c1[monetary_cols] = c1[monetary_cols].astype("float64")

### Double checking

In [48]:
c1.shape

(96, 14)

In [49]:
tpsoe_df.shape

(100, 14)

In [50]:
#c1[['division', 'type','pec_class', 'allocation']].sort_values(['pec_class', 'allocation'])

In [51]:
# tpsoe_df[['division','type','pec_class', 'allocation']].sort_values(['pec_class', 'allocation'])

In [52]:
for i in monetary_cols:
    print("\n" + i)
    print(c1[i].sum())
    print(tpsoe_df[i].sum())


allocation
463663000.0
463663000

expenditure
240621907.91
216652353.52999997

balance
53011607.16999998
142863240.78

encumbrance
170029484.92000002
104147405.69

projection
410607069.0880165
340495427.72272724

year_end_expendded_pace
26.69852036355792
26.708515918639606

%_expended
26.69852036355792
37.17215507087656


In [53]:
test = set(c1.appropriation.unique().tolist())
actual = set(tpsoe_df.appropriation.unique().tolist())

In [54]:
actual - test

set()

In [55]:
test - actual

set()

## Timeline 

In [66]:
#How to assign keys automatically??
#https://stackoverflow.com/questions/59267129/how-to-concatenate-multiple-dataframes-from-multiple-sources-in-pandas
time_test = df = (pd.concat(my_clean_dataframes, keys=(4, 7, 11))
        .rename_axis(('source', 'tmp'))
        .reset_index(level=0)
        .reset_index(drop=True))

In [67]:
time_test['source'].value_counts()

7     59
4     58
11    54
Name: source, dtype: int64

In [72]:
time_test = (time_test
             .drop(columns = ['ap'])
             .rename(columns = {'source':'ap'})
            )
                

KeyError: "['ap'] not found in axis"

In [81]:
time_test.shape

(171, 27)

In [85]:
timeline_df = timeline_df[timeline_df["ap"].isin([4,7,11])]

In [86]:
timeline_df['ap'].value_counts()

4     50
7     50
11    48
Name: ap, dtype: int64

## PSOE Timeline

In [80]:
psoe_df.head(3)

Unnamed: 0,appr_catg,fund,fund_description,appr,division,pec_class,pec_class_description,allocation,expense,balance,projection,%_expended,ap,type,encumbrance
0,2122,41,Aeronautics Account STF,22001,Aeronautics,1000,Aeronautics,0,259497.99,-259497.99,3113975.88,0,1,PS,
1,2122,42,"Highway Account, State, STF",22001R,DOTP,4050,PSR/PSSR Development,0,242438.21,-242438.21,2909258.52,0,1,PS,
2,2122,42,"Highway Account, State, STF",22001,Local Assistance,2030,Local Assistance,0,3496261.2,-3496261.2,41955134.4,0,1,PS,


In [None]:
psoe_ps_cols = ['appr_catg', 'fund', 'fund_description', 'appropriation',
                

In [89]:
ap11.columns

Index(['appr_catg', 'fund', 'fund_description', 'appropriation', 'pec_class',
       'division', 'ps_allocation', 'ps_expenditure', 'ps_balance',
       'ps_projection', 'year_end_expendded_pace', 'ps_%_expended',
       'py_pos_alloc', 'act__hours', 'oe_allocation', 'oe_encumbrance',
       'oe_expenditure', 'oe_balance', 'oe_enc_+_oe_exp_projection',
       'oe_%_expended', 'total_allocation', 'total_expenditure',
       'total_balance', 'total_projection', 'total_%_expended'],
      dtype='object', name=0)