# Expenses in Highlands Sheet are incorrect
* Using expenses from Data link to fill in expenditures for March 25 presentation


In [1]:
import numpy as np
import pandas as pd
import TIRCP_functions
from siuba import *
from calitp import *

GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/tircp/"

pd.options.display.max_columns = 50
pd.options.display.max_rows = 120
pd.set_option('display.max_colwidth', None)

E0314 21:34:51.905357170    1933 fork_posix.cc:70]           Fork support is only compatible with the epoll1 and poll polling strategies
E0314 21:34:52.460633708    1933 fork_posix.cc:70]           Fork support is only compatible with the epoll1 and poll polling strategies


## Expenditure information from the Enterprise Data Link data Linda pulled.

In [2]:
enterprise = "Enterprise.xlsx"
enterprise_df = pd.read_excel(f"{GCS_FILE_PATH}{enterprise}")
enterprise_df = to_snakecase(enterprise_df)

In [69]:
enterprise_df.head(2)

Unnamed: 0,fy,cycle,project_id,project_name,tot_exp
0,2016,One,16000007,Refurbishment of Seven Light Rail Vehicles,2182302.44
1,2016,One,16000008,South Bay Bus Rapid Transit (BRT) Project,30770.46


In [4]:
enterprise_df = enterprise_df.rename(columns = {'project':'project_id'})

In [5]:
#keep only columns necessary.
enterprise_df = enterprise_df[['fy','cycle','project_id','project_name','tot_exp']]

In [6]:
#Looking at IDS by cycles.
enterprise_df.cycle.value_counts()

Three    152
One       47
Two       47
Four       8
Name: cycle, dtype: int64

In [7]:
enterprise_df.shape

(254, 5)

#### Aggregate by project ID and each cycle, so each project has only one record for total expenses

In [8]:
enterprise_df_project_id = enterprise_df.groupby(['project_id','cycle']).agg({'tot_exp':'sum'}).reset_index()

In [9]:
enterprise_df_project_id.shape

(106, 3)

In [10]:
enterprise_df_project_id.head(2)

Unnamed: 0,project_id,cycle,tot_exp
0,16000007,One,6315208.89
1,16000008,One,4000000.0


## Merge in with allocation sheet of Excel workbook 
* There are 171 unique project IDS in our allocation sheet compared to 106 project IDS in Data Link....
* Lots of missing Project IDS: 107 out of 307 rows of data.

In [11]:
#read in allocation df because that's the sheet with project ids. 
allocation = TIRCP_functions.allocation()

In [12]:
allocation.shape

(215, 50)

In [13]:
allocation.isna().sum()

Award_Year                                  2
Project_#                                   2
Grant_Recipient                             2
Implementing_Agency                         2
Project_ID                                  7
EA                                         10
Components                                  2
Phase                                       2
Allocation_Amt_Allocation_Sheet             0
Expended_Amt_Allocation_Sheet               0
SB1_Funding                                 1
SB1_Budget_Year                            60
GGRF_Funding                                0
GGRF_Budget_Year                           43
CTC_Financial_Resolution                    9
CTC_Allocation_Amendment                  186
CTC_Waiver                                208
CalSTA_Waiver                             197
PSA_#                                       8
CT_Document_#                              33
Date_Branch_Chief_Receives_PSA            209
Date_Regional_Coordinator_Receives

In [14]:
#keep only columns that are relevant.
allocation_df = allocation[['Award_Year','Project_ID','PPNO']]

In [15]:
f'There are {allocation_df.Project_ID.nunique()} unique project ids in allocation'

'There are 191 unique project ids in allocation'

In [16]:
#only keep rows that have Project IDS filled in
allocation_df = allocation_df.loc[allocation_df['Project_ID'].notnull()]

In [17]:
allocation_df.shape

(208, 3)

In [18]:
joined_alloc = pd.merge(allocation_df, enterprise_df_project_id, 
                        left_on='Project_ID', right_on='project_id', 
                        how = 'outer', indicator = True)

In [19]:
joined_alloc._merge.value_counts()

both          119
left_only      89
right_only      1
Name: _merge, dtype: int64

In [74]:
joined_alloc.sample(10)

Unnamed: 0,Award_Year,Project_ID,PPNO,project_id,cycle,tot_exp,_merge
44,2016.0,18000237,CP017,18000237.0,Two,12705054.75,both
90,2018.0,19000071,CP042,,,,left_only
58,2018.0,20000185,CP028,,,,left_only
25,2016.0,21000266,CP076,,,,left_only
164,2018.0,\n0019000457,CP033,,,,left_only
177,2018.0,21000200,CP033,,,,left_only
19,2016.0,17000181,CP023,,,,left_only
59,2018.0,19000064,CP028,19000064.0,Three,71539.79,both
165,2018.0,20000207,CP033,,,,left_only
97,2018.0,19000294,CP042,,,,left_only


In [21]:
#only keep "both" results...
joined_alloc2 = joined_alloc.loc[joined_alloc['_merge'] == 'both'].drop(columns =['_merge'])

In [22]:
#group by so that only one row for each PPNO number & Year
joined_alloc3 = (joined_alloc2
                  .groupby(['PPNO', 'Award_Year'])
                  .agg({'tot_exp':'sum'})
                  .reset_index()
                 )

In [23]:
joined_alloc3.shape

(51, 3)

In [24]:
#rename cols before joining with "joined" df 
joined_alloc3 = joined_alloc3.add_prefix('from_joined_alloc_df_')

In [25]:
joined_alloc3

Unnamed: 0,from_joined_alloc_df_PPNO,from_joined_alloc_df_Award_Year,from_joined_alloc_df_tot_exp
0,1155,2018.0,486509.7
1,1230,2016.0,9204000.0
2,2320B,2018.0,500000.0
3,CP001,2015.0,6315208.89
4,CP002,2015.0,38023039.68
5,CP003,2015.0,4000000.0
6,CP004,2015.0,2320000.0
7,CP005,2015.0,22284205.53
8,CP006,2015.0,41181000.0
9,CP006,2016.0,45092000.0


In [26]:
joined_alloc3.astype({'from_joined_alloc_df_Award_Year': 'int64'}).dtypes

from_joined_alloc_df_PPNO           object
from_joined_alloc_df_Award_Year      int64
from_joined_alloc_df_tot_exp       float64
dtype: object

## Joining original sheets on Award Year & PPNO

In [75]:
project = TIRCP_functions.project()

In [78]:
project.isna().sum()

Award_Year                                         0
Project_#                                          0
Grant_Recipient                                    0
Project_Title                                      0
District                                           3
County                                             1
Project_Description                                0
Master_Agreement_Number                            2
Master_Agreement_Expiration_Date                   2
Project_Manager                                    0
Regional_Coordinator                               0
Technical_Assistance-CALITP_(Y/N)                 10
Technical_Assistance-Fleet_(Y/N)                  12
Technical_Assistance-Network_Integration_(Y/N)    11
Technical_Assistance-Priority_Population_(Y/N)    11
Total_Project_Cost                                 0
TIRCP_project_sheet                                0
Allocated_Amount                                   0
Unallocated_amt_project_sheet                 

In [31]:
final_join = pd.merge(project, joined_alloc3, 
                      left_on = ['PPNO', 'Award_Year'], 
                      right_on = ['from_joined_alloc_df_PPNO', 'from_joined_alloc_df_Award_Year'],
                      how = 'left', indicator = True)

In [32]:
final_join._merge.value_counts()

both          50
left_only     25
right_only     0
Name: _merge, dtype: int64

#### Check to see if PPNO matches  cross original project sheet & joined 

In [33]:
final_join.shape

(75, 36)

In [34]:
final_join.PPNO.nunique()

69

In [35]:
project.PPNO.nunique()

69

In [36]:
PPNO_project = set(project.PPNO.unique().tolist())
PPNO_join = set(final_join.PPNO.unique().tolist())
PPNO_join - PPNO_project 

set()

### Checking out our 2 columns of expended amounts to make sure Data Link information makes sense.
* Expended_Amt_Project_sheet is from Highlands sheet.
* from_joined_alloc_df_tot_exp from Data Link

In [37]:
pd.options.display.float_format = '{:,}'.format
pd.options.display.float_format = "{:.2f}".format

In [38]:
final_join[['Award_Year','PPNO','Project_Title', 'TIRCP_project_sheet', 'Expended_Amt_project_sheet', 'from_joined_alloc_df_tot_exp', '_merge']].sort_values('PPNO')

Unnamed: 0,Award_Year,PPNO,Project_Title,TIRCP_project_sheet,Expended_Amt_project_sheet,from_joined_alloc_df_tot_exp,_merge
57,2018,1155,Extend rail service to Monterey County,10148000,0.0,486509.7,both
24,2016,1230,Redlands Passenger Rail Project,9204000,9204000.0,9204000.0,both
29,2018,2320B,Purchase Zero Emission High Capacity Buses to Support Transbay Tomorrow and Clean Corridors Plan,14000000,500000.0,500000.0,both
6,2015,CP001,Sacramento Regional Transit's Refurbishment of 7 Light Rail Vehicles,6427000,0.0,6315208.89,both
12,2015,CP002,Purchase of Nine Fuel-Efficient Tier IV EMD F-125 Locomotives,41181000,37583067.0,38023039.68,both
7,2015,CP003,South Bay Bus Rapid Transit,4000000,4000000.0,4000000.0,both
5,2015,CP004,Bravo! Route 560 Rapid Buses,2320000,0.0,2320000.0,both
0,2015,CP005,Regional Transit Interconnectivity & Environmental Sustability,24403000,21714177.53,22284205.53,both
25,2016,CP006,Light Rail Modernization and Expansion Program,45092000,0.0,45092000.0,both
47,2018,CP006,Transit Capacity Expansion Program,26867000,0.0,26867000.0,both


In [39]:
#fill in with 0
final_join['from_joined_alloc_df_tot_exp'] = final_join['from_joined_alloc_df_tot_exp'].fillna(0)

In [102]:
def tableau2(df):
    #Keeping only the columns we want
    df = (df[['PPNO','Award_Year', 'Project_#', 'Grant_Recipient', 
       'Project_Title', 'District', 'County', 'Project_Description',
       'Master_Agreement_Number', 'Master_Agreement_Expiration_Date',
       'Project_Manager', 'Regional_Coordinator', 'Technical_Assistance-Fleet_(Y/N)',
       'Technical_Assistance-Network_Integration_(Y/N)',
       'Technical_Assistance-Priority_Population_(Y/N)', 'Total_Project_Cost',
       'TIRCP_project_sheet', 'Allocated_Amount',
       'Percentage_Allocated',
       'from_joined_alloc_df_tot_exp', 'Other_Funds_Involved']]
                 )
    #Getting percentages & filling in with 0
    df['Expended_Percent'] = df['from_joined_alloc_df_tot_exp']/df['Allocated_Amount']
    df['Allocated_Percent'] = df['Allocated_Amount']/df['TIRCP_project_sheet']
    
    # Subtract TIRCP with Allocated Amount with Unallocated
    df['Unallocated_Amount'] = df["TIRCP_project_sheet"] - df["Allocated_Amount"]
    #filling in for 0's 
    df[['Expended_Percent','Allocated_Percent']] = df[['Expended_Percent','Allocated_Percent']].fillna(value=0)
   
    df[['Expended_Percent','Allocated_Percent']]  =  df[['Expended_Percent','Allocated_Percent']].replace(np.inf, 0)
    #Categorizing expended percentage into bins
    def expended_percent(row):
           
            if ((row.Expended_Percent > 0) and (row.Expended_Percent < .26)):
                return "1-25"
            elif ((row.Expended_Percent > .25) and (row.Expended_Percent < .51)):
                return "26-50"
            elif ((row.Expended_Percent > .50) and (row.Expended_Percent < .76)):
                return "51-75"
            elif ((row.Expended_Percent > .75) and (row.Expended_Percent < 1.0)):
                return "76-99"
            elif row.Expended_Percent == 0.0:
                return "0"
            else:
                return "100"
    df["Expended_Percent_Group"] = df.apply(lambda x: expended_percent(x), axis=1)
    
    # Categorize years and expended_percent_group into bins
    def progress(df):   
        ### 2015 ### 
        if (df['Award_Year'] == 2015) and (df['Expended_Percent_Group'] == "1-25") | (df['Expended_Percent_Group'] == "26-50"):
            return 'Behind'
        elif (df['Award_Year'] == 2015) and (df['Expended_Percent_Group'] == "76-99") | (df['Expended_Percent_Group'] == "51-75"):
            return 'On Track'
        
        ### 2016 ###
        elif (df['Award_Year'] == 2016) and (df['Expended_Percent_Group'] == "1-25") | (df['Expended_Percent_Group'] == "26-50"):
            return 'Behind'
        elif (df['Award_Year'] == 2016) and (df['Expended_Percent_Group'] == "51-75") | (df['Expended_Percent_Group'] == "76-99"):
            return 'On Track'
        
        ### 2018 ###
        elif (df['Award_Year'] == 2018) and (df['Expended_Percent_Group'] == "1-25"):
            return 'Behind'
        elif (df['Award_Year'] == 2018) and (df['Expended_Percent_Group'] == "26-50") | (df['Expended_Percent_Group'] == "51-75"):
            return 'On Track'
        elif (df['Award_Year'] == 2018) and (df['Expended_Percent_Group'] == "76-99"):
            return 'Ahead'
        
        ### 2020 ### 
        elif (df['Award_Year'] == 2020) and (df['Expended_Percent_Group'] == "1-25"):
            return 'Behind'
        elif (df['Award_Year'] == 2020) and (df['Expended_Percent_Group'] == "26-50"):
            return 'On Track'
        elif (df['Award_Year'] == 2020) and (df['Expended_Percent_Group'] == "51-75") | (df['Expended_Percent_Group'] == "76-99"):
            return 'Ahead'
        
        ### 0 Expenditures ### 
        elif df ['Expended_Percent_Group'] == "0":
            return "No expenditures recorded"
        
        ### Else ### 
        else: 
            return "100% of allocated funds spent"

    df['Progress'] = df.apply(progress, axis = 1)
    
    #Which projects are large,small, medium
    p75 = df.TIRCP_project_sheet.quantile(0.75).astype(float)
    p25 = df.TIRCP_project_sheet.quantile(0.25).astype(float)
    p50 = df.TIRCP_project_sheet.quantile(0.50).astype(float)
    
    def project_size (row):
        if ((row.TIRCP_project_sheet > 0) and (row.TIRCP_project_sheet < p25)):
            return "Small"
        elif ((row.TIRCP_project_sheet > p25) and (row.TIRCP_project_sheet < p75)):
            return "Medium"
        elif ((row.TIRCP_project_sheet > p50) and (row.TIRCP_project_sheet > p75 )):
            return "Large"
        else:
            return "$0 recorded for TIRCP"
        
    df["Project_Category"] = df.apply(lambda x: project_size(x), axis=1)
     #Rename cols to the right names
    df = (df.rename(columns = {'from_joined_alloc_df_tot_exp':'Expended_Amount', 
                                                'TIRCP_project_sheet': "TIRCP_Amount"}
                  ))
    ### GCS ###
   # df = df.to_excel(f'{GCS_FILE_PATH}INTERIM_EXPENDITURE_Tableau_Parquet.xlsx')
  
    return df 

In [98]:
final_df = tableau2(final_join)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

In [116]:
final_df[['Award_Year','PPNO','District','Project_Title', 'TIRCP_Amount','Allocated_Amount','Expended_Amount','Expended_Percent','Progress']].sort_values('Expended_Percent')

Unnamed: 0,Award_Year,PPNO,District,Project_Title,TIRCP_Amount,Allocated_Amount,Expended_Amount,Expended_Percent,Progress
37,2018,CP054,7,Electric Blue: Electrification of City of Santa Monica's Big Blue Bus,3050000,3050000,0.0,0.0,No expenditures recorded
28,2016,CP026,10,SB 132 ACE Extension Lathrop to Ceres/Merced,400000000,0,0.0,0.0,No expenditures recorded
36,2018,CP029,7,Los Angeles City: Leading the Transformation to Zero-Emission Electric Bus Transit Service,36104000,36104000,0.0,0.0,No expenditures recorded
73,2020,CP074,10,"Improving Air Quality & Economic Growth with Electric Buses in Merced County, the Gateway to Yosemite",3112000,3112000,0.0,0.0,No expenditures recorded
39,2018,CP030,7,Los Angeles Region Transit System Integration and Modernization Program of Projects,1088499000,261200000,0.0,0.0,No expenditures recorded
45,2018,CP077,11,Ride Between the Line: Enhancing Access to Transit in San Diego,5763000,0,0.0,0.0,No expenditures recorded
49,2018,CP078,4,SamTrans Express Bus Pilot,15000000,0,0.0,0.0,No expenditures recorded
52,2018,CP057,,"VTA’s BART Silicon Valley Extension, Phase II",730000000,0,0.0,0.0,No expenditures recorded
53,2018,CP045,2,North State Intercity Bus System,8641000,8641000,0.0,0.0,No expenditures recorded
58,2020,CP059,7,"Reaching the Most Transit-Vulnerable: AVTA's Zero Emission ""Microtransit"" & Bus Expansion Proposal",6503000,6503000,0.0,0.0,No expenditures recorded


In [105]:
final_df.Progress.value_counts()

No expenditures recorded         25
On Track                         19
Behind                           14
100% of allocated funds spent    12
Ahead                             5
Name: Progress, dtype: int64

In [115]:
final_df.isna().sum()

PPNO                                               0
Award_Year                                         0
Project_#                                          0
Grant_Recipient                                    0
Project_Title                                      0
District                                           3
County                                             1
Project_Description                                0
Master_Agreement_Number                            2
Master_Agreement_Expiration_Date                   2
Project_Manager                                    0
Regional_Coordinator                               0
Technical_Assistance-Fleet_(Y/N)                  12
Technical_Assistance-Network_Integration_(Y/N)    11
Technical_Assistance-Priority_Population_(Y/N)    11
Total_Project_Cost                                 0
TIRCP_Amount                                       0
Allocated_Amount                                   0
Percentage_Allocated                          

In [106]:
TIRCP = final_df['TIRCP_Amount'].sum()
f'The total amount of TIRCP is {TIRCP}'

'The total amount of TIRCP is 5837722000'

In [107]:
Alloc = final_df['Allocated_Amount'].sum()
f'The total amount of $ allocated is {Alloc}'

'The total amount of $ allocated is 2283509000'

In [108]:
Exp = final_df['Expended_Amount'].sum()
f'Total amount of expenditures is {Exp}'

'Total amount of expenditures is 749827692.08'

In [109]:
f'% of allocated money spent {Alloc/TIRCP}'

'% of allocated money spent 0.3911643959750053'

In [110]:
f'% of expended  by allocation {Exp/Alloc}'

'% of expended  by allocation 0.3283664273186574'

In [111]:
f'% of expended  by TIRCP {Exp/TIRCP}'

'% of expended  by TIRCP 0.1284452552005731'

In [113]:
#final_df.to_excel(f'{GCS_FILE_PATH}tableau_with_temporary_expenditure_sol.xlsx', index = False)