# Expenses in Highlands Sheet are incorrect
* Using expenses from Data link to fill in expenditures for March 25 presentation


In [1]:
import numpy as np
import pandas as pd
import TIRCP_functions
from calitp_data_analysis.sql to_snakecase

GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/tircp/"

pd.options.display.max_columns = 50
pd.options.display.max_rows = 120
pd.set_option('display.max_colwidth', None)

E0316 20:20:50.324598556    1219 fork_posix.cc:70]           Fork support is only compatible with the epoll1 and poll polling strategies
E0316 20:20:50.840796541    1219 fork_posix.cc:70]           Fork support is only compatible with the epoll1 and poll polling strategies


## Expenditure information from the Enterprise Data Link data Linda pulled.

In [2]:
enterprise = "Enterprise.xlsx"
enterprise_df = pd.read_excel(f"{GCS_FILE_PATH}{enterprise}")
enterprise_df = to_snakecase(enterprise_df)

In [3]:
enterprise_df.head(2)

Unnamed: 0,fy,cycle,fund,appr_catg,appr_unit,pec,project,project_name,project_short_name,phase,old_ea,capital_exp,local_exp,support_exp,other_exp,tot_exp
0,2016,One,3228,1415,15101,3010070,16000007,Refurbishment of Seven Light Rail Vehicles,64-R340GA,S,R340GA,0,2182302.44,0,0,2182302.44
1,2016,One,3228,1415,15101,3010070,16000008,South Bay Bus Rapid Transit (BRT) Project,64-T339GA,S,T339GA,0,30770.46,0,0,30770.46


In [4]:
enterprise_df = enterprise_df.rename(columns = {'project':'project_id'})

In [5]:
#keep only columns necessary.
enterprise_df = enterprise_df[['fy','cycle','project_id','project_name','tot_exp']]

In [6]:
#Looking at IDS by cycles.
enterprise_df.cycle.value_counts()

Three    152
One       47
Two       47
Four       8
Name: cycle, dtype: int64

In [7]:
enterprise_df.shape

(254, 5)

#### Aggregate by project ID and each cycle, so each project has only one record for total expenses

In [8]:
enterprise_df_project_id = enterprise_df.groupby(['project_id','cycle']).agg({'tot_exp':'sum'}).reset_index()

In [9]:
enterprise_df_project_id.shape

(106, 3)

In [10]:
enterprise_df_project_id.head(2)

Unnamed: 0,project_id,cycle,tot_exp
0,16000007,One,6315208.89
1,16000008,One,4000000.0


## Merge in with allocation sheet of Excel workbook 
* There are 191 unique project IDS in our allocation sheet compared to 106 project IDS in Data Link.

In [11]:
#read in allocation df because that's the sheet with project ids. 
allocation = TIRCP_functions.allocation()

In [12]:
#keep only columns that are relevant.
allocation_df = allocation[['Award_Year','Project_ID','PPNO']]

In [13]:
#checking NA values
allocation_df.isna().sum()

Award_Year    2
Project_ID    7
PPNO          2
dtype: int64

In [14]:
f'There are {allocation_df.Project_ID.nunique()} unique project ids in allocation'

'There are 191 unique project ids in allocation'

In [15]:
#only keep rows that have Project IDS filled in
allocation_df = allocation_df.loc[allocation_df['Project_ID'].notnull()]

In [16]:
allocation_df.shape

(208, 3)

In [17]:
joined_alloc = pd.merge(allocation_df, enterprise_df_project_id, 
                        left_on='Project_ID', right_on='project_id', 
                        how = 'outer', indicator = True)

In [18]:
joined_alloc._merge.value_counts()

both          119
left_only      89
right_only      1
Name: _merge, dtype: int64

In [19]:
#only keep "both" results...
joined_alloc2 = joined_alloc.loc[joined_alloc['_merge'] == 'both'].drop(columns =['_merge'])

In [20]:
#group by so that only one row for each PPNO number & Year
joined_alloc3 = (joined_alloc2
                  .groupby(['PPNO', 'Award_Year'])
                  .agg({'tot_exp':'sum'})
                  .reset_index()
                 )

In [21]:
joined_alloc3.shape

(51, 3)

In [22]:
#rename cols before joining with "joined" df 
joined_alloc3 = joined_alloc3.add_prefix('from_joined_alloc_df_')

In [23]:
joined_alloc3

Unnamed: 0,from_joined_alloc_df_PPNO,from_joined_alloc_df_Award_Year,from_joined_alloc_df_tot_exp
0,1155,2018.0,486509.7
1,1230,2016.0,9204000.0
2,2320B,2018.0,500000.0
3,CP001,2015.0,6315208.89
4,CP002,2015.0,38023039.68
5,CP003,2015.0,4000000.0
6,CP004,2015.0,2320000.0
7,CP005,2015.0,22284205.53
8,CP006,2015.0,41181000.0
9,CP006,2016.0,45092000.0


In [24]:
joined_alloc3.astype({'from_joined_alloc_df_Award_Year': 'int64'}).dtypes

from_joined_alloc_df_PPNO           object
from_joined_alloc_df_Award_Year      int64
from_joined_alloc_df_tot_exp       float64
dtype: object

## Joining project sheet on Award Year & PPNO

In [25]:
project = TIRCP_functions.project()

In [26]:
final_join = pd.merge(project, joined_alloc3, 
                      left_on = ['PPNO', 'Award_Year'], 
                      right_on = ['from_joined_alloc_df_PPNO', 'from_joined_alloc_df_Award_Year'],
                      how = 'left', indicator = True)

In [27]:
final_join._merge.value_counts()

both          50
left_only     25
right_only     0
Name: _merge, dtype: int64

#### Check to see if PPNO matches  cross original project sheet & joined 

In [28]:
final_join.shape

(75, 36)

In [29]:
final_join.PPNO.nunique()

69

In [30]:
project.PPNO.nunique()

69

In [31]:
PPNO_project = set(project.PPNO.unique().tolist())
PPNO_join = set(final_join.PPNO.unique().tolist())
PPNO_join - PPNO_project 

set()

In [32]:
pd.options.display.float_format = '{:,}'.format
pd.options.display.float_format = "{:.2f}".format

### Final Clean Up - per Linda, TIRCP isn't correct

In [33]:
#fill in with 0
final_join['from_joined_alloc_df_tot_exp'] = final_join['from_joined_alloc_df_tot_exp'].fillna(0)

In [34]:
final_join = final_join.drop_duplicates(subset=['PPNO','Award_Year', 'Project_Title'])

In [35]:
final_join.loc[(final_join['PPNO'] == 'CP028'), "TIRCP_project_sheet"] =  13156000 
final_join.loc[(final_join['PPNO'] == 'CP019'), "TIRCP_project_sheet"] =  8930000

#### Dropping a project that may or may not be considered TIRCP.

In [36]:
index = final_join[(final_join.PPNO == 'CP026') &( final_join.Award_Year == 2016)].index

In [37]:
final_join = final_join.drop(index)

In [38]:
final_join = final_join.reset_index()

### Checking out our 2 columns of expended amounts to make sure Data Link information makes sense.
* Expended_Amt_Project_sheet is from Highlands sheet.
* from_joined_alloc_df_tot_exp from Data Link

In [39]:
final_join[['Award_Year','PPNO','Project_Title', 'TIRCP_project_sheet', 
            'Expended_Amt_project_sheet', 'from_joined_alloc_df_tot_exp', '_merge']]

Unnamed: 0,Award_Year,PPNO,Project_Title,TIRCP_project_sheet,Expended_Amt_project_sheet,from_joined_alloc_df_tot_exp,_merge
0,2015,CP005,Regional Transit Interconnectivity & Environmental Sustability,24403000,21714177.53,22284205.53,both
1,2015,CP012,Travel Time Reduction Project,4620000,4619999.9,4619999.9,both
2,2015,CP015,Willowbrook/Rosa Parks Station & Blue Line Light Rail Operational Improvements Project,38494000,38494000.0,38494000.0,both
3,2015,CP007,Pacific Surfliner Transit Transfer Program,1675000,38494000.0,277840.47,both
4,2015,CP013,Monterey Bay Operations and Maintenance Facility/Salinas Transit Service Project,10000000,0.0,10000000.0,both
5,2015,CP004,Bravo! Route 560 Rapid Buses,2320000,0.0,2320000.0,both
6,2015,CP001,Sacramento Regional Transit's Refurbishment of 7 Light Rail Vehicles,6427000,0.0,6315208.89,both
7,2015,CP003,South Bay Bus Rapid Transit,4000000,4000000.0,4000000.0,both
8,2015,CP008,San Diego Metropolitan Transit System Trolley Capacity Improvements,31936000,31936000.0,31936000.0,both
9,2015,CP006,SFMTA Light Rail Vehicle Fleet Expansion,41181000,3760000.0,41181000.0,both


## Tableau

In [40]:
def tableau2(df):
    #Keeping only the columns we want
    df = (df[['PPNO','Award_Year', 'Project_#', 'Grant_Recipient', 
       'Project_Title', 'District', 'County', 'Project_Description',
       'Master_Agreement_Number', 'Master_Agreement_Expiration_Date',
       'Project_Manager', 'Regional_Coordinator', 'Technical_Assistance-Fleet_(Y/N)',
       'Technical_Assistance-Network_Integration_(Y/N)',
       'Technical_Assistance-Priority_Population_(Y/N)', 'Total_Project_Cost',
       'TIRCP_project_sheet', 'Allocated_Amount',
       'Percentage_Allocated',
       'from_joined_alloc_df_tot_exp', 'Other_Funds_Involved']]
                 )
    #Getting percentages & filling in with 0
    df['Expended_Percent'] = df['from_joined_alloc_df_tot_exp']/df['Allocated_Amount']
    df['Allocated_Percent'] = df['Allocated_Amount']/df['TIRCP_project_sheet']
    
    # Subtract TIRCP with Allocated Amount with Unallocated
    df['Unallocated_Amount'] = df["TIRCP_project_sheet"] - df["Allocated_Amount"]
    #filling in for 0's 
    df[['Expended_Percent','Allocated_Percent']] = df[['Expended_Percent','Allocated_Percent']].fillna(value=0)
   
    df[['Expended_Percent','Allocated_Percent']]  =  df[['Expended_Percent','Allocated_Percent']].replace(np.inf, 0)
    #Categorizing expended percentage into bins
    def expended_percent(row):
           
            if ((row.Expended_Percent > 0) and (row.Expended_Percent < .26)):
                return "1-25"
            elif ((row.Expended_Percent > .25) and (row.Expended_Percent < .51)):
                return "26-50"
            elif ((row.Expended_Percent > .50) and (row.Expended_Percent < .76)):
                return "51-75"
            elif ((row.Expended_Percent > .75) and (row.Expended_Percent < 1.0)):
                return "76-99"
            elif row.Expended_Percent == 0.0:
                return "0"
            else:
                return "100"
    df["Expended_Percent_Group"] = df.apply(lambda x: expended_percent(x), axis=1)
    
    # Categorize years and expended_percent_group into bins
    def progress(df):   
        ### 2015 ### 
        if (df['Award_Year'] == 2015) and (df['Expended_Percent_Group'] == "1-25") | (df['Expended_Percent_Group'] == "26-50"):
            return 'Behind'
        elif (df['Award_Year'] == 2015) and (df['Expended_Percent_Group'] == "76-99") | (df['Expended_Percent_Group'] == "51-75"):
            return 'On Track'
        
        ### 2016 ###
        elif (df['Award_Year'] == 2016) and (df['Expended_Percent_Group'] == "1-25") | (df['Expended_Percent_Group'] == "26-50"):
            return 'Behind'
        elif (df['Award_Year'] == 2016) and (df['Expended_Percent_Group'] == "51-75") | (df['Expended_Percent_Group'] == "76-99"):
            return 'On Track'
        
        ### 2018 ###
        elif (df['Award_Year'] == 2018) and (df['Expended_Percent_Group'] == "1-25"):
            return 'Behind'
        elif (df['Award_Year'] == 2018) and (df['Expended_Percent_Group'] == "26-50") | (df['Expended_Percent_Group'] == "51-75"):
            return 'On Track'
        elif (df['Award_Year'] == 2018) and (df['Expended_Percent_Group'] == "76-99"):
            return 'Ahead'
        
        ### 2020 ### 
        elif (df['Award_Year'] == 2020) and (df['Expended_Percent_Group'] == "1-25"):
            return 'Behind'
        elif (df['Award_Year'] == 2020) and (df['Expended_Percent_Group'] == "26-50"):
            return 'On Track'
        elif (df['Award_Year'] == 2020) and (df['Expended_Percent_Group'] == "51-75") | (df['Expended_Percent_Group'] == "76-99"):
            return 'Ahead'
        
        ### 0 Expenditures ### 
        elif df ['Expended_Percent_Group'] == "0":
            return "No expenditures recorded"
        
        ### Else ### 
        else: 
            return "100% of allocated funds spent"

    df['Progress'] = df.apply(progress, axis = 1)
    
    #Which projects are large,small, medium
    p75 = df.TIRCP_project_sheet.quantile(0.75).astype(float)
    p25 = df.TIRCP_project_sheet.quantile(0.25).astype(float)
    p50 = df.TIRCP_project_sheet.quantile(0.50).astype(float)
    
    def project_size (row):
        if ((row.TIRCP_project_sheet > 0) and (row.TIRCP_project_sheet < p25)):
            return "Small"
        elif ((row.TIRCP_project_sheet > p25) and (row.TIRCP_project_sheet < p75)):
            return "Medium"
        elif ((row.TIRCP_project_sheet > p50) and (row.TIRCP_project_sheet > p75 )):
            return "Large"
        else:
            return "$0 recorded for TIRCP"
        
    df["Project_Category"] = df.apply(lambda x: project_size(x), axis=1)
     #Rename cols to the right names
    df = (df.rename(columns = {'from_joined_alloc_df_tot_exp':'Expended_Amount', 
                                                'TIRCP_project_sheet': "TIRCP_Amount"}
                  ))
    ### GCS ###
    # df = df.to_excel(f'{GCS_FILE_PATH}INTERIM_EXPENDITURE_Tableau_Parquet.xlsx')
  
    return df 

In [41]:
final_df = tableau2(final_join)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

In [42]:
final_df[['Award_Year','Project_#','PPNO','Project_Title', 'Allocated_Amount','TIRCP_Amount',
          'Expended_Amount','Expended_Percent','Progress']].sort_values( by = ['Award_Year', 'PPNO'])

Unnamed: 0,Award_Year,Project_#,PPNO,Project_Title,Allocated_Amount,TIRCP_Amount,Expended_Amount,Expended_Percent,Progress
6,2015,7,CP001,Sacramento Regional Transit's Refurbishment of 7 Light Rail Vehicles,6427000,6427000,6315208.89,0.98,On Track
12,2015,13,CP002,Purchase of Nine Fuel-Efficient Tier IV EMD F-125 Locomotives,41181000,41181000,38023039.68,0.92,On Track
7,2015,8,CP003,South Bay Bus Rapid Transit,4000000,4000000,4000000.0,1.0,100% of allocated funds spent
5,2015,6,CP004,Bravo! Route 560 Rapid Buses,2320000,2320000,2320000.0,1.0,100% of allocated funds spent
0,2015,1,CP005,Regional Transit Interconnectivity & Environmental Sustability,24403000,24403000,22284205.53,0.91,On Track
9,2015,10,CP006,SFMTA Light Rail Vehicle Fleet Expansion,41181000,41181000,41181000.0,1.0,100% of allocated funds spent
3,2015,4,CP007,Pacific Surfliner Transit Transfer Program,1675000,1675000,277840.47,0.17,Behind
8,2015,9,CP008,San Diego Metropolitan Transit System Trolley Capacity Improvements,31936000,31936000,31936000.0,1.0,100% of allocated funds spent
13,2015,14,CP010,SMART Rail Car Capacity Project,11000000,11000000,11000000.0,1.0,100% of allocated funds spent
11,2015,12,CP011,Bus Rapid Transit – Martin Luther King Corridor and Crosstown Miner Corridor,6841000,6841000,6841000.0,1.0,100% of allocated funds spent


In [43]:
final_df.Progress.value_counts()

No expenditures recorded         24
On Track                         18
Behind                           14
100% of allocated funds spent    12
Ahead                             5
Name: Progress, dtype: int64

In [44]:
final_df.Progress.value_counts().sum()

73

In [45]:
TIRCP = final_df['TIRCP_Amount'].sum()
f'The total amount of TIRCP is {TIRCP}'

'The total amount of TIRCP is 5440171000'

In [46]:
Alloc = final_df['Allocated_Amount'].sum()
f'The total amount of $ allocated is {Alloc}'

'The total amount of $ allocated is 2266402000'

In [47]:
Exp = final_df['Expended_Amount'].sum()
f'Total amount of expenditures is {Exp}'

'Total amount of expenditures is 743099838.47'

In [48]:
f'% of allocated money spent {(Alloc/TIRCP).round(2)}'

'% of allocated money spent 0.42'

In [49]:
f'% of expended  by allocation {(Exp/Alloc).round(2)}'

'% of expended  by allocation 0.33'

In [50]:
f'% of expended  by TIRCP {(Exp/TIRCP).round(2)}'

'% of expended  by TIRCP 0.14'

In [51]:
final_df.to_excel(f'{GCS_FILE_PATH}tableau_with_temporary_expenditure_sol.xlsx', index = False)