# TIRCP SAR Report
----


In [None]:
# Data libraries and stdlib helpers used by this notebook.
import pandas as pd
import math
from siuba import * 
import numpy as np
# Show up to 50 columns when a DataFrame is displayed.
pd.options.display.max_columns = 50
# Render floats with two decimal places in displayed output.
pd.options.display.float_format = "{:.2f}".format
import datetime

In [None]:
# Bucket folder holding the TIRCP workbooks, and the two source workbooks.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/tircp/"
FILE_NAME1 = "Raw_Project_Tracking_Sheet.xlsx"
FILE_NAME2 = "Allocation_Agreement.xlsx"

# Read the project tracking sheet and the allocation agreement sheet.
project, allocation = (
    pd.read_excel(f"{GCS_FILE_PATH}{workbook}")
    for workbook in (FILE_NAME1, FILE_NAME2)
)

In [None]:
# Normalise the headers of both sheets: trim surrounding whitespace, then
# replace every remaining space with an underscore.
for sheet in (project, allocation):
    trimmed_headers = sheet.columns.str.strip()
    sheet.columns = trimmed_headers.str.replace(' ', '_')

In [None]:
# Trim leading/trailing whitespace from every column name on both sheets.
project.columns = project.columns.map(str.strip)
allocation.columns = allocation.columns.map(str.strip)

## Keeping only relevant columns.

In [None]:
# Display the cleaned project-sheet column names.
project.columns

In [None]:
# Keep only the project-sheet columns used by the report.
# .copy() makes df_project an independent frame, so the column assignments
# performed on it later in the notebook do not raise SettingWithCopyWarning
# or write into a temporary slice of `project`.
df_project = project[['Award_Year', 'Project_#','Local_Agency','Project_Title','PPNO',
'Key_Project_Elements','TIRCP_Award_Amount_($)','Expended_Amount','Allocated_Amount']].copy()

In [None]:
# Display the cleaned allocation-sheet column names.
allocation.columns

In [None]:
# Keep only the allocation-sheet columns used by the report.
df_allocation = allocation[
    [
        'Expended_Amount',
        'Award_Year',
        'Award_Recipient',
        'Implementing_Agency',
        'Allocation_Amount',
        'GGRF_Funding',
        'PPNO',
        'Phase',
        'LED',
        'Allocation_Date',
        'Completion_Date',
        '3rd_Party_Award_Date',
        'Components',
        'SB1_Funding',
    ]
]

# Make Summary Page
### Table 2: Summary of Awards (Cumulative)
* Linda stated she only needs table 2.
* Linda says she is unsure where to get the completed project row from Highlands tracking sheet. Needs to be discussed among TIRCP team to decide if they will include.
* The data from the Highlands project tracking sheet does not match the TIRCP SAR report Linda gave me.


In [None]:
# Table 2 base: after removing exact duplicate rows, one row per award year
# with the project count and the summed award, allocated and expended amounts.
summary = (
    project.drop_duplicates()
    .groupby(['Award_Year'])
    .agg(
        {
            'Project_#': 'count',
            'TIRCP_Award_Amount_($)': 'sum',
            'Allocated_Amount': 'sum',
            'Expended_Amount': 'sum',
        }
    )
    .reset_index()
)

In [None]:
# Use the column headings that appear in the report.
summary = summary.rename(
    columns={
        'Project_#': 'Number_of_Awarded_Projects',
        'TIRCP_Award_Amount_($)': 'Award_Amount',
        'Allocated_Amount': 'Amount_Allocated',
    }
)

In [None]:
# Display the per-year summary table.
summary

* Can't do this more neatly with "assign"

In [None]:
# Percentage columns, each defined as (new column, numerator, denominator).
for pct_col, numerator, denominator in [
    ('Expended_Percent_of_Awarded', 'Expended_Amount', 'Award_Amount'),
    ('Expended_Percent_of_Allocated', 'Expended_Amount', 'Amount_Allocated'),
    ('Percent_Allocated', 'Amount_Allocated', 'Award_Amount'),
]:
    summary[pct_col] = (summary[numerator] / summary[denominator]) * 100

In [None]:
# Transpose so each award year becomes a column and each metric becomes a row.
summary_transposed = summary.set_index('Award_Year').T

In [None]:
# Rows of the transposed summary that receive a summed grand total:
# the three dollar amounts plus the awarded-project count.
list_to_add = ['Award_Amount','Amount_Allocated','Expended_Amount', 'Number_of_Awarded_Projects']

In [None]:
# Sum the selected rows across all award-year columns. Rows that are not in
# list_to_add (the percentage rows) have no value in the summed result, so
# they are NaN in Grand_Total at this point.
summary_transposed['Grand_Total']=summary_transposed.loc[list_to_add, :].sum(axis=1)

In [None]:
# Pull the three grand-total dollar figures out as scalars; they are the
# inputs for the grand-total percentage rows.
Exp, Alloc, TIRCP = (
    summary_transposed.at[metric_row, 'Grand_Total']
    for metric_row in ('Expended_Amount', 'Amount_Allocated', 'Award_Amount')
)

In [None]:
# Grand-total percentages are recomputed from the grand-total dollar figures
# rather than summed across years.
grand_total_percentages = {
    'Expended_Percent_of_Awarded': (Exp / TIRCP) * 100,
    'Expended_Percent_of_Allocated': (Exp / Alloc) * 100,
    'Percent_Allocated': (Alloc / TIRCP) * 100,
}
for pct_row, pct_value in grand_total_percentages.items():
    summary_transposed.at[pct_row, 'Grand_Total'] = pct_value

In [None]:
# Put the rows in the order used by the report.
summary_transposed = summary_transposed.reindex(
    [
        'Number_of_Awarded_Projects',
        'Award_Amount',
        'Amount_Allocated',
        'Percent_Allocated',
        'Expended_Amount',
        'Expended_Percent_of_Awarded',
        'Expended_Percent_of_Allocated',
    ]
)

# Cleaning Allocation Sheet 

## Cleaning up PPNO, can only be 5 characters.

In [None]:
# A PPNO should be five characters long; keep only the first five characters
# of the allocation PPNO so it lines up with the PPNO in the project frame.
ppno_prefix = df_allocation['PPNO'].str[:5]
df_allocation = df_allocation.assign(PPNO_New=ppno_prefix)

In [None]:
# Crosswalk CSV with PPNO & Award Recipients.
# NOTE: FILE_NAME2 is reused here; it previously held the allocation
# workbook name.
FILE_NAME2 = "Allocation_PPNO_Crosswalk.csv"
allocation_ppno = pd.read_csv(f"{GCS_FILE_PATH}{FILE_NAME2}")

In [None]:
allocation_ppno # Display the crosswalk to check that it looks sensible.

In [None]:
# Attach the crosswalk to every allocation row (left join on award year and
# award recipient).
df_allocation = df_allocation.merge(
    allocation_ppno,
    how="left",
    on=["Award_Year", "Award_Recipient"],
)

In [None]:
# PPNO_New2 mixes strings and floats, so missing entries are detected by
# their string form ('nan'). Where PPNO_New2 is missing keep the sliced PPNO;
# otherwise take the PPNO_New2 value.
crosswalk_missing = df_allocation["PPNO_New2"].map(str) == 'nan'
df_allocation.PPNO_New = df_allocation["PPNO_New"].where(
    crosswalk_missing, df_allocation["PPNO_New2"]
)

In [None]:
# The original PPNO column is superseded by PPNO_New; remove it.
df_allocation = df_allocation.drop(columns=['PPNO'])

In [None]:
# Display the allocation frame's columns after the crosswalk merge and drop.
df_allocation.columns

In [None]:
# Tidier names: tag the expended amount as coming from the allocation sheet,
# promote PPNO_New to PPNO, and spell out the third-party date column.
df_allocation = df_allocation.rename(
    columns={
        'Expended_Amount': 'Expended_ALLOCATION',
        'PPNO_New': 'PPNO',
        '3rd_Party_Award_Date': 'Third_Party_Award_Date',
    }
)

In [None]:
# Filter out award year 2021, since that entry is blank.
# .copy() makes the filtered result an independent frame, so the column
# assignments that follow do not raise SettingWithCopyWarning.
df_allocation = df_allocation.query("Award_Year != 2021").copy()

## Cleaning up completion, allocation, 3rd Party dates, & LED dates

In [None]:
# List the distinct LED values to see which ones need manual cleaning.
df_allocation.LED.unique().tolist()

In [None]:
# Replace Allocation_Date cells that hold two dates, or an Excel serial
# number stored as text, with a single parseable date.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the column selection: the chained in-place
# form does not update the frame when pandas Copy-on-Write is active.
df_allocation["Allocation_Date"] = df_allocation["Allocation_Date"].replace({
    'October 15, 2018\nSeptember 30, 2021': '2018-10-15 00:00:00',
    '2/1/2021\n\n10/31/2022': '2021-02-01 00:00:00',
    # Excel serial 45211 is 2023-10-12 (serial 44927 is 2023-01-01, plus 284
    # days); the previous value '2023-10-22' was ten days off.
    '45211': '2023-10-12',
})

In [None]:
# List the distinct Allocation_Date values to find the remaining odd entries.
df_allocation.Allocation_Date.unique().tolist()

In [None]:
# Fiscal-year labels are not dates: map each "FY yy/zz" label to 31 December
# of its first year, and repair the one malformed date string.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the column selection: the chained in-place
# form does not update the frame when pandas Copy-on-Write is active.
df_allocation["Allocation_Date"] = df_allocation["Allocation_Date"].replace({
    "FY 26/27": "2026-12-31",
    "08/12//20": '2020-08-12 00:00:00',
    'FY 21/22': '2021-12-31',
    'FY 22/23': '2022-12-31',
    'FY 20/21': '2020-12-31',
    'FY 23/24': '2023-12-31',
    'FY 24/25': '2024-12-31',
    'FY 25/26': '2025-12-31',
})

In [None]:
# Apply the same text clean-up to each date column, one step at a time:
# slashes become dashes, 'Complete' and newlines are removed, 'Pending'
# becomes 'TBD', and missing values are filled with 'TBD'.
for date_col in ["Allocation_Date", "Third_Party_Award_Date", "Completion_Date", "LED"]:
    cleaned = df_allocation[date_col].replace('/', '-', regex=True)
    cleaned = cleaned.replace('Complete', '', regex=True)
    cleaned = cleaned.replace('\n', '', regex=True)
    cleaned = cleaned.replace('Pending', 'TBD', regex=True)
    df_allocation[date_col] = cleaned.fillna('TBD')

In [None]:
# List the distinct Completion_Date values to find entries needing manual fixes.
df_allocation.Completion_Date.unique().tolist()

In [None]:
# Map each irregular Completion_Date string to one parseable timestamp.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the column selection: the chained in-place
# form does not update the frame when pandas Copy-on-Write is active.
df_allocation['Completion_Date'] = df_allocation['Completion_Date'].replace({
    # NOTE(review): 'June 24. 2024' is mapped to the first of the month, not
    # the 24th — confirm which was intended.
    'June 24. 2024': '2024-06-01 00:00:00',
    '11/21/2024\n7/30/2025 (Q4)': '2024-11-21 00:00:00',
    # Was '2026-01-01'; every other 'Jun-YY' entry maps to June of that year.
    'Jun-26': '2026-06-01 00:00:00',
    'Jun-29': '2029-06-01 00:00:00',
    'Complete\n11/12/2019': '2019-11-12 00:00:00',
    'Deallocated': '',
    'Jun-28': '2028-06-01 00:00:00',
    'Jun-25': '2025-06-01 00:00:00',
    'Jun-23': '2023-06-01 00:00:00',
    'Jun-27': '2027-06-01 00:00:00',
    'Jan-25': '2025-01-01 00:00:00',
    '11-21-20247-30-2025 (Q4)': '2025-07-30 00:00:00',
    '6-30-202112-31-2021': '2021-12-31 00:00:00',
    '6-1-2019': '2019-06-01 00:00:00',
    '2-11-2018': '2018-02-11 00:00:00',
    '6-30-2020': '2020-06-30 00:00:00',
    ' 6-30-2018': '2018-06-30 00:00:00',
    '6-29-2020': '2020-06-29 00:00:00',
    '11-1-2019': '2019-11-01 00:00:00',
    ' 12-10-2018': '2018-12-10 00:00:00',
    ' 11-13-2019': '2019-11-13 00:00:00',
    '3-30-2020': '2020-03-30 00:00:00',
    ' 6-30-2020': '2020-06-30 00:00:00',
    '11-12-2019': '2019-11-12 00:00:00',
    '1-31-2020': '2020-01-31 00:00:00',
    '8-30-2020': '2020-08-30 00:00:00',
    # Was '2020,05-16 00:00:00' (comma typo in the date).
    '5-16-2020': '2020-05-16 00:00:00',
    '5-7-2020': '2020-05-07 00:00:00',
})

In [None]:
# Map irregular Third_Party_Award_Date values (a misspelled month, Excel
# serial numbers stored as text, 'TBD' followed by a date) to a parseable
# timestamp or to plain 'TBD'.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the column selection: the chained in-place
# form does not update the frame when pandas Copy-on-Write is active.
df_allocation["Third_Party_Award_Date"] = df_allocation["Third_Party_Award_Date"].replace({
    'Augsut 12, 2021': '2021-08-12 00:00:00',
    '43435': '2018-12-01 00:00:00',
    '07-29-2020': '2020-07-29 00:00:00',
    '43497': '2019-02-01 00:00:00',
    'TBD 6-24-2021': 'TBD',
    'TBD 6-30-2022': 'TBD',
})

In [None]:
# Parse each cleaned date column into a "<name>_New" column of date values;
# anything that cannot be parsed (e.g. 'TBD') is coerced to a missing date.
df_allocation = df_allocation.assign(
    **{
        f"{date_col}_New": pd.to_datetime(df_allocation[date_col], errors="coerce").dt.date
        for date_col in (
            "Allocation_Date",
            "Third_Party_Award_Date",
            "Completion_Date",
            "LED",
        )
    }
)

In [None]:
# Remove the raw text date columns, then give the parsed "_New" columns the
# original names. Allocation_Amount is also tagged as coming from the
# allocation sheet.
df_allocation = df_allocation.drop(
    columns=['Allocation_Date', 'Third_Party_Award_Date', 'Completion_Date', 'LED']
).rename(
    columns={
        'Allocation_Amount': 'Allocation_Allocation_Sheet',
        'Allocation_Date_New': 'Allocation_Date',
        'Third_Party_Award_Date_New': 'Third_Party_Award_Date',
        'Completion_Date_New': 'Completion_Date',
        'LED_New': 'LED',
    }
)

## Cleaning up Expended Amount
* Have to divide expended by allocated amount, cannot divide by 0. 
* 'Deallocation' is changed to 0

In [None]:
# Treat 'Deallocation' entries in the expended amount as 0.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the column selection: the chained in-place
# form does not update the frame when pandas Copy-on-Write is active.
df_allocation["Expended_ALLOCATION"] = df_allocation["Expended_ALLOCATION"].replace({'Deallocation': 0})

In [None]:
# List the distinct expended amounts to check that no text values remain.
df_allocation.Expended_ALLOCATION.unique().tolist()

# Cleaning Project Sheet



## Filling NA for TIRCP and Expended Amounts

In [None]:
# Fill missing award and expended amounts with 0.
# fillna with a per-column mapping returns a new frame that is rebound to
# df_project, which avoids assigning into a slice of `project`
# (SettingWithCopyWarning).
df_project = df_project.fillna({'TIRCP_Award_Amount_($)': 0, 'Expended_Amount': 0})

In [None]:
# Count the missing values remaining in each project column.
df_project.isna().sum()

## Cleaning up PPNO Numbers based on Allocation Sheet

In [None]:
# Keep only the first five characters of each project PPNO.
project_ppno_prefix = df_project['PPNO'].str[:5]
df_project = df_project.assign(PPNO_New=project_ppno_prefix)

In [None]:
# Load the Excel crosswalk sheet for project PPNOs from the same bucket folder.
FILE_NAME3 = "Projects_PPNO.xlsx"
project_ppno = pd.read_excel(f"{GCS_FILE_PATH}{FILE_NAME3}")

In [None]:
# Attach the crosswalk to every project row (left join on award year and
# local agency).
df_project2 = df_project.merge(
    project_ppno,
    how="left",
    on=["Award_Year", "Local_Agency"],
)

In [None]:
# PPNO_New2 mixes strings and floats, so missing entries are detected by
# their string form ('nan'). Where PPNO_New2 is missing keep the sliced PPNO;
# otherwise take the PPNO_New2 value.
project_crosswalk_missing = df_project2["PPNO_New2"].map(str) == 'nan'
df_project2.PPNO_New = df_project2["PPNO_New"].where(
    project_crosswalk_missing, df_project2["PPNO_New2"]
)

In [None]:
# Cast PPNO_New to str ahead of the set comparison against the allocation
# PPNOs.
# NOTE(review): this cast may turn a missing value into the literal string
# 'nan' — confirm that no PPNO is missing at this point.
df_project2 = df_project2.astype({'PPNO_New': 'str'})

In [None]:
# Distinct PPNO values from each frame, as sets, so they can be compared.
PPNO_project = set(df_project2.PPNO_New.unique().tolist())
PPNO_allocation = set(df_allocation.PPNO.unique().tolist())

In [None]:
# PPNOs present in the project frame but absent from the allocation frame
# (the author observed none). The check is one-directional: PPNOs found only
# in the allocation frame would need PPNO_allocation - PPNO_project.
PPNO_project - PPNO_allocation

In [None]:
# Preview the first two project rows after the crosswalk merge.
df_project2.head(2)

In [None]:
# The raw PPNO and the crosswalk helper column are no longer needed.
df_project2 = df_project2.drop(columns=['PPNO', 'PPNO_New2'])

In [None]:
# Promote PPNO_New to PPNO and tag the allocated amount as coming from the
# project sheet.
df_project2 = df_project2.rename(
    columns={
        'PPNO_New': 'PPNO',
        'Allocated_Amount': 'Allocation_Amount_PROJECT',
    }
)

In [None]:
# Preview the first two allocation rows before merging.
df_allocation.head(2)

# Merging Project & Allocations
* Merge on PPNO & Award_Year...using projects on the left? 
* Proper way to drop duplicates? 
* Use https://docs.python.org/3/library/uuid.html to identify each row, or use the row number, or grab .index and put it in a column.

In [None]:
# Left join with the allocation frame on the left: every allocation row is
# kept, and project columns are filled in where PPNO and Award_Year match.
df_combined = df_allocation.merge(df_project2, how = "left", on = ["PPNO", "Award_Year"])

In [None]:
# Row and column counts right after the merge.
df_combined.shape

In [None]:
# Drop rows that are exact duplicates of an earlier row across every column.
df_combined2 = df_combined.drop_duplicates() 

In [None]:
# Row and column counts after removing duplicate rows.
df_combined2.shape

### Filling in NA for monetary columns

In [None]:
# Fill missing award and SB1 funding amounts with 0.
# fillna with a per-column mapping returns a new frame that is rebound to
# df_combined2, which avoids assigning into the drop_duplicates() result
# (SettingWithCopyWarning).
df_combined2 = df_combined2.fillna({'TIRCP_Award_Amount_($)': 0, 'SB1_Funding': 0})

In [None]:
# These two project-sheet columns are not used in the report.
df_combined2 = df_combined2.drop(['Local_Agency', 'Key_Project_Elements'], axis=1)

In [None]:
# Display the columns that remain in the combined frame.
df_combined2.columns

### Calculate out Percent of Allocation Expended  & Percent of Award Fully Allocated 

In [None]:
# Compute the two percentage columns.
# The operands are coerced to numeric first. Some of these columns held text
# earlier in the notebook (e.g. 'Deallocation'), so they may still be object
# dtype, where division runs element by element and can raise on a string or
# a zero denominator. With numeric dtype, unparseable values become NaN and
# a zero denominator yields inf/NaN instead of an exception.
expended = pd.to_numeric(df_combined2['Expended_ALLOCATION'], errors='coerce')
allocated_on_allocation_sheet = pd.to_numeric(df_combined2['Allocation_Allocation_Sheet'], errors='coerce')
allocated_on_project_sheet = pd.to_numeric(df_combined2['Allocation_Amount_PROJECT'], errors='coerce')
awarded = pd.to_numeric(df_combined2['TIRCP_Award_Amount_($)'], errors='coerce')

df_combined2 = df_combined2.assign(
    Percent_of_Allocation_Expended = (expended / allocated_on_allocation_sheet) * 100,
    Percent_of_Award_Fully_Allocated = (allocated_on_project_sheet / awarded) * 100)

In [None]:
# Monetary and percentage columns to be coerced to numeric.
cols = ['Expended_ALLOCATION','Allocation_Allocation_Sheet','TIRCP_Award_Amount_($)','Expended_Amount','GGRF_Funding','SB1_Funding','Percent_of_Allocation_Expended', 'Percent_of_Award_Fully_Allocated']

In [None]:
# Coerce the monetary and percentage columns to numeric; values that cannot
# be parsed become NaN (errors='coerce').
df_combined2[cols] = df_combined2[cols].apply(pd.to_numeric, errors='coerce')

In [None]:
# Rename the columns to match the headings used in the report sheet.
# NOTE(review): 'Percentge_Allocated' does not appear among the columns built
# in the visible code; rename() ignores keys that are not present by default,
# so that entry is probably a no-op — confirm or remove.
df_combined2 = df_combined2.rename(columns = {'LED': 'Phase_Completion_Date','SB1_Funding':'PTA-SB1 Allocation Amount',  'Percentge_Allocated': 'Percentage Allocated', 'TIRCP_Award_Amount_($)': 'TIRCP_Award_Amount',
'Third_Party_Award_Date':'CON_Contract_Award_Date'})

In [None]:
# Row and column counts after the rename.
df_combined2.shape

### Filling in NA dates with an obviously fake placeholder date


In [None]:
# Count the missing values in each column before filling the dates.
df_combined2.isna().sum()

In [None]:
# Placeholder timestamp (2100-01-01) used to fill in missing dates.
missing_date = pd.to_datetime('2100-01-01')

In [None]:
# Replace every missing value in the four date columns with the placeholder.
columns_with_dates = (
    "Allocation_Date",
    "CON_Contract_Award_Date",
    "Completion_Date",
    "Phase_Completion_Date",
)
for date_col in columns_with_dates:
    filled = df_combined2[date_col].fillna(missing_date)
    df_combined2[date_col] = filled

In [None]:
# Convert the four date columns to pandas datetimes.
datetime_columns = [
    'Allocation_Date',
    'Completion_Date',
    'Phase_Completion_Date',
    'CON_Contract_Award_Date',
]
df_combined2[datetime_columns] = df_combined2[datetime_columns].apply(pd.to_datetime)

### Create Allocated before July 2020 Date Column 
* If the allocation date is after 7-31-2020 the flag is 0; if it is on or before 7-31-2020 the flag is 1.

In [None]:
# Flag is 0 when the allocation date is later than 2020-07-31 and 1
# otherwise (on or before the cutoff).
cutoff = pd.Timestamp(2020, 7, 31, 0)
allocated_after_cutoff = df_combined2["Allocation_Date"] > cutoff
df_combined2 = df_combined2.assign(
    Allocated_Before_July_31_2020_1_is_yes=(~allocated_after_cutoff).astype("int64")
)

# Mimic sheet


In [None]:
# Build the report layout: group by the descriptive project / phase / date
# columns, sum the dollar columns, and take the maximum of the expended
# percentage and the cutoff flag within each group.
group_keys = [
    'Award_Year',
    'Project_#',
    'Award_Recipient',
    'Project_Title',
    'TIRCP_Award_Amount',
    'Percent_of_Award_Fully_Allocated',
    'Implementing_Agency',
    'Components',
    'PPNO',
    'Phase',
    "Allocation_Date",
    "CON_Contract_Award_Date",
    "Completion_Date",
    "Phase_Completion_Date",
]
aggregations = {
    'Allocation_Allocation_Sheet': 'sum',
    'GGRF_Funding': 'sum',
    'Expended_ALLOCATION': 'sum',
    'PTA-SB1 Allocation Amount': 'sum',
    'Percent_of_Allocation_Expended': 'max',
    'Allocated_Before_July_31_2020_1_is_yes': 'max',
}
df_pivot = df_combined2.groupby(group_keys).agg(aggregations)

In [None]:
# Preview the last ten rows of the grouped table.
df_pivot.tail(10)

# Export into Excel
* Dataframes to export: summary_transposed and df_pivot
* https://www.geeksforgeeks.org/how-to-write-pandas-dataframes-to-multiple-excel-sheets/

In [None]:
# Final tidy-up: drop the sheet-specific suffixes from the two amount columns.
df_pivot = df_pivot.rename(
    columns={
        'Expended_ALLOCATION': 'Expended_Amount',
        'Allocation_Allocation_Sheet': 'Allocation_Amount',
    }
)

In [None]:
# Write both report tables to one workbook, one sheet per table, keeping the
# index of each frame.
output_path = "gs://calitp-analytics-data/data-analyses/tircp/TIRCP_SAR_2022.xlsx"
report_sheets = {"Summary": summary_transposed, "FY": df_pivot}
with pd.ExcelWriter(output_path) as writer:
    for sheet_name, frame in report_sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=True)