# TIRCP SAR Report
----


In [1]:
import pandas as pd
import math
from siuba import * 
import numpy as np
pd.options.display.max_columns = 50
pd.options.display.float_format = "{:.2f}".format
import datetime

In [2]:
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/tircp/"
FILE_NAME1 = "Raw_Project_Tracking_Sheet.xlsx"
project = pd.read_excel(f"{GCS_FILE_PATH}{FILE_NAME1}")
FILE_NAME2 = "Allocation_Agreement.xlsx"
allocation = pd.read_excel(f"{GCS_FILE_PATH}{FILE_NAME2}")

In [3]:
#cleaning up spaces in columns
project.columns = project.columns.str.strip().str.replace(' ', '_')
allocation.columns = allocation.columns.str.strip().str.replace(' ', '_')

In [4]:
#strip spaces in columns
project.columns = project.columns.map(lambda x: x.strip())
allocation.columns = allocation.columns.map(lambda x: x.strip())

## Keeping only relevant columns.

In [5]:
project.columns

Index(['Award_Year', 'Project_#', 'Local_Agency', 'Vendor_ID_#',
       'Project_Title', 'PPNO', 'District', 'County', 'Key_Project_Elements',
       'Master_Agreement_Number', 'Master_Agreement_Expiration_Date',
       'Project_Manager', 'Regional_Coordinator',
       'Technical_Assistance-CALTP_(Y/N)', 'Technical_Assistance-Fleet_(Y/N)',
       'Technical_Assistance-Network_Integration_(Y/N)',
       'Technical_Assistance-Priority_Population_(Y/N)', 'Total_Project_Cost',
       'TIRCP_Award_Amount_($)', 'Allocated_Amount', 'Unallocated_Amount',
       'Percentge_Allocated', 'Expended_Amount', 'Other_Funds_Involved',
       'Award_Cycle', 'Local_Agency_Address', 'Local_Agency_City',
       'Local_Agency_Zip', 'Local_Agency_Contact', 'Local_Agency_Email',
       'Local_Agency_Phone_Number', 'Comments/Additional_Contacts'],
      dtype='object')

In [6]:
#subsetting for only columns of interest
df_project = project[['Award_Year', 'Project_#','Local_Agency','Project_Title','PPNO',
'Key_Project_Elements','TIRCP_Award_Amount_($)','Expended_Amount','Allocated_Amount']]

In [7]:
allocation.columns

Index(['Award_Year', 'Project_#', 'Award_Recipient', 'Implementing_Agency',
       'PPNO', 'Project_ID', 'EA', 'Components', 'Phase', 'Allocation_Amount',
       'Expended_Amount', 'SB1_Funding', 'SB1_Budget_Year', 'GGRF_Funding',
       'GGRF_Budget_Year', 'CTC_Financial_Resolution',
       'CTC_Allocation_Amendment', 'CTC_Waiver', 'CTC_CalSTA_Waiver',
       'Allocation_Date', 'Completion_Date', 'PSA_#', 'CT_Document_#',
       '3rd_Party_Award_Date', 'LED', 'Date_Branch_Chief_Receives_PSA',
       'Date_Regional_Coordinator_Receives_PSA', 'Date_OC_Receives_PSA',
       'Date_OPM_Receives_PSA', 'Date_Legal_Receives_PSA',
       'Date_Returned_to_PM', 'Date_PSA_Sent_to_Local_Agency',
       'Date_PSA_Approved_by_Local_Agency', 'Date_Signed_by_DRMT',
       'PSA_Expiry_Date', 'LONP', 'Prior_Fiscal_Years_to_2020',
       'Fiscal_Year_2020-2021', 'Fiscal_Year_2021-2022',
       'Fiscal_Year_2022-2023', 'Fiscal_Year_2023-2024',
       'Fiscal_Year_2024-2025', 'Fiscal_Year_2025-2026',
    

In [8]:
#subsetting for only columns of interest
df_allocation = allocation[['Expended_Amount','Award_Year','Award_Recipient', 'Implementing_Agency', 'Allocation_Amount', 'GGRF_Funding','PPNO','Phase', 'LED','Allocation_Date','Completion_Date','3rd_Party_Award_Date','Components','SB1_Funding']]

# Make Summary Page
### Table 2: Summary of Awards (Cumulative)
* Linda stated she only needs table 2.
* Linda says she is unsure where to get the completed project row from Highlands tracking sheet. Needs to be discussed among TIRCP team to decide if they will include.
* The data from the Highlands project tracking sheet does not match the TIRCP SAR report Linda gave me 


In [9]:
#pivot
summary = project.drop_duplicates().groupby(['Award_Year']).agg({'Project_#':'count','TIRCP_Award_Amount_($)':'sum', 
 'Allocated_Amount':'sum','Expended_Amount':'sum'}).reset_index()

In [10]:
#renaming columns to match report
summary = summary.rename(columns = {'Project_#':'Number_of_Awarded_Projects',
                                   'TIRCP_Award_Amount_($)': 'Award_Amount','Allocated_Amount':'Amount_Allocated'})

In [11]:
summary

Unnamed: 0,Award_Year,Number_of_Awarded_Projects,Award_Amount,Amount_Allocated,Expended_Amount
0,2015,14,224328000.0,224278000,143556084.9
1,2016,15,788444000.0,422477816,119517737.55
2,2018,28,4251722000.0,1627802000,210456649.72
3,2020,17,500000000.0,74030000,1823462.51


* Can't do this more neatly with "assign"

In [12]:
#create percentages
summary['Expended_Percent_of_Awarded'] = (summary['Expended_Amount']/summary['Award_Amount'])*100
summary['Expended_Percent_of_Allocated'] = (summary['Expended_Amount']/summary['Amount_Allocated'])*100
summary['Percent_Allocated'] = (summary['Amount_Allocated']/summary['Award_Amount'])*100

In [13]:
#transpose 
summary_transposed = summary.set_index('Award_Year').T

In [14]:
#grand totals for monetary columns
list_to_add = ['Award_Amount','Amount_Allocated','Expended_Amount', 'Number_of_Awarded_Projects']

In [15]:
summary_transposed['Grand_Total']=summary_transposed.loc[list_to_add, :].sum(axis=1)

In [16]:
#grand total variables of each monetary column to fill in percentages below.
Exp = summary_transposed.at['Expended_Amount','Grand_Total']
Alloc = summary_transposed.at['Amount_Allocated','Grand_Total']
TIRCP = summary_transposed.at['Award_Amount','Grand_Total']

In [17]:
#filling in totals of percentages
summary_transposed.at['Expended_Percent_of_Awarded','Grand_Total'] = (Exp/TIRCP)*100
summary_transposed.at['Expended_Percent_of_Allocated','Grand_Total'] = (Exp/Alloc)*100
summary_transposed.at['Percent_Allocated','Grand_Total'] = (Alloc/TIRCP)*100

In [18]:
#switching rows to correct order
summary_transposed = summary_transposed.reindex(['Number_of_Awarded_Projects', 'Award_Amount', 'Amount_Allocated','Percent_Allocated','Expended_Amount', 'Expended_Percent_of_Awarded', 'Expended_Percent_of_Allocated'])

# Cleaning Allocation Sheet 

## Cleaning up PPNO, can only be 5 characters.

In [19]:
#remove the extra characters in PPNO in allocation to match the PPNO in project data frame bc there should only be five characters and numbers in each PPNO value
df_allocation = df_allocation.assign(
    PPNO_New = df_allocation['PPNO'].str.slice(start=0, stop=5)
)

In [20]:
#CSV with PPNO & Award Recipients
FILE_NAME2 = "Allocation_PPNO_Crosswalk.csv"
allocation_ppno = pd.read_csv(f"{GCS_FILE_PATH}{FILE_NAME2}")

In [21]:
allocation_ppno #printing to make sure it makes sense.

Unnamed: 0,Award_Year,PPNO_New2,Award_Recipient
0,2020,CP065,Los Angeles County Metropolitan Transportation...
1,2020,CP066,Los Angeles-San Diego-San Luis Obispo Rail Cor...
2,2016,1230,San Bernardino County Transportation Authority...
3,2018,1155,Transportation Agency for Monterey County


In [22]:
#Merge in Crosswalk 
df_allocation = pd.merge(df_allocation, allocation_ppno, on = ["Award_Year", "Award_Recipient"], how = "left")

In [23]:
#some values in PPNO and PPNO_New2 are strings, some are floats...so have to convert PPNO New 2 to strings
df_allocation.PPNO_New = df_allocation.apply(lambda x: x.PPNO_New if (str(x.PPNO_New2) == 'nan') else x.PPNO_New2, axis=1)

In [24]:
#drop old column
df_allocation = df_allocation.drop(['PPNO'], axis=1)

In [25]:
df_allocation.columns

Index(['Expended_Amount', 'Award_Year', 'Award_Recipient',
       'Implementing_Agency', 'Allocation_Amount', 'GGRF_Funding', 'Phase',
       'LED', 'Allocation_Date', 'Completion_Date', '3rd_Party_Award_Date',
       'Components', 'SB1_Funding', 'PPNO_New', 'PPNO_New2'],
      dtype='object')

In [26]:
#renaming columns to something neater
df_allocation = df_allocation.rename(columns = {'Expended_Amount': 'Expended_ALLOCATION','PPNO_New':'PPNO', '3rd_Party_Award_Date':'Third_Party_Award_Date'})

In [27]:
#Filtering out for 2021, since that entry is blank
df_allocation = df_allocation.query("Award_Year != 2021")

## Cleaning up completion, allocation, 3rd Party dates, & LED dates

In [28]:
df_allocation.LED.unique().tolist()

[datetime.datetime(2022, 3, 31, 0, 0),
 datetime.datetime(2019, 6, 1, 0, 0),
 datetime.datetime(2020, 6, 28, 0, 0),
 datetime.datetime(2021, 6, 30, 0, 0),
 datetime.datetime(2019, 11, 3, 0, 0),
 datetime.datetime(2018, 11, 30, 0, 0),
 datetime.datetime(2020, 6, 30, 0, 0),
 datetime.datetime(2019, 1, 8, 0, 0),
 datetime.datetime(2018, 6, 30, 0, 0),
 datetime.datetime(2020, 6, 29, 0, 0),
 datetime.datetime(2019, 11, 1, 0, 0),
 datetime.datetime(2018, 12, 10, 0, 0),
 datetime.datetime(2021, 2, 2, 0, 0),
 datetime.datetime(2020, 6, 23, 0, 0),
 datetime.datetime(2021, 9, 30, 0, 0),
 'October 15, 2018\nSeptember 30, 2021',
 datetime.datetime(2020, 5, 16, 0, 0),
 datetime.datetime(2021, 1, 9, 0, 0),
 datetime.datetime(2020, 5, 23, 0, 0),
 datetime.datetime(2019, 6, 30, 0, 0),
 datetime.datetime(2024, 2, 28, 0, 0),
 datetime.datetime(2021, 2, 28, 0, 0),
 'TBD',
 datetime.datetime(2024, 6, 24, 0, 0),
 datetime.datetime(2022, 12, 30, 0, 0),
 datetime.datetime(2023, 6, 25, 0, 0),
 datetime.dateti

In [29]:
#changing some of the dates
df_allocation["Allocation_Date"].replace({'October 15, 2018\nSeptember 30, 2021': '2018-10-15 00:00:00',
 '2/1/2021\n\n10/31/2022':'2021-02-01 00:00:00', '45211':'2023-10-22'}, inplace =True)

In [30]:
#cleaning up allocation dates
df_allocation.Allocation_Date.unique().tolist()

[datetime.datetime(2015, 10, 22, 0, 0),
 datetime.datetime(2016, 5, 19, 0, 0),
 datetime.datetime(2016, 6, 30, 0, 0),
 datetime.datetime(2015, 12, 10, 0, 0),
 datetime.datetime(2015, 8, 27, 0, 0),
 datetime.datetime(2016, 1, 21, 0, 0),
 datetime.datetime(2017, 6, 29, 0, 0),
 datetime.datetime(2016, 10, 20, 0, 0),
 datetime.datetime(2017, 8, 17, 0, 0),
 datetime.datetime(2018, 1, 31, 0, 0),
 datetime.datetime(2017, 1, 19, 0, 0),
 datetime.datetime(2016, 3, 17, 0, 0),
 datetime.datetime(2017, 3, 16, 0, 0),
 datetime.datetime(2017, 5, 17, 0, 0),
 datetime.datetime(2018, 8, 16, 0, 0),
 'TBD',
 datetime.datetime(2021, 6, 24, 0, 0),
 datetime.datetime(2016, 12, 8, 0, 0),
 datetime.datetime(2020, 6, 25, 0, 0),
 datetime.datetime(2019, 12, 5, 0, 0),
 datetime.datetime(2018, 10, 18, 0, 0),
 datetime.datetime(2021, 1, 28, 0, 0),
 nan,
 datetime.datetime(2018, 2, 1, 0, 0),
 datetime.datetime(2018, 5, 17, 0, 0),
 'FY 26/27',
 datetime.datetime(2017, 5, 18, 0, 0),
 datetime.datetime(2018, 6, 28, 0,

In [31]:
#Had to change FY to an actual date 
df_allocation["Allocation_Date"].replace({"FY 26/27": "2026-12-31", "08/12//20": '2020-08-12 00:00:00', 'FY 21/22': '2021-12-31',
                                         'FY 22/23': '2022-12-31','FY 20/21': '2020-12-31', 'FY 23/24': '2023-12-31','FY 24/25': '2024-12-31','FY 25/26': '2025-12-31'}, inplace =True)

In [32]:
#clean up columns in a loop
for i in ["Allocation_Date", "Third_Party_Award_Date", "Completion_Date", "LED"]:
    df_allocation[i] = df_allocation[i].replace('/', '-', regex = True).replace('Complete', '', regex = True).replace('\n', '', regex=True).replace('Pending','TBD',regex= True).fillna('TBD')

In [33]:
df_allocation.Completion_Date.unique().tolist()

[datetime.datetime(2022, 3, 30, 0, 0),
 '6-1-2019',
 datetime.datetime(2021, 6, 30, 0, 0),
 datetime.datetime(2018, 9, 30, 0, 0),
 '2-11-2018',
 '6-30-2020',
 datetime.datetime(2020, 9, 30, 0, 0),
 ' 6-30-2018',
 '6-29-2020',
 '11-1-2019',
 ' 12-10-2018',
 ' 11-13-2019',
 '3-30-2020',
 datetime.datetime(2022, 9, 30, 0, 0),
 datetime.datetime(2021, 12, 30, 0, 0),
 datetime.datetime(2021, 9, 30, 0, 0),
 '5-16-2020',
 datetime.datetime(2024, 6, 30, 0, 0),
 'TBD',
 'June 24. 2024',
 datetime.datetime(2022, 12, 30, 0, 0),
 datetime.datetime(2024, 6, 24, 0, 0),
 '11-21-20247-30-2025 (Q4)',
 datetime.datetime(2022, 6, 30, 0, 0),
 datetime.datetime(2019, 5, 21, 0, 0),
 datetime.datetime(2024, 7, 25, 0, 0),
 datetime.datetime(2021, 12, 31, 0, 0),
 datetime.datetime(2024, 1, 28, 0, 0),
 datetime.datetime(2022, 10, 31, 0, 0),
 datetime.datetime(2022, 1, 16, 0, 0),
 datetime.datetime(2018, 2, 1, 0, 0),
 datetime.datetime(2022, 8, 22, 0, 0),
 datetime.datetime(2022, 7, 31, 0, 0),
 '5-7-2020',
 date

In [34]:
#cleaning up completion dates
df_allocation['Completion_Date'].replace({ 
    'June 24. 2024': '2024-06-01 00:00:00',  
    '11/21/2024\n7/30/2025 (Q4)': '2024-11-21 00:00:00', 
    'Jun-26': '2026-01-01 00:00:00', 
     'Jun-29': '2029-06-01 00:00:00',
    'Complete\n11/12/2019': '2019-11-12 00:00:00' , 
    'Deallocated': '', 
    'Jun-28': '2028-06-01 00:00:00',  
    'Jun-25': '2025-06-01 00:00:00', 
    'Jun-23':'2023-06-01 00:00:00', 
    'Jun-27': '2027-06-01 00:00:00',
    'Jan-25': '2025-01-01 00:00:00',
    '11-21-20247-30-2025 (Q4)':'2025-07-30 00:00:00',
    '6-30-202112-31-2021': '2021-12-31 00:00:00',
    '6-1-2019': '2019-06-01 00:00:00',
    '2-11-2018': '2018-02-11 00:00:00',
     '6-30-2020': '2020-06-30 00:00:00',
    ' 6-30-2018': '2018-06-30 00:00:00',
     '6-29-2020': '2020-06-29 00:00:00',
     '11-1-2019': '2019-11-01 00:00:00',
     ' 12-10-2018': '2018-12-10 00:00:00',
     ' 11-13-2019': '2019-11-13 00:00:00',
     '3-30-2020':'2020-03-30 00:00:00',
    ' 6-30-2020': '2020-06-30 00:00:00',
    '11-12-2019': '2019-11-12 00:00:00',
    '1-31-2020': '2020-01-31 00:00:00',
    '8-30-2020': '2020-08-30 00:00:00',
    '5-16-2020': '2020,05-16 00:00:00',
     '5-7-2020': '2020-05-07 00:00:00'}, inplace =True)

In [35]:
#cleaning up 3rd Party dates
df_allocation["Third_Party_Award_Date"].replace({ 
'Augsut 12, 2021': '2021-08-12 00:00:00',
'43435': '2018-12-01 00:00:00',
'07-29-2020': '2020-07-29 00:00:00',
'43497' : '2019-02-01 00:00:00',
'TBD 6-24-2021' : 'TBD',
'TBD 6-30-2022' : 'TBD'
}, inplace =True)

In [36]:
#coercing to dates
df_allocation = df_allocation.assign(
    Allocation_Date_New = pd.to_datetime(df_allocation.Allocation_Date, errors="coerce").dt.date,
    Third_Party_Award_Date_New = pd.to_datetime(df_allocation.Third_Party_Award_Date, errors="coerce").dt.date,
    Completion_Date_New = pd.to_datetime(df_allocation.Completion_Date, errors="coerce").dt.date,
    LED_New = pd.to_datetime(df_allocation.LED, errors="coerce").dt.date
)

In [37]:
#drop old columns
df_allocation = df_allocation.drop(['Allocation_Date','Third_Party_Award_Date','Completion_Date', 'LED'], axis=1)
#rename columns
df_allocation = df_allocation.rename(columns = {'Allocation_Amount':'Allocation_Allocation_Sheet', 'Allocation_Date_New':'Allocation_Date','Third_Party_Award_Date_New':'Third_Party_Award_Date',
                                               'Completion_Date_New': 'Completion_Date','LED_New': 'LED'})

In [38]:
df_allocation.head(2)

Unnamed: 0,Expended_ALLOCATION,Award_Year,Award_Recipient,Implementing_Agency,Allocation_Allocation_Sheet,GGRF_Funding,Phase,Components,SB1_Funding,PPNO,PPNO_New2,Allocation_Date,Third_Party_Award_Date,Completion_Date,LED
0,21714177.53,2015,Antelope Valley Transit Authority,Antelope Valley Transit Authority,24403000.0,24403000.0,CONST,Purchase 13 60-foot articulated BRT buses and ...,0.0,CP005,,2015-10-22,2016-03-14,2022-03-30,2022-03-31
1,4619999.9,2015,Capitol Corridor Joint Powers Authority,Capitol Corridor Joint Powers Authority,4620000.0,4620000.0,CONST,Track and curve improvements between San Jose ...,0.0,CP012,,2016-05-19,2016-06-01,2019-06-01,2019-06-01


## Cleaning up Expended Amount
* Have to divide expended by allocated amount, cannot divide by 0. 
* 'Deallocation' is changed to 0

In [39]:
df_allocation["Expended_ALLOCATION"].replace({'Deallocation': 0}, inplace=True)

In [40]:
df_allocation.Expended_ALLOCATION.unique().tolist()

[21714177.53,
 4619999.9,
 38494000.0,
 277840.47000000003,
 0.0,
 4000000.0,
 713000.0,
 27463000.0,
 15000.0,
 185000.0,
 6841000.0,
 37583067.0,
 1650000.0,
 5425722.49,
 2449000.0,
 172650.41,
 93803.62,
 1119881.34,
 3101000.0,
 415969.53,
 415969.27,
 49622714.33,
 11760285.669999998,
 4017000.0,
 600000.0,
 14102518.859999998,
 9204000.0,
 16750767.999999998,
 500000.0,
 610535.78,
 13492703.78,
 1726120.92,
 2584518.6599999997,
 57296.59,
 4705550.28,
 4838990.289999999,
 476432.37,
 157092.4,
 2299000.0,
 4012067.29,
 58994.67,
 11618.56,
 186423.28,
 56327785.99999999,
 2926950.8,
 243169.05000000002,
 1742773.01,
 9185571.96,
 20000000.0,
 15500000.0,
 666350.46,
 513117.39999999997,
 125705.22999999998,
 2875000.0,
 215500.0,
 11747351.6,
 170791.14,
 5965815.46,
 11506.73,
 4842.1,
 9846466.27,
 572.07,
 169541.03,
 3560346.85,
 80433.78,
 157969.19999999998,
 3870225.4600000004,
 1823462.51]

# Cleaning Project Sheet



## Filling NA for TIRCP and Expended Amounts

In [41]:
df_project[['TIRCP_Award_Amount_($)', 'Expended_Amount']] = df_project[['TIRCP_Award_Amount_($)', 'Expended_Amount']].fillna(value=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [42]:
#checking for nas
df_project.isna().sum()

Award_Year                0
Project_#                 0
Local_Agency              0
Project_Title             0
PPNO                      5
Key_Project_Elements      0
TIRCP_Award_Amount_($)    0
Expended_Amount           0
Allocated_Amount          0
dtype: int64

## Cleaning up PPNO Numbers based on Allocation Sheet

In [43]:
#slicing PPNO to be 5 characters
df_project = df_project.assign(PPNO_New = df_project['PPNO'].str.slice(start=0, stop=5)) 

In [44]:
#importing Excel crosswalk sheet
FILE_NAME3 = "Projects_PPNO.xlsx"
project_ppno = pd.read_excel(f"{GCS_FILE_PATH}{FILE_NAME3}")

In [45]:
#Merge in Crosswalk 
df_project2 = pd.merge(df_project, project_ppno, on = ["Award_Year", "Local_Agency"], how = "left")

In [46]:
#some values in PPNO and PPNO_New2 are strings, some are floats...so have to convert PPNO New 2 to strings
df_project2.PPNO_New = df_project2.apply(lambda x: x.PPNO_New if (str(x.PPNO_New2) == 'nan') else x.PPNO_New2, axis=1)

In [47]:
#making sure PPNO_New is a string 
df_project2 = df_project2.astype({'PPNO_New': 'str'})

In [48]:
PPNO_project = set(df_project2.PPNO_New.unique().tolist())
PPNO_allocation = set(df_allocation.PPNO.unique().tolist())

In [49]:
#checking for differences - none. yay. 
PPNO_project - PPNO_allocation

set()

In [50]:
df_project2.head(2)

Unnamed: 0,Award_Year,Project_#,Local_Agency,Project_Title,PPNO,Key_Project_Elements,TIRCP_Award_Amount_($),Expended_Amount,Allocated_Amount,PPNO_New,PPNO_New2
0,2015,1,Antelope Valley Transit Authority (AVTA),Regional Transit Interconnectivity & Environme...,CP005,Purchase 13 60-foot articulated BRT buses and ...,24403000.0,21714177.53,24403000,CP005,
1,2015,2,Capitol Corridor Joint Powers Authority,Travel Time Reduction Project,CP012,Track and curve improvements between San Jose ...,4620000.0,4619999.9,4620000,CP012,


In [51]:
#drop old column
df_project2 = df_project2.drop(['PPNO', 'PPNO_New2'], axis=1)

In [52]:
#renaming to something neater
df_project2 = df_project2.rename(columns = {'PPNO_New':'PPNO', 'Allocated_Amount':'Allocation_Amount_PROJECT'})

In [53]:
df_allocation.head(2)

Unnamed: 0,Expended_ALLOCATION,Award_Year,Award_Recipient,Implementing_Agency,Allocation_Allocation_Sheet,GGRF_Funding,Phase,Components,SB1_Funding,PPNO,PPNO_New2,Allocation_Date,Third_Party_Award_Date,Completion_Date,LED
0,21714177.53,2015,Antelope Valley Transit Authority,Antelope Valley Transit Authority,24403000.0,24403000.0,CONST,Purchase 13 60-foot articulated BRT buses and ...,0.0,CP005,,2015-10-22,2016-03-14,2022-03-30,2022-03-31
1,4619999.9,2015,Capitol Corridor Joint Powers Authority,Capitol Corridor Joint Powers Authority,4620000.0,4620000.0,CONST,Track and curve improvements between San Jose ...,0.0,CP012,,2016-05-19,2016-06-01,2019-06-01,2019-06-01


# Merging Project & Allocations
* Merge on PPNO & Award_Year...using projects on the left? 
* Proper way to drop duplicates? 
* Use https://docs.python.org/3/library/uuid.html to identify each row or row number or grab .index and put it ina column.

In [54]:
#merge on left for projects.
df_combined = df_allocation.merge(df_project2, how = "left", on = ["PPNO", "Award_Year"])

In [55]:
df_combined.shape

(319, 22)

In [56]:
#dropping all duplicates...every column that is a duplicate is droped
df_combined2 = df_combined.drop_duplicates() 

In [57]:
df_combined2.shape

(295, 22)

In [58]:
df_combined.isna().sum()

Expended_ALLOCATION              0
Award_Year                       0
Award_Recipient                  0
Implementing_Agency              0
Allocation_Allocation_Sheet     10
GGRF_Funding                   122
Phase                            1
Components                       0
SB1_Funding                    120
PPNO                             0
PPNO_New2                      305
Allocation_Date                 90
Third_Party_Award_Date         154
Completion_Date                 90
LED                            179
Project_#                       33
Local_Agency                    33
Project_Title                   33
Key_Project_Elements            33
TIRCP_Award_Amount_($)          33
Expended_Amount                 33
Allocation_Amount_PROJECT       33
dtype: int64

### Filling in NA for monetary columns

In [59]:
#Just filling in NA values
df_combined2[['TIRCP_Award_Amount_($)','SB1_Funding']] = df_combined2[['TIRCP_Award_Amount_($)', 'SB1_Funding']].fillna(value=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [60]:
#drop columns that aren't needed
df_combined2 = df_combined2.drop(columns=['Local_Agency','Key_Project_Elements'])

In [61]:
df_combined2.columns

Index(['Expended_ALLOCATION', 'Award_Year', 'Award_Recipient',
       'Implementing_Agency', 'Allocation_Allocation_Sheet', 'GGRF_Funding',
       'Phase', 'Components', 'SB1_Funding', 'PPNO', 'PPNO_New2',
       'Allocation_Date', 'Third_Party_Award_Date', 'Completion_Date', 'LED',
       'Project_#', 'Project_Title', 'TIRCP_Award_Amount_($)',
       'Expended_Amount', 'Allocation_Amount_PROJECT'],
      dtype='object')

### Calculate out Percent of Allocation Expended  & Percent of Award Fully Allocated 

In [62]:
df_combined2 = df_combined2.assign(
    Percent_of_Allocation_Expended = (df_combined2['Expended_ALLOCATION']/df_combined2['Allocation_Allocation_Sheet'])*100,
    Percent_of_Award_Fully_Allocated = (df_combined2['Allocation_Amount_PROJECT']/df_combined2['TIRCP_Award_Amount_($)'])*100)

In [63]:
cols = ['Expended_ALLOCATION','Allocation_Allocation_Sheet','TIRCP_Award_Amount_($)','Expended_Amount','GGRF_Funding','SB1_Funding','Percent_of_Allocation_Expended', 'Percent_of_Award_Fully_Allocated']

In [64]:
#coercing monetary to numeric 
df_combined2[cols] = df_combined2[cols].apply(pd.to_numeric, errors='coerce')

In [65]:
#Re name the columns exactly like the sheet
df_combined2 = df_combined2.rename(columns = {'LED': 'Phase_Completion_Date','SB1_Funding':'PTA-SB1 Allocation Amount',  'Percentge_Allocated': 'Percentage Allocated', 'TIRCP_Award_Amount_($)': 'TIRCP_Award_Amount',
'Third_Party_Award_Date':'CON_Contract_Award_Date'})

### Filing in NA dates with a super fake one
* 135 missing 3rd party award date
* 72 missing completion date
* 57 missing allocation date
* 145 missing phase completion date

In [66]:
#fill in missing dates with a fake one
missing_date = pd.to_datetime('2100-01-01')

In [67]:
#force to date time
df_combined2[['Allocation_Date', 'Completion_Date','Phase_Completion_Date', 'CON_Contract_Award_Date']] = df_combined2[['Allocation_Date', 'Completion_Date','Phase_Completion_Date', 'CON_Contract_Award_Date']].apply(pd.to_datetime)

### Create Allocated before July 2020 Date Column 
* If the allocation date is AFTER  7-31-2020 then 0, if BEFORE 7-31-2020 then 1

In [68]:
for i in ["Allocation_Date", "CON_Contract_Award_Date", "Completion_Date", "Phase_Completion_Date"]:
    df_combined2[i] = df_combined2[i].fillna(missing_date)

In [69]:
#if the allocation date is AFTER  7-31-2020 then 0, if BEFORE 7-31-2020 then 1
df_combined2 = df_combined2.assign( Allocated_Before_July_31_2020_1_is_yes = df_combined2.apply(lambda x: 0 if x.Allocation_Date > pd.Timestamp(2020, 7, 31, 0) else 1, axis=1))

# Mimic sheet


In [70]:
df_pivot = df_combined2.groupby(['Award_Year','Project_#','Award_Recipient','Project_Title','TIRCP_Award_Amount','Percent_of_Award_Fully_Allocated','Implementing_Agency', 'Components','PPNO','Phase',"Allocation_Date", 
 "CON_Contract_Award_Date", "Completion_Date", "Phase_Completion_Date", ]).agg({'Allocation_Allocation_Sheet': 'sum', 
'GGRF_Funding':'sum',
'Expended_ALLOCATION':'sum',
'PTA-SB1 Allocation Amount':'sum',
'Percent_of_Allocation_Expended':'max',                                                                                                               
'Allocated_Before_July_31_2020_1_is_yes':'max',
})

In [71]:
df_pivot.tail(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Allocation_Allocation_Sheet,GGRF_Funding,Expended_ALLOCATION,PTA-SB1 Allocation Amount,Percent_of_Allocation_Expended,Allocated_Before_July_31_2020_1_is_yes
Award_Year,Project_#,Award_Recipient,Project_Title,TIRCP_Award_Amount,Percent_of_Award_Fully_Allocated,Implementing_Agency,Components,PPNO,Phase,Allocation_Date,CON_Contract_Award_Date,Completion_Date,Phase_Completion_Date,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2020,14.0,Solano Transportation Authority,Solano Regional Transit Improvements Phase 2,10400000.0,27.88,Solano Transportation Authority,Access Improvements at Fairfield Transit Center,CP072,CONST,2021-12-31,2022-12-31,2025-03-01,2100-01-01,0.0,0.0,0.0,0.0,,0
2020,14.0,Solano Transportation Authority,Solano Regional Transit Improvements Phase 2,10400000.0,27.88,Solano Transportation Authority,Access Improvements at Fairfield Transit Center,CP072,PS&E,2021-05-13,2100-01-01,2025-06-30,2023-06-30,400000.0,200000.0,0.0,200000.0,0.0,0
2020,14.0,Solano Transportation Authority,Solano Regional Transit Improvements Phase 2,10400000.0,27.88,Solano Transportation Authority,Access Improvements at Fairfield-Vacaville Stations,CP072,CONST,2021-12-31,2022-12-31,2025-01-01,2100-01-01,400000.0,200000.0,0.0,200000.0,0.0,0
2020,14.0,Solano Transportation Authority,Solano Regional Transit Improvements Phase 2,10400000.0,27.88,Solano Transportation Authority,Access Improvements at Vacaville Transit Center,CP072,CONST,2021-12-31,2022-12-31,2025-01-01,2100-01-01,0.0,0.0,0.0,0.0,,0
2020,14.0,Solano Transportation Authority,Solano Regional Transit Improvements Phase 2,10400000.0,27.88,Solano Transportation Authority,Network Integration,CP072,CONST,2020-10-22,2022-06-30,2024-03-01,2100-01-01,1100000.0,0.0,0.0,1100000.0,0.0,0
2020,14.0,Solano Transportation Authority,Solano Regional Transit Improvements Phase 2,10400000.0,27.88,Solano Transportation Authority,Shared Inductive Charging Infrastructure,CP072,CONST,2021-12-31,2022-12-31,2025-01-01,2100-01-01,0.0,0.0,0.0,0.0,,0
2020,14.0,Solano Transportation Authority,Solano Regional Transit Improvements Phase 2,10400000.0,27.88,Solano Transportation Authority,Shared Inductive Charging Infrastructure,CP072,PS&E,2020-10-22,2100-01-01,2025-06-30,2100-01-01,1000000.0,500000.0,0.0,500000.0,0.0,0
2020,15.0,Torrance Transit Department,Torrance Transit Bus Service Enhancement Program,6000000.0,100.0,Torrance Transit Department,Procurement of 7 zero-emission buses,CP073,CONST,2020-10-22,2021-03-01,2021-12-31,2100-01-01,6000000.0,3000000.0,0.0,3000000.0,0.0,0
2020,16.0,Transit Joint Powers Authority for Merced County,"Improving Air Quality & Economic Growth with Electric Buses in Merced County, the Gateway to Yosemite",3112000.0,100.0,Transit Joint Powers Authority for Merced County,Procurement of 3 zero-emission buses,CP074,CONST,2020-10-22,2100-01-01,2026-01-31,2023-10-22,3112000.0,1556000.0,0.0,1556000.0,0.0,0
2020,17.0,San Francisco Bay Area Water Emergency Transportation Authority,Expansion of WETA Ferry Services,9060000.0,0.0,San Francisco Bay Area Water Emergency Transportation Authority,Acquisition of a new all-electric vessels and supporting infrastructure,CP075,CONST,2020-12-31,2021-06-01,2025-06-01,2100-01-01,0.0,0.0,0.0,0.0,,0


# Export into Excel
* Dataframes to export: summary_transposed and df_pivot
* https://www.geeksforgeeks.org/how-to-write-pandas-dataframes-to-multiple-excel-sheets/

In [72]:
#one final neaten up
df_pivot = df_pivot.rename(columns = {'Expended_ALLOCATION': 'Expended_Amount', 'Allocation_Allocation_Sheet': 'Allocation_Amount'})

In [74]:
with pd.ExcelWriter("./TIRCP_SAR_2022.xlsx") as writer:
    summary_transposed.to_excel(writer, sheet_name="Summary", index=True)
    df_pivot.to_excel(writer, sheet_name="FY", index=True)