# TIRCP SAR Report
----


In [1]:
import pandas as pd
import math
from siuba import * 
import numpy as np
pd.options.display.max_columns = 50
pd.options.display.float_format = "{:.2f}".format
import datetime

In [2]:
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/tircp/"
FILE_NAME1 = "Raw_Project_Tracking_Sheet.xlsx"
project = pd.read_excel(f"{GCS_FILE_PATH}{FILE_NAME1}")
FILE_NAME2 = "Allocation_Agreement.xlsx"
allocation = pd.read_excel(f"{GCS_FILE_PATH}{FILE_NAME2}")

In [3]:
#cleaning up spaces in columns
project.columns = project.columns.str.strip().str.replace(' ', '_')
allocation.columns = allocation.columns.str.strip().str.replace(' ', '_')

## Cleaning columns

In [4]:
project.columns

Index(['Award_Year', 'Project_#', 'Local_Agency', 'Vendor_ID_#',
       'Project_Title', 'PPNO', 'District', 'County', 'Key_Project_Elements',
       'Master_Agreement_Number', 'Master_Agreement_Expiration_Date',
       'Project_Manager', 'Regional_Coordinator',
       'Technical_Assistance-CALTP_(Y/N)', 'Technical_Assistance-Fleet_(Y/N)',
       'Technical_Assistance-Network_Integration_(Y/N)',
       'Technical_Assistance-Priority_Population_(Y/N)', 'Total_Project_Cost',
       'TIRCP_Award_Amount_($)', 'Allocated_Amount', 'Unallocated_Amount',
       'Percentge_Allocated', 'Expended_Amount', 'Other_Funds_Involved',
       'Award_Cycle', 'Local_Agency_Address', 'Local_Agency_City',
       'Local_Agency_Zip', 'Local_Agency_Contact', 'Local_Agency_Email',
       'Local_Agency_Phone_Number', 'Comments/Additional_Contacts'],
      dtype='object')

In [5]:
#subsetting for only columns of interest
df_project = project[['Award_Year', 'Project_#','Local_Agency','Project_Title','PPNO',
'Key_Project_Elements','TIRCP_Award_Amount_($)','Expended_Amount','Allocated_Amount']]

In [6]:
#subsetting for only columns of interest
df_allocation = allocation[['Award_Year','Award_Recipient', 'Implementing_Agency', 'Allocation_Amount', 'GGRF_Funding','PPNO','Phase', 'LED','Allocation_Date','Completion_Date','3rd_Party_Award_Date','Components','SB1_Funding']]

In [7]:
#strip spaces in columns
df_project.columns = df_project.columns.map(lambda x: x.strip())
df_allocation.columns = df_allocation.columns.map(lambda x: x.strip())

# Make Summary Page
### Table 2: Summary of Awards (Cumulative)
* Linda stated she only needs table 2.
* Linda says she is unsure where to get the completed project row.
* The data from the Highlands project tracking sheet does not match the TIRCP SAR report Linda gave me 


In [8]:
summary = project.drop_duplicates().groupby(['Award_Year']).agg({'Project_#':'count','TIRCP_Award_Amount_($)':'sum', 
 'Allocated_Amount':'sum','Expended_Amount':'sum'}).reset_index()

In [9]:
summary['Expended_Percent_of_Awarded'] = summary['Expended_Amount']/summary['TIRCP_Award_Amount_($)']
summary['Expended_Percent_of_Allocated'] = summary['Expended_Amount']/summary['Allocated_Amount']
summary['Percent_Allocated'] = summary['Allocated_Amount']/summary['TIRCP_Award_Amount_($)']

In [10]:
summary_transposed = summary.set_index('Award_Year').T

In [11]:
#grand totals for monetary columns
list_to_add = ['TIRCP_Award_Amount_($)','Allocated_Amount','Expended_Amount', 'Project_#']

In [12]:
summary_transposed['Grand_Total']=summary_transposed.loc[list_to_add, :].sum(axis=1)

In [13]:
#grand total variables of eahc monetary column
Exp = summary_transposed.at['Expended_Amount','Grand_Total']
Alloc = summary_transposed.at['Allocated_Amount','Grand_Total']
TIRCP = summary_transposed.at['TIRCP_Award_Amount_($)','Grand_Total']

In [14]:
#filling in totals of percentages
summary_transposed.at['Expended_Percent_of_Awarded','Grand_Total'] = Exp/TIRCP
summary_transposed.at['Expended_Percent_of_Allocated','Grand_Total'] = Exp/Alloc
summary_transposed.at['Percent_Allocated','Grand_Total'] = Alloc/TIRCP

In [15]:
#switching rows to correct order
summary_transposed.reindex(['Project_#', 'TIRCP_Award_Amount_($)', 'Allocated_Amount','Percent_Allocated','Expended_Amount', 'Expended_Percent_of_Awarded', 'Expended_Percent_of_Allocated'])

Award_Year,2015,2016,2018,2020,Grand_Total
Project_#,14.0,15.0,28.0,17.0,74.0
TIRCP_Award_Amount_($),224328000.0,788444000.0,4251722000.0,500000000.0,5764494000.0
Allocated_Amount,224278000.0,422477816.0,1627802000.0,74030000.0,2348587816.0
Percent_Allocated,1.0,0.54,0.38,0.15,0.41
Expended_Amount,143556084.9,119517737.55,210456649.72,1823462.51,475353934.68
Expended_Percent_of_Awarded,0.64,0.15,0.05,0.0,0.08
Expended_Percent_of_Allocated,0.64,0.28,0.13,0.02,0.2


# Cleaning Allocation Sheet 

## Cleaning up PPNO, can only be 5 characters.

In [16]:
#remove the extra characters in PPNO in allocation to match the PPNO in project data frame bc there should only be five characters and numbers in each PPNO value
df_allocation = df_allocation.assign(
    PPNO_New = df_allocation['PPNO'].str.slice(start=0, stop=5)
)

In [17]:
#CSV with PPNO & Award Recipients
FILE_NAME2 = "Allocation_PPNO_Crosswalk.csv"
allocation_ppno = pd.read_csv(f"{GCS_FILE_PATH}{FILE_NAME2}")

In [18]:
allocation_ppno #printing to make sure it makes sense.

Unnamed: 0,Award_Year,PPNO_New2,Award_Recipient
0,2020,CP065,Los Angeles County Metropolitan Transportation...
1,2020,CP066,Los Angeles-San Diego-San Luis Obispo Rail Cor...
2,2016,1230,San Bernardino County Transportation Authority...
3,2018,1155,Transportation Agency for Monterey County


In [19]:
#Filtering out for 2021, since that entry is blank
df_allocation = df_allocation.query("Award_Year != 2021")

In [20]:
#Merge in Crosswalk 
df_allocation = pd.merge(df_allocation, allocation_ppno, on = ["Award_Year", "Award_Recipient"], how = "left")

In [21]:
#some values in PPNO and PPNO_New2 are strings, some are floats...so have to convert PPNO New 2 to strings
df_allocation.PPNO_New = df_allocation.apply(lambda x: x.PPNO_New if (str(x.PPNO_New2) == 'nan') else x.PPNO_New2, axis=1)

In [22]:
#drop old column
df_allocation = df_allocation.drop(['PPNO'], axis=1)

In [23]:
#renaming columns to something neater
df_allocation = df_allocation.rename(columns = {'PPNO_New':'PPNO', '3rd_Party_Award_Date':'Third_Party_Award_Date'})

## Cleaning up completion, allocation, 3rd Party dates, & LED dates

In [24]:
df_allocation.LED.unique().tolist()

[datetime.datetime(2022, 3, 31, 0, 0),
 datetime.datetime(2019, 6, 1, 0, 0),
 datetime.datetime(2020, 6, 28, 0, 0),
 datetime.datetime(2021, 6, 30, 0, 0),
 datetime.datetime(2019, 11, 3, 0, 0),
 datetime.datetime(2018, 11, 30, 0, 0),
 datetime.datetime(2020, 6, 30, 0, 0),
 datetime.datetime(2019, 1, 8, 0, 0),
 datetime.datetime(2018, 6, 30, 0, 0),
 datetime.datetime(2020, 6, 29, 0, 0),
 datetime.datetime(2019, 11, 1, 0, 0),
 datetime.datetime(2018, 12, 10, 0, 0),
 datetime.datetime(2021, 2, 2, 0, 0),
 datetime.datetime(2020, 6, 23, 0, 0),
 datetime.datetime(2021, 9, 30, 0, 0),
 'October 15, 2018\nSeptember 30, 2021',
 datetime.datetime(2020, 5, 16, 0, 0),
 datetime.datetime(2021, 1, 9, 0, 0),
 datetime.datetime(2020, 5, 23, 0, 0),
 datetime.datetime(2019, 6, 30, 0, 0),
 datetime.datetime(2024, 2, 28, 0, 0),
 datetime.datetime(2021, 2, 28, 0, 0),
 'TBD',
 datetime.datetime(2024, 6, 24, 0, 0),
 datetime.datetime(2022, 12, 30, 0, 0),
 datetime.datetime(2023, 6, 25, 0, 0),
 datetime.dateti

In [25]:
#changing some of the dates
df_allocation["Allocation_Date"].replace({'October 15, 2018\nSeptember 30, 2021': '2018-10-15 00:00:00',
 '2/1/2021\n\n10/31/2022':'2021-02-01 00:00:00', '45211':'2023-10-22'}, inplace =True)

In [26]:
#cleaning up allocation dates
df_allocation.Allocation_Date.unique().tolist()

[datetime.datetime(2015, 10, 22, 0, 0),
 datetime.datetime(2016, 5, 19, 0, 0),
 datetime.datetime(2016, 6, 30, 0, 0),
 datetime.datetime(2015, 12, 10, 0, 0),
 datetime.datetime(2015, 8, 27, 0, 0),
 datetime.datetime(2016, 1, 21, 0, 0),
 datetime.datetime(2017, 6, 29, 0, 0),
 datetime.datetime(2016, 10, 20, 0, 0),
 datetime.datetime(2017, 8, 17, 0, 0),
 datetime.datetime(2018, 1, 31, 0, 0),
 datetime.datetime(2017, 1, 19, 0, 0),
 datetime.datetime(2016, 3, 17, 0, 0),
 datetime.datetime(2017, 3, 16, 0, 0),
 datetime.datetime(2017, 5, 17, 0, 0),
 datetime.datetime(2018, 8, 16, 0, 0),
 'TBD',
 datetime.datetime(2021, 6, 24, 0, 0),
 datetime.datetime(2016, 12, 8, 0, 0),
 datetime.datetime(2020, 6, 25, 0, 0),
 datetime.datetime(2019, 12, 5, 0, 0),
 datetime.datetime(2018, 10, 18, 0, 0),
 datetime.datetime(2021, 1, 28, 0, 0),
 nan,
 datetime.datetime(2018, 2, 1, 0, 0),
 datetime.datetime(2018, 5, 17, 0, 0),
 'FY 26/27',
 datetime.datetime(2017, 5, 18, 0, 0),
 datetime.datetime(2018, 6, 28, 0,

In [27]:
#Had to change FY to an actual date 
df_allocation["Allocation_Date"].replace({"FY 26/27": "2026-12-31", "08/12//20": '2020-08-12 00:00:00', 'FY 21/22': '2021-12-31',
                                         'FY 22/23': '2022-12-31','FY 20/21': '2020-12-31', 'FY 23/24': '2023-12-31','FY 24/25': '2024-12-31','FY 25/26': '2025-12-31'}, inplace =True)

In [28]:
#clean up columns in a loop
for i in ["Allocation_Date", "Third_Party_Award_Date", "Completion_Date", "LED"]:
    df_allocation[i] = df_allocation[i].replace('/', '-', regex = True).replace('Complete', '', regex = True).replace('\n', '', regex=True).replace('Pending','TBD',regex= True).fillna('TBD')

In [29]:
df_allocation.Completion_Date.unique().tolist()

[datetime.datetime(2022, 3, 30, 0, 0),
 '6-1-2019',
 datetime.datetime(2021, 6, 30, 0, 0),
 datetime.datetime(2018, 9, 30, 0, 0),
 '2-11-2018',
 '6-30-2020',
 datetime.datetime(2020, 9, 30, 0, 0),
 ' 6-30-2018',
 '6-29-2020',
 '11-1-2019',
 ' 12-10-2018',
 ' 11-13-2019',
 '3-30-2020',
 datetime.datetime(2022, 9, 30, 0, 0),
 datetime.datetime(2021, 12, 30, 0, 0),
 datetime.datetime(2021, 9, 30, 0, 0),
 '5-16-2020',
 datetime.datetime(2024, 6, 30, 0, 0),
 'TBD',
 'June 24. 2024',
 datetime.datetime(2022, 12, 30, 0, 0),
 datetime.datetime(2024, 6, 24, 0, 0),
 '11-21-20247-30-2025 (Q4)',
 datetime.datetime(2022, 6, 30, 0, 0),
 datetime.datetime(2019, 5, 21, 0, 0),
 datetime.datetime(2024, 7, 25, 0, 0),
 datetime.datetime(2021, 12, 31, 0, 0),
 datetime.datetime(2024, 1, 28, 0, 0),
 datetime.datetime(2022, 10, 31, 0, 0),
 datetime.datetime(2022, 1, 16, 0, 0),
 datetime.datetime(2018, 2, 1, 0, 0),
 datetime.datetime(2022, 8, 22, 0, 0),
 datetime.datetime(2022, 7, 31, 0, 0),
 '5-7-2020',
 date

In [30]:
#cleaning up completion dates
df_allocation['Completion_Date'].replace({ 
    'June 24. 2024': '2024-06-01 00:00:00',  
    '11/21/2024\n7/30/2025 (Q4)': '2024-11-21 00:00:00', 
    'Jun-26': '2026-01-01 00:00:00', 
     'Jun-29': '2029-06-01 00:00:00',
    'Complete\n11/12/2019': '2019-11-12 00:00:00' , 
    'Deallocated': '', 
    'Jun-28': '2028-06-01 00:00:00',  
    'Jun-25': '2025-06-01 00:00:00', 
    'Jun-23':'2023-06-01 00:00:00', 
    'Jun-27': '2027-06-01 00:00:00',
    'Jan-25': '2025-01-01 00:00:00',
    '11-21-20247-30-2025 (Q4)':'2025-07-30 00:00:00',
    '6-30-202112-31-2021': '2021-12-31 00:00:00',
    '6-1-2019': '2019-06-01 00:00:00',
    '2-11-2018': '2018-02-11 00:00:00',
     '6-30-2020': '2020-06-30 00:00:00',
    ' 6-30-2018': '2018-06-30 00:00:00',
     '6-29-2020': '2020-06-29 00:00:00',
     '11-1-2019': '2019-11-01 00:00:00',
     ' 12-10-2018': '2018-12-10 00:00:00',
     ' 11-13-2019': '2019-11-13 00:00:00',
     '3-30-2020':'2020-03-30 00:00:00',
    ' 6-30-2020': '2020-06-30 00:00:00',
    '11-12-2019': '2019-11-12 00:00:00',
    '1-31-2020': '2020-01-31 00:00:00',
    '8-30-2020': '2020-08-30 00:00:00',
    '5-16-2020': '2020,05-16 00:00:00',
     '5-7-2020': '2020-05-07 00:00:00'}, inplace =True)

In [31]:
#cleaning up 3rd Party dates
df_allocation["Third_Party_Award_Date"].replace({ 
'Augsut 12, 2021': '2021-08-12 00:00:00',
'43435': '2018-12-01 00:00:00',
'07-29-2020': '2020-07-29 00:00:00',
'43497' : '2019-02-01 00:00:00',
'TBD 6-24-2021' : 'TBD',
'TBD 6-30-2022' : 'TBD'
}, inplace =True)

In [32]:
#coercing to dates
df_allocation = df_allocation.assign(
    Allocation_Date_New = pd.to_datetime(df_allocation.Allocation_Date, errors="coerce").dt.date,
    Third_Party_Award_Date_New = pd.to_datetime(df_allocation.Third_Party_Award_Date, errors="coerce").dt.date,
    Completion_Date_New = pd.to_datetime(df_allocation.Completion_Date, errors="coerce").dt.date,
    LED_New = pd.to_datetime(df_allocation.LED, errors="coerce").dt.date
)

In [33]:
#drop old columns
df_allocation = df_allocation.drop(['PPNO_New2','Allocation_Date','Third_Party_Award_Date','Completion_Date', 'LED'], axis=1)
#rename columns
df_allocation = df_allocation.rename(columns = {'Allocation_Amount':'Allocation_Allocation_Sheet', 'Allocation_Date_New':'Allocation_Date','Third_Party_Award_Date_New':'Third_Party_Award_Date',
                                               'Completion_Date_New': 'Completion_Date','LED_New': 'LED'})

In [34]:
#just checking
df_allocation[(df_allocation.PPNO.str.contains("CP008", case= False))]

Unnamed: 0,Award_Year,Award_Recipient,Implementing_Agency,Allocation_Allocation_Sheet,GGRF_Funding,Phase,Components,SB1_Funding,PPNO,Allocation_Date,Third_Party_Award_Date,Completion_Date,LED
8,2015,San Diego Metropolitan Transit System (MTS),San Diego Metropolitan Transit System (MTS),713000.0,713000.0,PS&E,Courthouse Trolley Station (Design),0.0,CP008,2016-01-21,NaT,2018-06-30,2018-06-30
9,2015,San Diego Metropolitan Transit System (MTS),San Diego Metropolitan Transit System (MTS),3760000.0,3760000.0,CONST,Courthouse Trolley Station (Construction),0.0,CP008,2017-06-29,2017-05-30,2020-06-29,2020-06-29
10,2015,San Diego Metropolitan Transit System (MTS),San Diego Metropolitan Transit System (MTS),27463000.0,27463000.0,CONST,Purchase 8 trolley vehicles to expand service ...,0.0,CP008,2016-10-20,2016-11-01,2019-11-01,2019-11-01


# Cleaning Project Sheet



## Filling NA for TIRCP and Expended Amounts

In [35]:
df_project[['TIRCP_Award_Amount_($)', 'Expended_Amount']] = df_project[['TIRCP_Award_Amount_($)', 'Expended_Amount']].fillna(value=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [36]:
#checking for nas
df_project.isna().sum()

Award_Year                0
Project_#                 0
Local_Agency              0
Project_Title             0
PPNO                      5
Key_Project_Elements      0
TIRCP_Award_Amount_($)    0
Expended_Amount           0
Allocated_Amount          0
dtype: int64

## Cleaning up PPNO Numbers based on Allocation Sheet

In [37]:
#slicing PPNO to be 5 characters
df_project = df_project.assign(PPNO_New = df_project['PPNO'].str.slice(start=0, stop=5)) 

In [38]:
#importing Excel crosswalk sheet
FILE_NAME3 = "Projects_PPNO.xlsx"
project_ppno = pd.read_excel(f"{GCS_FILE_PATH}{FILE_NAME3}")

In [39]:
#Merge in Crosswalk 
df_project2 = pd.merge(df_project, project_ppno, on = ["Award_Year", "Local_Agency"], how = "left")

In [40]:
#some values in PPNO and PPNO_New2 are strings, some are floats...so have to convert PPNO New 2 to strings
df_project2.PPNO_New = df_project2.apply(lambda x: x.PPNO_New if (str(x.PPNO_New2) == 'nan') else x.PPNO_New2, axis=1)

In [41]:
#making sure PPNO_New is a string 
df_project2 = df_project2.astype({'PPNO_New': 'str'})

In [42]:
PPNO_project = set(df_project2.PPNO_New.unique().tolist())
PPNO_allocation = set(df_allocation.PPNO.unique().tolist())

In [43]:
#checking for differences - none. yay. 
PPNO_project - PPNO_allocation

set()

In [44]:
df_project2.head(2)

Unnamed: 0,Award_Year,Project_#,Local_Agency,Project_Title,PPNO,Key_Project_Elements,TIRCP_Award_Amount_($),Expended_Amount,Allocated_Amount,PPNO_New,PPNO_New2
0,2015,1,Antelope Valley Transit Authority (AVTA),Regional Transit Interconnectivity & Environme...,CP005,Purchase 13 60-foot articulated BRT buses and ...,24403000.0,21714177.53,24403000,CP005,
1,2015,2,Capitol Corridor Joint Powers Authority,Travel Time Reduction Project,CP012,Track and curve improvements between San Jose ...,4620000.0,4619999.9,4620000,CP012,


In [45]:
#drop old column
df_project2 = df_project2.drop(['PPNO', 'PPNO_New2'], axis=1)

In [46]:
#renaming to something neater
df_project2 = df_project2.rename(columns = {'PPNO_New':'PPNO', 'Allocated_Amount':'Allocation_Amount_PROJECT'})

In [47]:
df_allocation.head(2)

Unnamed: 0,Award_Year,Award_Recipient,Implementing_Agency,Allocation_Allocation_Sheet,GGRF_Funding,Phase,Components,SB1_Funding,PPNO,Allocation_Date,Third_Party_Award_Date,Completion_Date,LED
0,2015,Antelope Valley Transit Authority,Antelope Valley Transit Authority,24403000.0,24403000.0,CONST,Purchase 13 60-foot articulated BRT buses and ...,0.0,CP005,2015-10-22,2016-03-14,2022-03-30,2022-03-31
1,2015,Capitol Corridor Joint Powers Authority,Capitol Corridor Joint Powers Authority,4620000.0,4620000.0,CONST,Track and curve improvements between San Jose ...,0.0,CP012,2016-05-19,2016-06-01,2019-06-01,2019-06-01


# Merging Project & Allocations
* Merge on PPNO & Award_Year
* Proper way to drop duplicates
* Use https://docs.python.org/3/library/uuid.html to identify each row or row number or grab .index and put it ina column.

In [48]:
#merge on left for projects.
df_combined = df_project2.merge(df_allocation, how = "left", on = ["PPNO", "Award_Year"])

In [49]:
df_combined.shape

(286, 20)

In [50]:
df_combined2 = df_combined.drop_duplicates() #how should I drop duplicates?

In [51]:
df_combined2.shape

(266, 20)

In [52]:
df_combined2.head(2)

Unnamed: 0,Award_Year,Project_#,Local_Agency,Project_Title,Key_Project_Elements,TIRCP_Award_Amount_($),Expended_Amount,Allocation_Amount_PROJECT,PPNO,Award_Recipient,Implementing_Agency,Allocation_Allocation_Sheet,GGRF_Funding,Phase,Components,SB1_Funding,Allocation_Date,Third_Party_Award_Date,Completion_Date,LED
0,2015,1,Antelope Valley Transit Authority (AVTA),Regional Transit Interconnectivity & Environme...,Purchase 13 60-foot articulated BRT buses and ...,24403000.0,21714177.53,24403000,CP005,Antelope Valley Transit Authority,Antelope Valley Transit Authority,24403000.0,24403000.0,CONST,Purchase 13 60-foot articulated BRT buses and ...,0.0,2015-10-22,2016-03-14,2022-03-30,2022-03-31
1,2015,2,Capitol Corridor Joint Powers Authority,Travel Time Reduction Project,Track and curve improvements between San Jose ...,4620000.0,4619999.9,4620000,CP012,Capitol Corridor Joint Powers Authority,Capitol Corridor Joint Powers Authority,4620000.0,4620000.0,CONST,Track and curve improvements between San Jose ...,0.0,2016-05-19,2016-06-01,2019-06-01,2019-06-01


In [53]:
#Just filling in NA values
df_combined2[['TIRCP_Award_Amount_($)', 'Allocation_Allocation_Sheet','Expended_Amount','SB1_Funding']] = df_combined2[['TIRCP_Award_Amount_($)', 'Allocation_Allocation_Sheet', 'Expended_Amount','SB1_Funding']].fillna(value=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [54]:
#drop columns that aren't needed
df_combined2 = df_combined2.drop(columns=['Local_Agency','Key_Project_Elements'])

In [55]:
#Re name the columns exactly like the sheet
df_combined2 = df_combined2.rename(columns = {'LED': 'Phase_Completion_Date','SB1_Funding':'PTA-SB1 Allocation Amount',  'Percentge_Allocated': 'Percentage Allocated', 'TIRCP_Award_Amount_($)': 'TIRCP_Award_Amount'})

### Create Allocated before July 2020 Date Column 
* If the allocation date is AFTER  7-31-2020 then 0, if BEFORE 7-31-2020 then 1

In [56]:
#force to date time
df_combined2[['Allocation_Date', 'Completion_Date','Phase_Completion_Date', 'Third_Party_Award_Date']] = df_combined2[['Allocation_Date', 'Completion_Date','Phase_Completion_Date', 'Third_Party_Award_Date']].apply(pd.to_datetime)

In [57]:
#if the allocation date is AFTER  7-31-2020 then 0, if BEFORE 7-31-2020 then 1
df_combined2 = df_combined2.assign( Allocated_Before_July_2020_1_is_yes = df_combined2.apply(lambda x: 0 if x.Allocation_Date > pd.Timestamp(2020, 7, 31, 0) else 1, axis=1))

### Calculate out Percent of Allocation Expended  & Percent of Award Fully Allocated 

In [58]:
df_combined2 = df_combined2.assign(
    Percent_of_Allocation_Expended = df_combined2['Expended_Amount']/df_combined2['Allocation_Allocation_Sheet'],
    Percent_of_Award_Fully_Allocated = df_combined2['Allocation_Amount_PROJECT']/df_combined2['TIRCP_Award_Amount']) 

### Keep first occurence of TIRCP & Percent of Award Fully Allocated 
* Maybe delete this portion

In [59]:
#create new column that counts each PPNO observation
df_combined2 = df_combined2.assign(
    PPNO_observation = df_combined2.groupby(["PPNO"]).cumcount() + 1
)

In [60]:
#Deleting the extra TIRCP & percent of award fully allocated
df_combined2.TIRCP_Award_Amount = df_combined2.apply(lambda x: x.TIRCP_Award_Amount == np.nan if x.PPNO_observation > 1 else x.TIRCP_Award_Amount, axis=1)      
df_combined2.Percent_of_Award_Fully_Allocated = df_combined2.apply(lambda x: x.Percent_of_Award_Fully_Allocated == np.nan if x.PPNO_observation > 1 else x.Percent_of_Award_Fully_Allocated, axis=1)   

In [61]:
#clean up columns in a loop
for i in ["TIRCP_Award_Amount", "Percent_of_Award_Fully_Allocated"]:
    df_combined2[i] = df_combined2[i].replace(False, '0')

In [62]:
cols = ['TIRCP_Award_Amount','Expended_Amount', 'Allocation_Amount_PROJECT', 'Allocation_Allocation_Sheet','GGRF_Funding', 'PTA-SB1 Allocation Amount','Percent_of_Allocation_Expended', 'Percent_of_Award_Fully_Allocated']

In [63]:
#coercing some to numeric 
df_combined2[cols] = df_combined2[cols].apply(pd.to_numeric, errors='coerce')

In [64]:
df_combined2.isna().sum()

Award_Year                               0
Project_#                                0
Project_Title                            0
TIRCP_Award_Amount                       0
Expended_Amount                          0
Allocation_Amount_PROJECT                0
PPNO                                     0
Award_Recipient                          0
Implementing_Agency                      0
Allocation_Allocation_Sheet              0
GGRF_Funding                            89
Phase                                    1
Components                               0
PTA-SB1 Allocation Amount                0
Allocation_Date                         57
Third_Party_Award_Date                 135
Completion_Date                         72
Phase_Completion_Date                  145
Allocated_Before_July_2020_1_is_yes      0
Percent_of_Allocation_Expended          47
Percent_of_Award_Fully_Allocated         1
PPNO_observation                         0
dtype: int64

### Filing in NA dates with a super fake one
* 135 missing 3rd party award date
* 72 missing completion date
* 57 missing allocation date
* 145 missing phase completion date

In [65]:
#fill in missing dates with a fake one
missing_date = pd.to_datetime('2100-01-01')

In [66]:
for i in ["Allocation_Date", "Third_Party_Award_Date", "Completion_Date", "Phase_Completion_Date"]:
    df_combined2[i] = df_combined2[i].fillna(missing_date)

In [67]:
#checking with Solano Transportation Authority in 2020
df_combined2.loc[df_combined2['PPNO'] == 'CP072'] 

Unnamed: 0,Award_Year,Project_#,Project_Title,TIRCP_Award_Amount,Expended_Amount,Allocation_Amount_PROJECT,PPNO,Award_Recipient,Implementing_Agency,Allocation_Allocation_Sheet,GGRF_Funding,Phase,Components,PTA-SB1 Allocation Amount,Allocation_Date,Third_Party_Award_Date,Completion_Date,Phase_Completion_Date,Allocated_Before_July_2020_1_is_yes,Percent_of_Allocation_Expended,Percent_of_Award_Fully_Allocated,PPNO_observation
276,2020,14,Solano Regional Transit Improvements Phase 2,10400000.0,0.0,2900000,CP072,Solano Transportation Authority,Solano Transportation Authority,400000.0,200000.0,PS&E,Access Improvements at Fairfield Transit Center,200000.0,2021-05-13,2100-01-01,2025-06-30,2023-06-30,0,0.0,0.28,1
277,2020,14,Solano Regional Transit Improvements Phase 2,0.0,0.0,2900000,CP072,Solano Transportation Authority,Solano Transportation Authority,0.0,,CONST,Access Improvements at Fairfield Transit Center,0.0,2021-12-31,2022-12-31,2025-03-01,2100-01-01,0,,0.0,2
278,2020,14,Solano Regional Transit Improvements Phase 2,0.0,0.0,2900000,CP072,Solano Transportation Authority,Solano Transportation Authority,0.0,,CONST,Access Improvements at Vacaville Transit Center,0.0,2021-12-31,2022-12-31,2025-01-01,2100-01-01,0,,0.0,3
279,2020,14,Solano Regional Transit Improvements Phase 2,0.0,0.0,2900000,CP072,Solano Transportation Authority,Solano Transportation Authority,1000000.0,500000.0,PS&E,Shared Inductive Charging Infrastructure,500000.0,2020-10-22,2100-01-01,2025-06-30,2100-01-01,0,0.0,0.0,4
280,2020,14,Solano Regional Transit Improvements Phase 2,0.0,0.0,2900000,CP072,Solano Transportation Authority,Solano Transportation Authority,0.0,,CONST,Shared Inductive Charging Infrastructure,0.0,2021-12-31,2022-12-31,2025-01-01,2100-01-01,0,,0.0,5
281,2020,14,Solano Regional Transit Improvements Phase 2,0.0,0.0,2900000,CP072,Solano Transportation Authority,Solano Transportation Authority,400000.0,200000.0,CONST,Access Improvements at Fairfield-Vacaville Sta...,200000.0,2021-12-31,2022-12-31,2025-01-01,2100-01-01,0,0.0,0.0,6
282,2020,14,Solano Regional Transit Improvements Phase 2,0.0,0.0,2900000,CP072,Solano Transportation Authority,Solano Transportation Authority,1100000.0,,CONST,Network Integration,1100000.0,2020-10-22,2022-06-30,2024-03-01,2100-01-01,0,0.0,0.0,7


# Mimic sheet
* Rewrite column TIRCP and Percent of Award Fully Allocated to indicate they have to merge.
* Do the same thing with Percent of Award Fully Located
* Use new column to pivot. 
* use merge to attach everything else back on.

In [78]:
df_pivot = df_combined2.groupby(['Award_Year','PPNO','Award_Recipient','Implementing_Agency','Project_Title','Phase', 'Components',"Allocation_Date", 
                                 "Third_Party_Award_Date", "Completion_Date", "Phase_Completion_Date", "TIRCP_Amount" ]).agg(
    {'Percent_of_Award_Fully_Allocated':'sum',
'TIRCP_Award_Amount':'max',
'Allocation_Allocation_Sheet': 'sum', 
'GGRF_Funding':'sum',
'Expended_Amount':'sum',
'PTA-SB1 Allocation Amount':'sum',
'Allocated_Before_July_2020_1_is_yes':'max',
'Percent_of_Allocation_Expended':'max'})

In [80]:
df_pivot.tail(10)
#.to_excel("./test_excel.xlsx")

# Export into Excel
* Dataframes to export: summary_transposed and df_pivot
* https://www.geeksforgeeks.org/how-to-write-pandas-dataframes-to-multiple-excel-sheets/

with pd.ExcelWriter("./TIRCP_SAR_2022.xlsx") as writer:
    summary_transposed.to_excel(writer, sheet_name="Summary", index=True)
    df_pivot.to_excel(writer, sheet_name="FY", index=True)