# Cleaning & improving old work
* Received a new workbook on July 8, 2022.
* Create separate functions to import the actual sheets

In [1]:
import numpy as np
import pandas as pd

import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from calitp import *

# Notebook display settings: up to 100 columns, no row or cell-width
# truncation, and floats rendered with two decimal places.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.float_format", "{:.2f}".format)

#Import script
import data_prep



In [2]:
# Google Cloud Storage folder that holds the TIRCP workbook.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/tircp/"
# File name of the workbook read by this notebook (the July 8, 2022 version).
FILE_NAME = "TIRCP_July_8_2022.xlsx"

In [3]:
#Open up the 3 sheets
# The workbook is fetched from GCS and parsed once: passing a list to
# `sheet_name` returns a dict of DataFrames keyed by sheet name, instead of
# downloading and re-reading the same file for every sheet.
sheets = pd.read_excel(
    f"{GCS_FILE_PATH}{FILE_NAME}",
    sheet_name=["Project Tracking", "Agreement Allocations", "Invoice Tracking Sheet"],
)
project = to_snakecase(sheets["Project Tracking"])
allocation = to_snakecase(sheets["Agreement Allocations"])
invoice = to_snakecase(sheets["Invoice Tracking Sheet"])



## Functions

In [4]:
#Some PPNO numbers are longer than 5 characters. Slice them down to <= 5.
def ppno_slice(df):
    """Return a copy of ``df`` whose ``ppno`` values are at most 5 characters.

    Values are converted to text before slicing. The ``.str`` accessor
    returns NaN for any cell that is not a string, so a PPNO that Excel
    stored as a number (e.g. ``1230``) would otherwise be silently lost.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``ppno`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with ``ppno`` truncated to its first 5 characters.
        Missing values stay missing.
    """
    def _as_text(value):
        # Keep missing values missing so they can still be detected later.
        if pd.isna(value):
            return value
        # 1230.0 -> "1230" rather than "1230." after slicing.
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value)

    # object dtype keeps the .str accessor usable even if every value is missing.
    ppno_text = df["ppno"].map(_as_text).astype(object)
    return df.assign(ppno=ppno_text.str.slice(start=0, stop=5))

In [5]:
# Truncate PPNOs on both sheets so they can be compared on the same footing.
allocation = ppno_slice(allocation)
project = ppno_slice(project)

## Check PPNO 

In [6]:
# Distinct PPNOs on each sheet, held as sets so they can be compared with set arithmetic.
PPNO_project = set(project["ppno"].unique())
PPNO_allocation = set(allocation["ppno"].unique())

In [7]:
# PPNOs found on the project sheet but missing from the allocation sheet.
PPNO_project.difference(PPNO_allocation)

{'1155A', 'CP000', 'CP052'}

In [8]:
# PPNOs found on the allocation sheet but missing from the project sheet.
differences = list(PPNO_allocation.difference(PPNO_project))
f'{len(differences)} different PPNOS.'

'13 different PPNOS.'

In [9]:
# Keep only the allocation rows whose PPNO does not appear on the project sheet.
different_dfs = allocation.loc[allocation["ppno"].isin(differences)]

In [10]:
# One row per (award year, PPNO, grant recipient) among the mismatched rows,
# to see which recipients and award years carry a differing PPNO.
allocation_sub = (
    different_dfs
    .loc[:, ['award_year', 'ppno', 'grant_recipient']]
    .drop_duplicates()
)

In [11]:
# Display the mismatched award year / PPNO / grant recipient combinations.
allocation_sub

Unnamed: 0,award_year,ppno,grant_recipient
15,2015.0,CP002,Southern California Regional Rail Authority (Metrolink)
17,2016.0,CP018,Antelope Valley Transit Authority
33,2016.0,CP024,Los Angeles-San Diego-San Luis Obispo Rail Corridor Agency (LOSSAN)
35,2016.0,CP021,Los Angeles-San Diego-San Luis Obispo Rail Corridor Agency (LOSSAN)
53,2016.0,CPO02,San Joaquin Regional Rail Commission
94,2018.0,CP301,Los Angeles-San Diego-San Luis Obispo Rail Corridor Agency (LOSSAN)
110,2018.0,CP042,Los Angeles-San Diego-San Luis Obispo Rail Corridor Agency (LOSSAN)
146,2018.0,CP053,Sacramento Regional Transit District
158,2018.0,CP032,San Diego Metropolitan Transit System (MTS)
299,2018.0,1155N,Transportation Agency for Monterey County


In [12]:
#get a subset of project_sub 
#project[['award_year','ppno','grant_recipient']].drop_duplicates()

### Use projects dataframe's PPNO as a source of truth
* Need to change Antelope Valley Transit Authority (AVTA) 2020's PPNO.  

In [13]:
#Crosswalk for allocation df: PPNO as written on the allocation sheet -> replacement PPNO
ppno_crosswalk_allocation = {'CP018': 'CP019','CP024':'CP043',
                             'CP021':'CP043', 'CPO02':'CP025',
                             'CP301': 'CP031', 'CP042':'CP031',
                             'CP053': 'CP052','CP032': 'CP034',
                             '1155N':'1155A',}
# Assign the result back to the column. Calling `.replace(..., inplace=True)`
# on `allocation['ppno']` acts on a temporary Series and leaves the DataFrame
# unchanged under pandas Copy-on-Write (pandas 2.x warns about this).
allocation['ppno'] = allocation['ppno'].replace(ppno_crosswalk_allocation)

In [14]:
#No need for a crosswalk for projects dataframe. For some reason some read in as NaN.
# Grant recipient -> PPNO written onto every project row for that recipient.
ppno_by_recipient = {
    "San Bernardino County Transportation Authority (SBCTA)": '1230',
    "Bay Area Rapid Transit District (BART)": 'CP060',
    "Santa Monica Big Blue Bus": 'CP071',
}
for recipient, recipient_ppno in ppno_by_recipient.items():
    project.loc[project["grant_recipient"] == recipient, "ppno"] = recipient_ppno

### Double check the sets

In [15]:
# Rebuild both PPNO sets now that the corrections have been applied.
PPNO_project = set(project["ppno"].unique())
PPNO_allocation = set(allocation["ppno"].unique())

In [16]:
# PPNOs that are still only on the allocation sheet after the corrections.
PPNO_allocation - PPNO_project

{'CP002', 'CP059'}

## Project

In [17]:
# Column names, non-null counts and dtypes of the project sheet.
project.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73 entries, 0 to 72
Data columns (total 41 columns):
 #   Column                                                     Non-Null Count  Dtype  
---  ------                                                     --------------  -----  
 0   award_year                                                 73 non-null     int64  
 1   project_#                                                  73 non-null     int64  
 2   grant_recipient                                            73 non-null     object 
 3   project_title                                              73 non-null     object 
 4   ppno                                                       72 non-null     object 
 5   district                                                   70 non-null     object 
 6   county                                                     72 non-null     object 
 7   project_description                                        73 non-null     object 
 8   master_agree

In [18]:
# Preview the first two projects, restricted to the identifying, funding and
# scoring columns.
project.head(2)[[
    'award_year',
    'ppno',
    'grant_recipient',
    'district',
    'county',
    'tircp_award_amount__$_',
    'allocated_amount',
    'unallocated_amount',
    'percentage_allocated',
    'expended_amount',
    'other_funds_involved',
    'award_cycle',
    'estimated_tircp_ghg_reductions',
    'cost_per_ghg_ton_reduced',
    'increased_ridership',
    'service_integration',
    'improve_safety',
    'project_readiness',
    'funding_leverage',
    'multi_agency_coordination_integration',
    'priority_population_benefits___ab_1550_community_benefits',
    'housing_co_benefits',
]]


Unnamed: 0,award_year,ppno,grant_recipient,district,county,tircp_award_amount__$_,allocated_amount,unallocated_amount,percentage_allocated,expended_amount,other_funds_involved,award_cycle,estimated_tircp_ghg_reductions,cost_per_ghg_ton_reduced,increased_ridership,service_integration,improve_safety,project_readiness,funding_leverage,multi_agency_coordination_integration,priority_population_benefits___ab_1550_community_benefits,housing_co_benefits
0,2015,CP005,Antelope Valley Transit Authority (AVTA),7,LA,24403000,24403000,0,1.0,21714177.53,,1,"195,380 tons",,,,,,,,,
1,2015,CP012,Capitol Corridor Joint Powers Authority,4,VAR,4620000,4620000,0,1.0,4619999.9,,1,"356,667 tons",,,,,,,,,


In [19]:
#Fill in nulls based on data type
# `project.dtypes` maps column -> dtype; replacing each dtype with a default
# gives a column -> fill-value Series that `fillna` applies column by column.
project = project.fillna(project.dtypes.replace({'float64': 0.0, 'object': 'None', 'int64':0}))

#Fill in FY
# Assign the result back to the column. `.replace(..., inplace=True)` on
# `project["award_cycle"]` acts on a temporary Series and does not update the
# DataFrame under pandas Copy-on-Write.
project["award_cycle"] = project["award_cycle"].replace({'FY 21/22': 4})

In [20]:
# Number of projects per award year.
project['award_year'].value_counts()

2018    28
2020    17
2015    14
2016    14
Name: award_year, dtype: int64

In [21]:
# Number of projects per award cycle, after recoding 'FY 21/22' as cycle 4.
project['award_cycle'].value_counts()

3    28
4    17
1    14
2    14
Name: award_cycle, dtype: int64

In [22]:
# Distribution of the allocated share per project.
# NOTE(review): values above 1.00 would mean more was allocated than awarded — confirm how percentage_allocated is defined.
project['percentage_allocated'].value_counts()

1.00    39
0.00    11
0.93     1
0.91     1
0.06     1
0.02     1
0.21     1
0.18     1
0.05     1
0.49     1
0.86     1
0.12     1
0.19     1
1.76     1
0.38     1
2.00     1
0.84     1
0.41     1
0.99     1
0.24     1
0.68     1
1.84     1
1.09     1
0.90     1
0.24     1
Name: percentage_allocated, dtype: int64

In [23]:
# Run the project-cleaning function from the data_prep script.
# NOTE(review): presumably it packages the project steps prototyped above — confirm against data_prep.
test_projects = data_prep.clean_project()



## Allocation Agreement 

In [24]:
# Drop incomplete rows: any row missing its award year, grant recipient or PPNO.
allocation = allocation.dropna(
    subset=['award_year', 'grant_recipient', 'ppno'],
    how='any',
)
allocation.shape[0]

338

In [25]:
#Replacing values
# Each dict maps a free-text date cell (often several dates separated by
# newlines) to one 'YYYY-MM-DD HH:MM:SS' string.
allocation_3rd_party_date = {'07/29/2020': '2020-07-29 00:00:00'}
# Fixed: '2019 06-30 00:00:00' was missing the hyphen after the year.
allocation_led = {'2/1/2021\n\n10/31/2022': '2021-02-01 00:00:00',
                 'June 30, 2019\nSeptember 30, 2019': '2019-06-30 00:00:00',
                 'October 15, 2018\nSeptember 30, 2021': '2018-10-15 00:00:00'}
# Fixed: 'Complete\n1/31/2020' was mapped to 2021-01-31; the source cell says 2020.
allocation_completion_date = {'6/30/2021\n12/31/2021\n10/20/2022':'2021-06-30 00:00:00',
                              'Complete\n8/30/2020':'2020-08-30 00:00:00',
                              'Complete\n1/31/2020':'2020-01-31 00:00:00'}



In [26]:
#allocation['_3rd_party_award_date'].value_counts()

In [27]:
#allocation['led'].value_counts()

In [28]:
#allocation['allocation_date'].value_counts()

In [29]:
#allocation['completion_date'].value_counts()

In [30]:
#allocation['date_branch_chief_receives_psa'].value_counts()

In [31]:
#allocation['expended_amount'].value_counts()

In [32]:
# 'Deallocation' is a text placeholder in an amount column; count it as 0.
# Convert with pd.to_numeric rather than .astype('int64'): the integer cast
# truncates the cents from dollar amounts and raises when a value is missing.
allocation["expended_amount"] = pd.to_numeric(
    allocation["expended_amount"].replace({'Deallocation': 0})
)

In [33]:
# Allocation-sheet columns that should be coerced to datetime.
# NOTE(review): 'date_psa_sent_to_local_agency' is not listed here — confirm whether that is intentional.
date_columns = [
    'allocation_date',
    'completion_date',
    '_3rd_party_award_date',
    'led',
    'date_branch_chief_receives_psa',
    'date_regional_coordinator_receives_psa',
    'date_oc_receives_psa',
    'date_opm_receives_psa',
    'date_legal_receives_psa',
    'date_returned_to_pm',
    'date_psa_approved_by_local_agency',
    'date_signed_by_drmt',
    'psa_expiry_date',
]

In [34]:
# Fill missing values with a per-dtype default: 0.0 for float columns and the
# string 'None' for object columns.
allocation = allocation.fillna(
    allocation.dtypes.replace({'float64': 0.0, 'object': 'None'})
)

In [35]:
# Coerce each listed column to datetime; `errors='coerce'` is forwarded to
# pd.to_datetime by `Series.apply`.
#https://sparkbyexamples.com/pandas/pandas-convert-multiple-columns-to-datetime-type/
for date_col in date_columns:
    parsed_dates = allocation[date_col].apply(pd.to_datetime, errors='coerce')
    allocation[date_col] = parsed_dates

In [36]:
# Column names, non-null counts and dtypes of the allocation sheet after cleaning.
allocation.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 338 entries, 0 to 345
Data columns (total 50 columns):
 #   Column                                                       Non-Null Count  Dtype         
---  ------                                                       --------------  -----         
 0   award_year                                                   338 non-null    float64       
 1   project_#                                                    338 non-null    float64       
 2   grant_recipient                                              338 non-null    object        
 3   implementing_agency                                          338 non-null    object        
 4   ppno                                                         338 non-null    object        
 5   project_id                                                   338 non-null    object        
 6   ea                                                           338 non-null    object        
 7   components       

In [37]:
# Load the allocation sheet through the data_prep script.
# NOTE(review): presumably it packages the allocation steps prototyped above — confirm against data_prep.
test_allocation = data_prep.load_allocation()

In [39]:
# Preview the first two rows returned by data_prep.load_allocation().
test_allocation.head(2)

Unnamed: 0,award_year,project_#,grant_recipient,implementing_agency,ppno,project_id,ea,components,phase,allocation_amount,expended_amount,sb1_funding,sb1_budget_year,ggrf_funding,ggrf_budget_year,ctc_financial_resolution,ctc_allocation_amendment,ctc_waiver,calsta_waiver,allocation_date,completion_date,psa_#,ct_document_#,_3rd_party_award_date,led,date_branch_chief_receives_psa,date_regional_coordinator_receives_psa,date_oc_receives_psa,date_opm_receives_psa,date_legal_receives_psa,date_returned_to_pm,date_psa_sent_to_local_agency,date_psa_approved_by_local_agency,date_signed_by_drmt,psa_expiry_date,lonp,prior_fiscal_years_to_2020,fiscal_year_2020_2021,fiscal_year_2021_2022,fiscal_year_2022_2023,fiscal_year_2023_2024,fiscal_year_2024_2025,fiscal_year_2025_2026,fiscal_year_2026_2027,fiscal_year_2027_2028,fiscal_year_2028_2029,fiscal_year_2029_2030,allocation_comments,non_network_integration_allocations_unique_percentage_split,psa_comments
0,2015.0,1.0,Antelope Valley Transit Authority,Antelope Valley Transit Authority,CP005,16000048,T343GA,Purchase 13 60-foot articulated BRT buses and 16 45-foot electric commuter buses,CONST,24403000.0,21714177.53,0.0,,24403000.0,2015-16,TIRCP-1516-02,,,Waiver-1920-17,2015-10-22,2022-03-30 00:00:00,07AVTA2015PS-01 A1 \n\n07AVTA2015PS-05,07AVTA2015PS\n*Listed under Unit 3040,2016-03-14 00:00:00,2022-03-31 00:00:00,NaT,NaT,,,,,,NaT,2021-02-02,NaT,,24403000.0,,,,,,,,,,,"Program Supplement be Amended to show a correction in the invoicing section and to include language that confirms funding for "" supporting infrastructure"" includes WAVE.",,
1,2015.0,2.0,Capitol Corridor Joint Powers Authority,Capitol Corridor Joint Powers Authority,CP012,16000276,R350GA,Track and curve improvements between San Jose and Martinez for faster journeys benefiting Capitol Corridor passengers,CONST,4620000.0,4619999.9,4620000.0,2015-16,0.0,2012-13,TIRCP-1516-07\nTech. Correction June 2017,TIRCP-1920-17A\n6/25/2020,,,2016-05-19,Complete\n6/1/2019,VARCCJPAPS-01\n,VARCCJPAPS-01,2016-06-01 00:00:00,2019-06-01 00:00:00,NaT,NaT,,,,,,NaT,2016-12-13,2019-06-01,,4620000.0,,,,,,,,,,,,,
