# Cleaning up Grants Data
----

There are three sheets in the TIRCP workbook. However, it seems like the summary reports mostly rely on two: one called project tracking and one called allocation tracking, so I am only loading those in.


In [None]:
import pandas as pd
import math
!pip install openpyxl

In [None]:
# Read the two worksheets used by the summary reports.
# The workbook path is handed straight to pandas (instead of an `open(...)`
# handle that was never closed), so pandas opens and closes the file itself.
# Asking for both sheets in one call returns a dict keyed by sheet name.
workbook_path = 'Raw Project Tracking Sheet.xlsx'
sheets = pd.read_excel(
    workbook_path,
    sheet_name=['Project Tracking DRAFT', 'Agreement Allocations DRAFT'],
)
project = sheets['Project Tracking DRAFT']
allocation = sheets['Agreement Allocations DRAFT']


In [None]:
# Normalise the headers of both sheets: trim surrounding whitespace,
# then turn every remaining space into an underscore.
for sheet in (project, allocation):
    trimmed = sheet.columns.str.strip()
    sheet.columns = trimmed.str.replace(' ', '_')

In [None]:
# Display the cleaned column labels of the project sheet.
project.columns

In [None]:
# Display the cleaned column labels of the allocation sheet.
allocation.columns

### Note: Third party award date is called CON Contract Award Date in TIRCP SAR Attachment 

In [None]:
# Keep only the project-sheet columns of interest.
# `.copy()` makes the subset an independent frame, so later assignments to
# its columns work on its own data rather than on a slice of `project`
# (which is what triggers pandas' SettingWithCopyWarning).
project_columns = ['Award_Year', 'Project_#', 'Local_Agency', 'Project_Title', 'PPNO',
                   'Key_Project_Elements', 'TIRCP_Award_Amount_($)', 'Allocated_Amount',
                   'Expended_Amount']
df_project = project[project_columns].copy()

In [None]:
# Keep only the allocation-sheet columns of interest.
allocation_columns = [
    'Award_Year',
    'Award_Recipient',
    'Implementing_Agency',
    'PPNO',
    'Phase',
    'Allocation_Date',
    'Completion_Date',
    '3rd_Party_Award_Date',
]
df_allocation = allocation[allocation_columns]

In [None]:
# Trim leading/trailing whitespace from the column labels of both subsets.
for frame in (df_project, df_allocation):
    frame.columns = frame.columns.map(lambda label: label.strip())

In [None]:
# Dump both subsets to scratch CSV files for a quick visual sanity check.
scratch_exports = (
    (df_project, "./test_df_project.csv"),
    (df_allocation, "./test_df_allocation.csv"),
)
for frame, csv_path in scratch_exports:
    frame.to_csv(csv_path)

# Cleaning Allocation Sheet 

## Cleaning up PPNO: each value can only be 5 characters long.

In [None]:
# A PPNO should be exactly five characters, so keep only the first five of
# each allocation PPNO to make it comparable with the project data frame.
ppno_first_five = df_allocation['PPNO'].str[:5]
df_allocation = df_allocation.assign(PPNO_New=ppno_first_five)

In [None]:
# Load the crosswalk CSV that pairs award recipients with PPNOs.
crosswalk_csv = 'Allocation_PPNO_Crosswalk.csv'
allocation_ppno = pd.read_csv(crosswalk_csv)

In [None]:
# Display the allocation PPNO crosswalk that was just loaded.
allocation_ppno 

In [None]:
# Drop the rows whose Award_Year is 2021, since that entry is blank.
not_award_year_2021 = df_allocation["Award_Year"] != 2021
df_allocation = df_allocation[not_award_year_2021]

In [None]:
# Left-join the crosswalk onto the allocation rows, matching on award year
# and award recipient.
crosswalk_keys = ["Award_Year", "Award_Recipient"]
df_allocation = df_allocation.merge(allocation_ppno, how="left", on=crosswalk_keys)

In [None]:
# Inspect the Python type of the crosswalk PPNO in the first row.
first_row = df_allocation.iloc[0]
type(first_row["PPNO_New2"])


In [None]:
# Use the crosswalk PPNO (PPNO_New2) wherever the merge supplied one and
# keep the five-character PPNO_New everywhere else.
# The previous row-wise `apply` detected missing values by comparing
# str(value) with 'nan'; `isna()` states that test directly, also treats
# None/pd.NA as missing, and avoids a Python-level call per row.
crosswalk_missing = df_allocation['PPNO_New2'].isna()
df_allocation['PPNO_New'] = df_allocation['PPNO_New'].where(
    crosswalk_missing, df_allocation['PPNO_New2']
)

In [None]:
# The raw PPNO column is superseded by PPNO_New, so remove it.
df_allocation = df_allocation.drop(columns=['PPNO'])

In [None]:
# Give the cleaned PPNO and the third-party award date tidier names.
tidier_names = {
    'PPNO_New': 'PPNO',
    '3rd_Party_Award_Date': 'Third_Party_Award_Date',
}
df_allocation = df_allocation.rename(tidier_names, axis='columns')

## Cleaning up completion, allocation, & 3rd Party dates 

In [None]:
# List every distinct raw allocation date to see what needs cleaning.
df_allocation["Allocation_Date"].unique().tolist()

In [None]:
# Turn fiscal-year labels (and one mistyped date) into real date strings.
# The result is assigned back to the column: calling
# `.replace(..., inplace=True)` on a column selected from the frame is chained
# assignment, which under pandas Copy-on-Write only changes a temporary and
# leaves `df_allocation` untouched.
allocation_date_fixes = {
    "FY 26/27": "2026-12-31",
    "08/12//20": '2020-08-12 00:00:00',
    'FY 21/22': '2021-12-31',
    'FY 22/23': '2022-12-31',
    'FY 20/21': '2020-12-31',
    'FY 23/24': '2023-12-31',
    'FY 24/25': '2024-12-31',
    'FY 25/26': '2025-12-31',
}
df_allocation["Allocation_Date"] = df_allocation["Allocation_Date"].replace(allocation_date_fixes)

In [None]:
# Apply the same text clean-up to each of the three date columns:
# slashes become dashes, 'Complete' and newlines are removed, 'Pending'
# becomes 'TBD', and missing values are filled with 'TBD'.
text_substitutions = [
    ('/', '-'),
    ('Complete', ''),
    ('\n', ''),
    ('Pending', 'TBD'),
]
for date_col in ("Allocation_Date", "Third_Party_Award_Date", "Completion_Date"):
    cleaned = df_allocation[date_col]
    for pattern, replacement in text_substitutions:
        cleaned = cleaned.replace(pattern, replacement, regex=True)
    df_allocation[date_col] = cleaned.fillna('TBD')

In [None]:
# List every distinct completion date left after the generic clean-up.
df_allocation["Completion_Date"].unique().tolist()

In [None]:
# Map the remaining free-text completion dates onto parseable date strings.
#
# Fixes relative to the previous version:
#   * the result is assigned back to the column; `.replace(..., inplace=True)`
#     on a selected column is chained assignment and does not update
#     `df_allocation` under pandas Copy-on-Write;
#   * '5-16-2020' mapped to '2020,05-16 00:00:00' (comma typo), which is not a
#     well-formed date string and may be coerced to NaT later;
#   * 'Jun-26' mapped to January 2026 while every other 'Jun-YY' entry maps
#     to June 1 of that year.
completion_date_fixes = {
    # NOTE(review): the raw text reads 'June 24. 2024' but is mapped to
    # June 1 -- confirm whether the 24th was intended.
    'June 24. 2024': '2024-06-01 00:00:00',
    # The two keys containing '/', '\n' or 'Complete' can only match raw
    # values; the generic clean-up loop strips those characters first.
    '11/21/2024\n7/30/2025 (Q4)': '2024-11-21 00:00:00',
    'Jun-26': '2026-06-01 00:00:00',
    'Jun-29': '2029-06-01 00:00:00',
    'Complete\n11/12/2019': '2019-11-12 00:00:00',
    'Deallocated': '',
    'Jun-28': '2028-06-01 00:00:00',
    'Jun-25': '2025-06-01 00:00:00',
    'Jun-23': '2023-06-01 00:00:00',
    'Jun-27': '2027-06-01 00:00:00',
    'Jan-25': '2025-01-01 00:00:00',
    # Two dates run together once the newline is removed; the later one wins.
    '11-21-20247-30-2025 (Q4)': '2025-07-30 00:00:00',
    '6-30-202112-31-2021': '2021-12-31 00:00:00',
    '6-1-2019': '2019-06-01 00:00:00',
    '2-11-2018': '2018-02-11 00:00:00',
    '6-30-2020': '2020-06-30 00:00:00',
    ' 6-30-2018': '2018-06-30 00:00:00',
    '6-29-2020': '2020-06-29 00:00:00',
    '11-1-2019': '2019-11-01 00:00:00',
    ' 12-10-2018': '2018-12-10 00:00:00',
    ' 11-13-2019': '2019-11-13 00:00:00',
    '3-30-2020': '2020-03-30 00:00:00',
    ' 6-30-2020': '2020-06-30 00:00:00',
    '11-12-2019': '2019-11-12 00:00:00',
    '1-31-2020': '2020-01-31 00:00:00',
    '8-30-2020': '2020-08-30 00:00:00',
    '5-16-2020': '2020-05-16 00:00:00',
    '5-7-2020': '2020-05-07 00:00:00',
}
df_allocation["Completion_Date"] = df_allocation["Completion_Date"].replace(completion_date_fixes)

In [None]:
# Map the irregular third-party award dates onto parseable values.
# The result is assigned back to the column; `.replace(..., inplace=True)` on
# a selected column is chained assignment and does not update `df_allocation`
# under pandas Copy-on-Write.
third_party_date_fixes = {
    # Key reproduces the misspelling found in the data.
    'Augsut 12, 2021': '2021-08-12 00:00:00',
    # '43435' and '43497' appear to be Excel date serial numbers stored as text.
    '43435': '2018-12-01 00:00:00',
    '07-29-2020': '2020-07-29 00:00:00',
    '43497': '2019-02-01 00:00:00',
    'TBD 6-24-2021': 'TBD',
    'TBD 6-30-2022': 'TBD',
}
df_allocation["Third_Party_Award_Date"] = df_allocation["Third_Party_Award_Date"].replace(
    third_party_date_fixes
)

In [None]:
# Parse each cleaned date column into a `<name>_New` column of plain dates;
# anything that cannot be parsed (e.g. 'TBD') is coerced to a missing value.
parsed_dates = {
    f"{date_col}_New": pd.to_datetime(df_allocation[date_col], errors="coerce").dt.date
    for date_col in ("Allocation_Date", "Third_Party_Award_Date", "Completion_Date")
}
df_allocation = df_allocation.assign(**parsed_dates)

In [None]:
# Display the dtype of every allocation column after the date conversion.
df_allocation.dtypes

In [None]:
# Count the missing values in each allocation column.
missing_mask = df_allocation.isna()
missing_mask.sum()

In [None]:
#df_allocation.to_csv("./test_df_allocation.csv") just a line to test 

## Final CSV Version

In [None]:
# Remove the crosswalk helper column and the raw text date columns ...
superseded_columns = ['PPNO_New2', 'Allocation_Date', 'Completion_Date', 'Third_Party_Award_Date']
df_allocation = df_allocation.drop(columns=superseded_columns)
# ... then let the parsed `<name>_New` columns take over the original names.
restored_names = {
    f"{date_col}_New": date_col
    for date_col in ('Allocation_Date', 'Completion_Date', 'Third_Party_Award_Date')
}
df_allocation = df_allocation.rename(restored_names, axis='columns')

In [None]:
# Write the cleaned allocation table to disk without the index column.
final_allocation_csv = "./final_df_allocation.csv"
df_allocation.to_csv(final_allocation_csv, index=False)

# Cleaning Project Sheet



In [None]:
# Display the column labels of the project subset before cleaning it.
df_project.columns

## Filling NA for TIRCP, Allocated, and Expended Amounts

In [None]:
# Fill missing award, allocated and expended amounts with 0.
# `fillna` with a per-column dict returns a new frame, so `df_project` is
# rebound instead of being modified through a multi-column assignment on what
# may still be a slice of `project` (the source of SettingWithCopyWarning).
amount_columns = ['TIRCP_Award_Amount_($)', 'Allocated_Amount', 'Expended_Amount']
df_project = df_project.fillna({column: 0 for column in amount_columns})

## Cleaning up PPNO Numbers based on Allocation Sheet

In [None]:
# Keep only the first five characters of each project PPNO.
project_ppno_first_five = df_project['PPNO'].str[:5]
df_project = df_project.assign(PPNO_New=project_ppno_first_five)

In [None]:
# Load the project PPNO crosswalk workbook.
# The path is passed directly so pandas opens and closes the file itself;
# the previous `open(...)` handle was never closed.
project_ppno = pd.read_excel('Projects_PPNO.xlsx')

In [None]:
# Left-join the project crosswalk, matching on award year and local agency.
project_crosswalk_keys = ["Award_Year", "Local_Agency"]
df_project2 = df_project.merge(project_ppno, how="left", on=project_crosswalk_keys)

In [None]:
# Use the crosswalk PPNO (PPNO_New2) wherever the merge supplied one and
# keep the five-character PPNO_New everywhere else.
# The previous row-wise `apply` detected missing values by comparing
# str(value) with 'nan'; `isna()` states that test directly, also treats
# None/pd.NA as missing, and avoids a Python-level call per row.
project_crosswalk_missing = df_project2['PPNO_New2'].isna()
df_project2['PPNO_New'] = df_project2['PPNO_New'].where(
    project_crosswalk_missing, df_project2['PPNO_New2']
)

In [None]:
# Cast PPNO_New to strings so every value has the same type.
ppno_as_text = df_project2['PPNO_New'].astype('str')
df_project2 = df_project2.assign(PPNO_New=ppno_as_text)

In [None]:
# Collect the distinct PPNOs of each table as sets so they can be compared.
PPNO_project = {*df_project2["PPNO_New"].unique().tolist()}
PPNO_allocation = {*df_allocation["PPNO"].unique().tolist()}

In [None]:
PPNO_project  # display the set of distinct project PPNOs

In [None]:
PPNO_allocation  # display the set of distinct allocation PPNOs

In [None]:
# PPNOs that occur in the project table but not in the allocation table.
PPNO_project.difference(PPNO_allocation)

## Cleaning up Award Recipients
- Matching up Award Recipient in allocation sheet & Local Agency in projects sheet
    - Using Allocation as the "source of truth" data set for the agencies who received the awards 
    - [Stack Overflow Ref](https://stackoverflow.com/questions/61811137/based-on-partial-string-match-fill-one-data-frame-column-from-another-dataframe)
    

In [None]:
# Force both agency-name columns to strings before comparing them.
df_project2 = df_project2.assign(Local_Agency=df_project2['Local_Agency'].astype('str'))
df_allocation = df_allocation.assign(Award_Recipient=df_allocation['Award_Recipient'].astype('str'))


In [None]:
# The allocation sheet is treated as the "source of truth" for which agencies
# received awards: gather its distinct recipient names and display them.
distinct_recipients = df_allocation['Award_Recipient'].unique()
allocation_agencies = distinct_recipients.tolist()
allocation_agencies

In [None]:
# Partial-string match (Stack Overflow approach): for each local agency,
# concatenate -- with no separator -- every allocation agency name that
# occurs inside it.
def _matching_agencies(local_agency):
    """Return all allocation agency names contained in *local_agency*, joined together."""
    return ''.join(agency for agency in allocation_agencies if agency in local_agency)


df_project2['Award_Recipient'] = df_project2['Local_Agency'].apply(_matching_agencies)

In [None]:
# Peek at the first two rows to check the matched recipients.
df_project2.head(n=2)

In [None]:
# Build the basis for a PPNO -> recipient lookup: take the two columns, sort
# by recipient then PPNO, and keep the first row seen for each PPNO.
recipient_ppno_pairs = df_project2[["Award_Recipient", "PPNO_New"]]
recipient_ppno_pairs = recipient_ppno_pairs.sort_values(by=["Award_Recipient", "PPNO_New"])
rename_recipients = recipient_ppno_pairs.drop_duplicates(subset=["PPNO_New"])


In [None]:
rename_recipients = rename_recipients.set_index('PPNO_New').T.to_dict('list')

In [None]:
# Display the PPNO-keyed recipient lookup built above.
rename_recipients

In [None]:
# Manually supply the recipient for PPNOs whose lookup entry came out blank.
blank_recipient_fixes = {
    'CP002': 'Southern California Regional Rail Authority (Metrolink)',
    'CP006': 'San Francisco Municipal Transportation Agency',
    'CP013': 'Monterey-Salinas Transit',
    'CP025': 'San Joaquin Regional Rail Commission',
    'CP026': 'San Joaquin Regional Rail Commission',
    'CP055': 'Bay Area Rapid Transit District',
    'CP065': 'Los Angeles County Metropolitan Transportation Authority and Southern California Regional Rail Authority',
    'CP071': 'City of Santa Monica',
    'CP074': 'Transit Joint Powers Authority for Merced County',
    'CP019': 'Antelope Valley Transit Authority & Long Beach Transit',
    'CP068': 'San Bernardino County Transportation Authority (SBCTA) & Omnitrans',
}
rename_recipients.update(blank_recipient_fixes)

In [None]:
# Overwrite Award_Recipient with the value looked up from each row's PPNO.
recipient_from_ppno = df_project2['PPNO_New'].map(rename_recipients)
df_project2 = df_project2.assign(Award_Recipient=recipient_from_ppno)

In [None]:
# Crosswalk for the values that didn't populate: maps agency-name variants
# (keys) to the spelling that should replace them (values).
# Keys are compared against cell contents exactly, so the doubled spaces,
# doubled names, trailing spaces and embedded newline inside the quoted keys
# presumably mirror the raw values -- do not "tidy" them.
# NOTE(review): two replacement values end with a trailing space
# ('Los Angeles County Metropolitan Transportation Authority ' and
# 'Antelope Valley Transit Authority ') -- confirm that is intended.
crosswalk = {'Monterey-Salinas Transit': 'Monterey-Salinas Transit',
             'San Francisco Municipal  Transportation Agency': 'San Francisco Municipal Transportation Agency',
            'Southern California  Regional Rail Authority': 'Southern California Regional Rail Authority',
            'San Joaquin Regional Rail Commission / San Joaquin Joint Powers Authority':  'San Joaquin Joint Powers Authority and San Joaquin Regional Rail Commission',
            'Bay Area Rapid Transit (BART)': 'Bay Area Rapid Transit District',
            'LA County Metropolitan Transportation Authority, So Cal Regional Rail Authority (Metrolink)':  'Los Angeles County Metropolitan Transportation Authority and Southern California Regional Rail Authority',
            'Santa Monica Big Blue Bus':  'City of Santa Monica',
            'Transit Joint Powers Authority of Merced County':  'Transit Joint Powers Authority for Merced County',
             'Los Angeles County Metropolitan Transportation AuthorityLos Angeles County Metropolitan Transportation Authority': 'Los Angeles County Metropolitan Transportation Authority ',
             'Antelope Valley Transit AuthorityAntelope Valley Transit Authority': 'Antelope Valley Transit Authority ',
             'Sacramento Regional Transit DistrictSacramento Regional Transit District ': 'Sacramento Regional Transit District',
             'Capitol Corridor Joint Powers AuthorityCapitol Corridor Joint Powers Authority  ': 'Capitol Corridor Joint Powers Authority',
             'San Joaquin Regional\nRail Commission / San Joaquin Joint Powers Authority': 'San Joaquin Joint Powers Authority and San Joaquin Regional Rail Commission'
}

In [None]:
# Apply the crosswalk: any cell in the frame equal to one of its keys is
# swapped for the corresponding value.
df_project2 = df_project2.replace(to_replace=crosswalk)

In [None]:
# Spot-check five randomly chosen rows.
df_project2.sample(n=5)

In [None]:
#df_project2.to_csv("./test_df_project.csv") just a line to check the data frame 

## Final Version

In [None]:
# The raw PPNO and the crosswalk helper column are no longer needed.
obsolete_ppno_columns = ['PPNO', 'PPNO_New2']
df_project2 = df_project2.drop(columns=obsolete_ppno_columns)

In [None]:
# Let the cleaned PPNO_New column take over the PPNO name.
df_project2 = df_project2.rename({'PPNO_New': 'PPNO'}, axis='columns')

In [None]:
# Write the cleaned project table to disk without the index column.
final_project_csv = "./final_df_project.csv"
df_project2.to_csv(final_project_csv, index=False)