# Cleaning up Grants Data
----

There are three sheets in the TIRCP workbook. However, it seems like the summary reports mostly rely on two: one called project tracking and one called allocation tracking, so I am only loading those in.


In [1]:
import pandas as pd
import math
!pip install openpyxl



In [2]:
#read in files 
project = pd.read_excel(open('Raw Project Tracking Sheet.xlsx','rb'), sheet_name='Project Tracking DRAFT')  
allocation = pd.read_excel(open('Raw Project Tracking Sheet.xlsx','rb'), sheet_name='Agreement Allocations DRAFT')  


In [3]:
#cleaning up spaces in columns
project.columns = project.columns.str.strip().str.replace(' ', '_')
allocation.columns = allocation.columns.str.strip().str.replace(' ', '_')

In [4]:
(project.columns)

Index(['Award_Year', 'Project_#', 'Local_Agency', 'Vendor_ID_#',
       'Project_Title', 'PPNO', 'District', 'County', 'Key_Project_Elements',
       'Master_Agreement_Number', 'Master_Agreement_Expiration_Date',
       'Project_Manager', 'Regional_Coordinator',
       'Technical_Assistance-CALTP_(Y/N)', 'Technical_Assistance-Fleet_(Y/N)',
       'Technical_Assistance-Network_Integration_(Y/N)',
       'Technical_Assistance-Priority_Population_(Y/N)', 'Total_Project_Cost',
       'TIRCP_Award_Amount_($)', 'Allocated_Amount', 'Unallocated_Amount',
       'Percentge_Allocated', 'Expended_Amount', 'Other_Funds_Involved',
       'Award_Cycle', 'Local_Agency_Address', 'Local_Agency_City',
       'Local_Agency_Zip', 'Local_Agency_Contact', 'Local_Agency_Email',
       'Local_Agency_Phone_Number', 'Comments/Additional_Contacts'],
      dtype='object')

In [5]:
(allocation.columns)

Index(['Award_Year', 'Project_#', 'Award_Recipient', 'Implementing_Agency',
       'PPNO', 'Project_ID', 'EA', 'Components', 'Phase', 'Allocation_Amount',
       'Expended_Amount', 'SB1_Funding', 'SB1_Budget_Year', 'GGRF_Funding',
       'GGRF_Budget_Year', 'CTC_Financial_Resolution',
       'CTC_Allocation_Amendment', 'CTC_Waiver', 'CTC_CalSTA_Waiver',
       'Allocation_Date', 'Completion_Date', 'PSA_#', 'CT_Document_#',
       '3rd_Party_Award_Date', 'LED', 'Date_Branch_Chief_Receives_PSA',
       'Date_Regional_Coordinator_Receives_PSA', 'Date_OC_Receives_PSA',
       'Date_OPM_Receives_PSA', 'Date_Legal_Receives_PSA',
       'Date_Returned_to_PM', 'Date_PSA_Sent_to_Local_Agency',
       'Date_PSA_Approved_by_Local_Agency', 'Date_Signed_by_DRMT',
       'PSA_Expiry_Date', 'LONP', 'Prior_Fiscal_Years_to_2020',
       'Fiscal_Year_2020-2021', 'Fiscal_Year_2021-2022',
       'Fiscal_Year_2022-2023', 'Fiscal_Year_2023-2024',
       'Fiscal_Year_2024-2025', 'Fiscal_Year_2025-2026',
    

### Note: Third party award date is called CON Contract Award Date in TIRCP SAR Attachment 

In [6]:
#subsetting for only columns of interest
df_project = project[['Award_Year', 'Project_#','Local_Agency','Project_Title','PPNO',
                      'Key_Project_Elements','TIRCP_Award_Amount_($)','Allocated_Amount','Expended_Amount']]

In [7]:
df_allocation = allocation[['Award_Year','Award_Recipient', 'Implementing_Agency', 'PPNO','Phase',
                            'Allocation_Date','Completion_Date','3rd_Party_Award_Date']]

In [8]:
#strip spaces in columns
df_project.columns = df_project.columns.map(lambda x: x.strip())
df_allocation.columns = df_allocation.columns.map(lambda x: x.strip())

In [9]:
#just testing to make sure everything looks okay...
df_project.to_csv("./test_df_project.csv")
df_allocation.to_csv("./test_df_allocation.csv")

# Cleaning Allocation Sheet 

## Cleaning up PPNO, can only be 5 characters.

In [10]:
#remove the extra characters in PPNO in allocation to match the PPNO in project data frame bc there should only be five characters and numbers in each PPNO value
df_allocation = df_allocation.assign(
    PPNO_New = df_allocation['PPNO'].str.slice(start=0, stop=5)
)

In [11]:
#CSV with PPNO & Award Recipients
allocation_ppno = pd.read_csv('Allocation_PPNO_Crosswalk.csv')

In [12]:
allocation_ppno 

Unnamed: 0,Award_Year,PPNO_New2,Award_Recipient
0,2020,CP065,Los Angeles County Metropolitan Transportation...
1,2020,CP066,Los Angeles-San Diego-San Luis Obispo Rail Cor...
2,2016,1230,San Bernardino County Transportation Authority...
3,2018,1155,Transportation Agency for Monterey County


In [13]:
#Filtering out for 2021, since that entry is blank
df_allocation = df_allocation.query("Award_Year != 2021")

In [14]:
#Merge in Crosswalk 
df_allocation = pd.merge(df_allocation, allocation_ppno, on = ["Award_Year", "Award_Recipient"], how = "left")

In [15]:
type(df_allocation.iloc[0].PPNO_New2)


float

In [16]:
#some values in PPNO and PPNO_New2 are strings, some are floats...so have to convert PPNO New 2 to strings
df_allocation.PPNO_New = df_allocation.apply(lambda x: x.PPNO_New if (str(x.PPNO_New2) == 'nan') else x.PPNO_New2, axis=1)

In [17]:
#drop old column
df_allocation = df_allocation.drop(['PPNO'], axis=1)

In [18]:
#renaming columns to something neater
df_allocation = df_allocation.rename(columns = {'PPNO_New':'PPNO', '3rd_Party_Award_Date':'Third_Party_Award_Date'})

In [19]:
df_allocation.to_csv("./test_df_allocation.csv")

## Cleaning up completion, allocation, & 3rd Party dates 

In [20]:
#cleaning up allocation dates
df_allocation.Allocation_Date.unique().tolist()

[datetime.datetime(2015, 10, 22, 0, 0),
 datetime.datetime(2016, 5, 19, 0, 0),
 datetime.datetime(2016, 6, 30, 0, 0),
 datetime.datetime(2015, 12, 10, 0, 0),
 datetime.datetime(2015, 8, 27, 0, 0),
 datetime.datetime(2016, 1, 21, 0, 0),
 datetime.datetime(2017, 6, 29, 0, 0),
 datetime.datetime(2016, 10, 20, 0, 0),
 datetime.datetime(2017, 8, 17, 0, 0),
 datetime.datetime(2018, 1, 31, 0, 0),
 datetime.datetime(2017, 1, 19, 0, 0),
 datetime.datetime(2016, 3, 17, 0, 0),
 datetime.datetime(2017, 3, 16, 0, 0),
 datetime.datetime(2017, 5, 17, 0, 0),
 datetime.datetime(2018, 8, 16, 0, 0),
 'TBD',
 datetime.datetime(2021, 6, 24, 0, 0),
 datetime.datetime(2016, 12, 8, 0, 0),
 datetime.datetime(2020, 6, 25, 0, 0),
 datetime.datetime(2019, 12, 5, 0, 0),
 datetime.datetime(2018, 10, 18, 0, 0),
 datetime.datetime(2021, 1, 28, 0, 0),
 nan,
 datetime.datetime(2018, 2, 1, 0, 0),
 datetime.datetime(2018, 5, 17, 0, 0),
 'FY 26/27',
 datetime.datetime(2017, 5, 18, 0, 0),
 datetime.datetime(2018, 6, 28, 0,

In [21]:
#Had to change FY to an actual date 
df_allocation["Allocation_Date"].replace({"FY 26/27": "2026-12-31", "08/12//20": '2020-08-12 00:00:00', 'FY 21/22': '2021-12-31',
                                         'FY 22/23': '2022-12-31','FY 20/21': '2020-12-31', 'FY 23/24': '2023-12-31','FY 24/25': '2024-12-31','FY 25/26': '2025-12-31'}, inplace =True)

In [22]:
#clean up columns in a loop
for i in ["Allocation_Date", "Third_Party_Award_Date", "Completion_Date"]:
    df_allocation[i] = df_allocation[i].replace('/', '-', regex = True).replace('Complete', '', regex = True).replace('\n', '', regex=True).replace('Pending','TBD',regex= True).fillna('TBD')

In [23]:
df_allocation.Completion_Date.unique().tolist()

[datetime.datetime(2022, 3, 30, 0, 0),
 '6-1-2019',
 datetime.datetime(2021, 6, 30, 0, 0),
 datetime.datetime(2018, 9, 30, 0, 0),
 '2-11-2018',
 '6-30-2020',
 datetime.datetime(2020, 9, 30, 0, 0),
 ' 6-30-2018',
 '6-29-2020',
 '11-1-2019',
 ' 12-10-2018',
 ' 11-13-2019',
 '3-30-2020',
 datetime.datetime(2022, 9, 30, 0, 0),
 datetime.datetime(2021, 12, 30, 0, 0),
 datetime.datetime(2021, 9, 30, 0, 0),
 '5-16-2020',
 datetime.datetime(2024, 6, 30, 0, 0),
 'TBD',
 'June 24. 2024',
 datetime.datetime(2022, 12, 30, 0, 0),
 datetime.datetime(2024, 6, 24, 0, 0),
 '11-21-20247-30-2025 (Q4)',
 datetime.datetime(2022, 6, 30, 0, 0),
 datetime.datetime(2019, 5, 21, 0, 0),
 datetime.datetime(2024, 7, 25, 0, 0),
 datetime.datetime(2021, 12, 31, 0, 0),
 datetime.datetime(2024, 1, 28, 0, 0),
 datetime.datetime(2022, 10, 31, 0, 0),
 datetime.datetime(2022, 1, 16, 0, 0),
 datetime.datetime(2018, 2, 1, 0, 0),
 datetime.datetime(2022, 8, 22, 0, 0),
 datetime.datetime(2022, 7, 31, 0, 0),
 '5-7-2020',
 date

In [24]:
#cleaning up completion dates
df_allocation["Completion_Date"].replace({ 
    'June 24. 2024': '2024-06-01 00:00:00',  
    '11/21/2024\n7/30/2025 (Q4)': '2024-11-21 00:00:00', 
    'Jun-26': '2026-01-01 00:00:00', 
     'Jun-29': '2029-06-01 00:00:00',
    'Complete\n11/12/2019': '2019-11-12 00:00:00' , 
    'Deallocated': '', 
    'Jun-28': '2028-06-01 00:00:00',  
    'Jun-25': '2025-06-01 00:00:00', 
    'Jun-23':'2023-06-01 00:00:00', 
    'Jun-27': '2027-06-01 00:00:00',
    'Jan-25': '2025-01-01 00:00:00',
    '11-21-20247-30-2025 (Q4)':'2025-07-30 00:00:00',
    '6-30-202112-31-2021': '2021-12-31 00:00:00',
    '6-1-2019': '2019-06-01 00:00:00',
    '2-11-2018': '2018-02-11 00:00:00',
     '6-30-2020': '2020-06-30 00:00:00',
    ' 6-30-2018': '2018-06-30 00:00:00',
     '6-29-2020': '2020-06-29 00:00:00',
     '11-1-2019': '2019-11-01 00:00:00',
     ' 12-10-2018': '2018-12-10 00:00:00',
     ' 11-13-2019': '2019-11-13 00:00:00',
     '3-30-2020':'2020-03-30 00:00:00',
    ' 6-30-2020': '2020-06-30 00:00:00',
    '11-12-2019': '2019-11-12 00:00:00',
    '1-31-2020': '2020-01-31 00:00:00',
    '8-30-2020': '2020-08-30 00:00:00',
    '5-16-2020': '2020,05-16 00:00:00',
     '5-7-2020': '2020-05-07 00:00:00'}, inplace =True)

In [25]:
#cleaning up 3rd Party dates
df_allocation["Third_Party_Award_Date"].replace({ 
'Augsut 12, 2021': '2021-08-12 00:00:00',
'43435': '2018-12-01 00:00:00',
'07-29-2020': '2020-07-29 00:00:00',
'43497' : '2019-02-01 00:00:00',
'TBD 6-24-2021' : 'TBD',
'TBD 6-30-2022' : 'TBD'
}, inplace =True)

In [26]:
df_allocation = df_allocation.assign(
    Third_Party_Award_Date_New = pd.to_datetime(df_allocation.Third_Party_Award_Date, errors="coerce").dt.date,
    Allocation_Date_New = pd.to_datetime(df_allocation.Allocation_Date, errors="coerce").dt.date,
    Completion_Date_New = pd.to_datetime(df_allocation.Completion_Date, errors="coerce").dt.date
)

In [27]:
df_allocation.dtypes

Award_Year                     int64
Award_Recipient               object
Implementing_Agency           object
Phase                         object
Allocation_Date               object
Completion_Date               object
Third_Party_Award_Date        object
PPNO                          object
PPNO_New2                     object
Third_Party_Award_Date_New    object
Allocation_Date_New           object
Completion_Date_New           object
dtype: object

In [28]:
#checking for nas
df_allocation.isna().sum()

Award_Year                      0
Award_Recipient                 0
Implementing_Agency             0
Phase                           1
Allocation_Date                 0
Completion_Date                 0
Third_Party_Award_Date          0
PPNO                            0
PPNO_New2                     292
Third_Party_Award_Date_New    145
Allocation_Date_New            83
Completion_Date_New            83
dtype: int64

## Final CSV Version

In [29]:
#drop old column
#df_allocation = df_allocation.drop(['PPNO_New2','Allocation_Date','Completion_Date','Third_Party_Award_Date'], axis=1)
#df_allocation = df_allocation.rename(columns = {'Allocation_Date_New':'Allocation_Date', 'Completion_Date_New':'Completion_Date', 'Third_Party_Award_Date_New':'Third_Party_Award_Date'})

In [30]:
#df_allocation.to_csv("./final_df_allocation.csv", index= False)

# Cleaning Project Sheet



In [31]:
df_project.columns

Index(['Award_Year', 'Project_#', 'Local_Agency', 'Project_Title', 'PPNO',
       'Key_Project_Elements', 'TIRCP_Award_Amount_($)', 'Allocated_Amount',
       'Expended_Amount'],
      dtype='object')

## Filling NA for TIRCP, Allocated, and Expended Amounts

In [32]:
df_project[['TIRCP_Award_Amount_($)', 'Allocated_Amount','Expended_Amount']] = df_project[['TIRCP_Award_Amount_($)', 'Allocated_Amount', 'Expended_Amount']].fillna(value=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


## Cleaning up PPNO Numbers based on Allocation Sheet

In [33]:
#slicing PPNO to be 5 characters
df_project = df_project.assign(PPNO_New = df_project['PPNO'].str.slice(start=0, stop=5)) 

In [34]:
#importing Excel crosswalk sheet
project_ppno = pd.read_excel(open('Projects_PPNO.xlsx','rb')) 

In [35]:
#Merge in Crosswalk 
df_project2 = pd.merge(df_project, project_ppno, on = ["Award_Year", "Local_Agency"], how = "left")

In [36]:
#some values in PPNO and PPNO_New2 are strings, some are floats...so have to convert PPNO New 2 to strings
df_project2.PPNO_New = df_project2.apply(lambda x: x.PPNO_New if (str(x.PPNO_New2) == 'nan') else x.PPNO_New2, axis=1)

In [37]:
#making sure PPNO_New is a string 
df_project2 = df_project2.astype({'PPNO_New': 'str'})

In [38]:
PPNO_project = set(df_project2.PPNO_New.unique().tolist())
PPNO_allocation = set(df_allocation.PPNO.unique().tolist())

In [39]:
PPNO_project #looking at list

{'1155',
 '1230',
 '2320B',
 'CP001',
 'CP002',
 'CP003',
 'CP004',
 'CP005',
 'CP006',
 'CP007',
 'CP008',
 'CP010',
 'CP011',
 'CP012',
 'CP013',
 'CP014',
 'CP015',
 'CP016',
 'CP017',
 'CP019',
 'CP020',
 'CP021',
 'CP022',
 'CP023',
 'CP025',
 'CP026',
 'CP027',
 'CP028',
 'CP029',
 'CP030',
 'CP031',
 'CP032',
 'CP033',
 'CP034',
 'CP035',
 'CP036',
 'CP039',
 'CP041',
 'CP045',
 'CP046',
 'CP047',
 'CP048',
 'CP051',
 'CP053',
 'CP054',
 'CP055',
 'CP057',
 'CP058',
 'CP059',
 'CP060',
 'CP061',
 'CP062',
 'CP063',
 'CP064',
 'CP065',
 'CP066',
 'CP067',
 'CP068',
 'CP069',
 'CP070',
 'CP071',
 'CP072',
 'CP073',
 'CP074',
 'CP075',
 'CP076',
 'CP077',
 'CP078',
 'CP079',
 'CP080'}

In [40]:
PPNO_allocation #looking at list

{'1155',
 '1230',
 '2320B',
 'CP001',
 'CP002',
 'CP003',
 'CP004',
 'CP005',
 'CP006',
 'CP007',
 'CP008',
 'CP010',
 'CP011',
 'CP012',
 'CP013',
 'CP014',
 'CP015',
 'CP016',
 'CP017',
 'CP018',
 'CP019',
 'CP020',
 'CP021',
 'CP022',
 'CP023',
 'CP024',
 'CP025',
 'CP026',
 'CP027',
 'CP028',
 'CP029',
 'CP030',
 'CP031',
 'CP032',
 'CP033',
 'CP034',
 'CP035',
 'CP036',
 'CP039',
 'CP041',
 'CP042',
 'CP043',
 'CP045',
 'CP046',
 'CP047',
 'CP048',
 'CP051',
 'CP053',
 'CP054',
 'CP055',
 'CP057',
 'CP058',
 'CP059',
 'CP060',
 'CP061',
 'CP062',
 'CP063',
 'CP064',
 'CP065',
 'CP066',
 'CP067',
 'CP068',
 'CP069',
 'CP070',
 'CP071',
 'CP072',
 'CP073',
 'CP074',
 'CP075',
 'CP076',
 'CP077',
 'CP078',
 'CP079',
 'CP080'}

In [41]:
PPNO_project - PPNO_allocation #checking for differences

set()

## Cleaning up Award Recipients
- Matching up Award Recipient in allocation sheet & Local Agency in projects sheet
    - Using Allocation as the "source of truth" data set for the agencies who received the awards 
    - [Stack Overflow Ref](https://stackoverflow.com/questions/61811137/based-on-partial-string-match-fill-one-data-frame-column-from-another-dataframe)
    

In [42]:
#make sure these are strings
df_project2 = df_project2.astype({'Local_Agency': 'str'})
df_allocation = df_allocation.astype({'Award_Recipient': 'str'})


In [43]:
#using allocation as the "source of truth" data set for the agencies who received the awards 
allocation_agencies = df_allocation.Award_Recipient.unique().tolist()
allocation_agencies

['Antelope Valley Transit Authority ',
 'Capitol Corridor Joint Powers Authority',
 'Los Angeles County Metropolitan Transportation Authority',
 'Los Angeles-San Diego-San Luis Obispo Rail Corridor Agency (LOSSAN)',
 'Monterey-Salinas Transit',
 'Orange County Transportation Authority (OCTA)',
 'Sacramento Regional Transit District',
 'San Diego Association of Governments (SANDAG)',
 'San Diego Metropolitan Transit System (MTS)',
 'San Francisco Municipal Transportation Agency',
 'San Joaquin Regional Rail Commission',
 'San Joaquin Regional Transit District',
 'Southern California Regional Rail Authority (Metrolink)',
 'Sonoma-Marin Area Rail Transit District',
 'Capitol Corridor Joint Powers Authority ',
 'Foothill Transit',
 'City of Fresno',
 'Peninsula Corridor Joint Powers Board ',
 'San Bernardino County Transportation Authority (SBCTA)',
 'Santa Clara Valley Transportation Authority',
 'Alameda Contra Costa Transit District ',
 'Anaheim Transportation Network (ATN)',
 'Antelope

In [44]:
#stack overflow method
df_project2['Award_Recipient'] = df_project2['Local_Agency'].apply(lambda x: ''.join([part for part in allocation_agencies if part in x]))

In [45]:
df_project2.head(2)

Unnamed: 0,Award_Year,Project_#,Local_Agency,Project_Title,PPNO,Key_Project_Elements,TIRCP_Award_Amount_($),Allocated_Amount,Expended_Amount,PPNO_New,PPNO_New2,Award_Recipient
0,2015,1,Antelope Valley Transit Authority (AVTA),Regional Transit Interconnectivity & Environme...,CP005,Purchase 13 60-foot articulated BRT buses and ...,24403000.0,24403000,21714177.53,CP005,,Antelope Valley Transit Authority Antelope Val...
1,2015,2,Capitol Corridor Joint Powers Authority,Travel Time Reduction Project,CP012,Track and curve improvements between San Jose ...,4620000.0,4620000,4619999.9,CP012,,Capitol Corridor Joint Powers Authority


In [46]:
# Make use-me_to_rename into dict, so you can map using key!
# Standardizes Award Recipient across same PPNO_New values
rename_recipients = df_project2[["Award_Recipient", "PPNO_New"]].sort_values(["Award_Recipient", "PPNO_New"]).drop_duplicates(subset=["PPNO_New"])


In [47]:
rename_recipients = rename_recipients.set_index('PPNO_New').T.to_dict('list')

In [48]:
rename_recipients

{'CP002': [''],
 'CP006': [''],
 'CP013': [''],
 'CP025': [''],
 'CP026': [''],
 'CP055': [''],
 'CP065': [''],
 'CP071': [''],
 'CP074': [''],
 '2320B': ['Alameda Contra Costa Transit District '],
 'CP027': ['Anaheim Transportation Network (ATN)'],
 'CP005': ['Antelope Valley Transit Authority Antelope Valley Transit Authority'],
 'CP019': ['Antelope Valley Transit Authority Antelope Valley Transit Authority'],
 'CP028': ['Antelope Valley Transit Authority Antelope Valley Transit Authority'],
 'CP059': ['Antelope Valley Transit Authority Antelope Valley Transit Authority'],
 'CP060': ['Bay Area Rapid Transit District'],
 'CP012': ['Capitol Corridor Joint Powers Authority'],
 'CP023': ['Capitol Corridor Joint Powers Authority'],
 'CP061': ['Capitol Corridor Joint Powers Authority'],
 'CP036': ['Capitol Corridor Joint Powers AuthorityCapitol Corridor Joint Powers Authority '],
 'CP016': ['City of Fresno'],
 'CP079': ['City of Fresno'],
 'CP062': ['City of Inglewood'],
 'CP029': ['City o

In [49]:
#updating dictionary to address blanks 
rename_recipients.update({'CP002': 'Southern California Regional Rail Authority (Metrolink)',
 'CP006': 'San Francisco Municipal Transportation Agency',
 'CP013': 'Monterey-Salinas Transit',
 'CP025': 'San Joaquin Regional Rail Commission',
 'CP026': 'San Joaquin Regional Rail Commission',
 'CP055': 'Bay Area Rapid Transit District',
 'CP065': 'Los Angeles County Metropolitan Transportation Authority and Southern California Regional Rail Authority',
 'CP071': 'City of Santa Monica',
 'CP074': 'Transit Joint Powers Authority for Merced County',
 'CP019': 'Antelope Valley Transit Authority & Long Beach Transit',
 'CP068': 'San Bernardino County Transportation Authority (SBCTA) & Omnitrans'})

In [50]:
df_project2 = df_project2.assign(
    Award_Recipient = df_project2.PPNO_New.map(rename_recipients))

In [51]:
#creating a dictionary for the values that didn't populate
crosswalk = {'Monterey-Salinas Transit': 'Monterey-Salinas Transit',
             'San Francisco Municipal  Transportation Agency': 'San Francisco Municipal Transportation Agency',
            'Southern California  Regional Rail Authority': 'Southern California Regional Rail Authority',
            'San Joaquin Regional Rail Commission / San Joaquin Joint Powers Authority':  'San Joaquin Joint Powers Authority and San Joaquin Regional Rail Commission',
            'Bay Area Rapid Transit (BART)': 'Bay Area Rapid Transit District',
            'LA County Metropolitan Transportation Authority, So Cal Regional Rail Authority (Metrolink)':  'Los Angeles County Metropolitan Transportation Authority and Southern California Regional Rail Authority',
            'Santa Monica Big Blue Bus':  'City of Santa Monica',
            'Transit Joint Powers Authority of Merced County':  'Transit Joint Powers Authority for Merced County',
             'Los Angeles County Metropolitan Transportation AuthorityLos Angeles County Metropolitan Transportation Authority ': 'Los Angeles County Metropolitan Transportation Authority ',
             'Antelope Valley Transit Authority Antelope Valley Transit Authority': 'Antelope Valley Transit Authority ',
             'Sacramento Regional Transit DistrictSacramento Regional Transit District ': 'Sacramento Regional Transit District',
             'Capitol Corridor Joint Powers AuthorityCapitol Corridor Joint Powers Authority ': 'Capitol Corridor Joint Powers Authority',
             'San Joaquin Regional\nRail Commission / San Joaquin Joint Powers Authority': 'San Joaquin Joint Powers Authority and San Joaquin Regional Rail Commission'
}

In [52]:
#populating
df_project2 = df_project2.replace(crosswalk)

In [53]:
df_project2.sample(5)

Unnamed: 0,Award_Year,Project_#,Local_Agency,Project_Title,PPNO,Key_Project_Elements,TIRCP_Award_Amount_($),Allocated_Amount,Expended_Amount,PPNO_New,PPNO_New2,Award_Recipient
70,2020,14,Solano Transportation Authority,Solano Regional Transit Improvements Phase 2,,"Improve the frequency, reliability, and access...",10400000.0,2900000,0.0,CP072,CP072,[Solano Transportation Authority]
16,2016,3,Foothill Transit,"Transforming California: Bus Electrification, ...",CP076,Purchase 2- zero-emission buses to extend Rout...,5000000.0,5000000,0.0,CP076,,[Foothill Transit]
21,2016,8,Orange County Transportation Authority (OCTA),OC Streetcar and OCTA System-Wide Mobile Ticke...,CP017,Construct OC Streetcar project connecting Sant...,28000000.0,28000000,0.0,CP017,,[Orange County Transportation Authority (OCTA)]
54,2018,26,Sonoma-Marin Area Rail Transit District (SMART),SMART Larkspur to Windsor Corridor,CP041,Completes critical rail segments extending rai...,21000000.0,21000000,11918142.74,CP041,,[Sonoma-Marin Area Rail Transit District]
11,2015,12,San Joaquin Regional Transit District,Bus Rapid Transit – Martin Luther King Corrido...,CP011,Bus rapid transit infrastructure along the MLK...,6841000.0,6841000,6841000.0,CP011,,[San Joaquin Regional Transit District]


In [54]:
df_project2.to_csv("./test_df_project.csv")

### Double check to make sure award recipient names are the same

In [55]:
def diff(list1, list2):
    c = set(list1).union(set(list2))  
    d = set(list1).intersection(set(list2)) 
    return list(c - d)

## Final Version

In [56]:
#drop old column
#df_project2 = df_project2.drop(['PPNO','PPNO_New2'], axis=1)

In [57]:
#renaming 
#df_project2 = df_project2.rename(columns = {'PPNO_New':'PPNO'})

In [58]:
#df_project2.to_csv("./final_df_project.csv", index= False)