# Crosswalking our data

In [351]:
import numpy as np
import pandas as pd
from siuba import *
from calitp import *
import intake
import data_prep
import fuzzymatcher
# Let pandas display up to 140 rows before it switches to a truncated view.
pd.options.display.max_rows = 140

In [352]:
# Load the four inputs through the project-local data_prep helpers.
# NOTE(review): what each loader reads and returns is defined in data_prep,
# not in this notebook — check there for sources and schemas.
df = data_prep.load_grantprojects()
vehicles = data_prep.load_vehiclesdata()
agency_info = data_prep.load_agencyinfo()
gtfs_status = data_prep.load_catalog_gtfs()

# Vehicle Data

In [353]:
# Drop the per-age columns: the 'Years Old:' label column, the single-year
# columns labelled with the integers 0 through 12, and the grouped age buckets.
age_columns = ['Years Old:', *range(13), '13-15', '16-20', '21-25', '26-30', '31-60', '60+']
vehicles = vehicles.drop(columns=age_columns)

In [354]:
# Normalize the column names to snake_case (the resulting names are visible in
# the head() output below, e.g. average_age_of_fleet__in_years_).
# NOTE(review): to_snakecase is not defined in this notebook; it arrives through
# one of the star imports (siuba or calitp) — TODO confirm which.
vehicles = to_snakecase(vehicles)

In [356]:
# (rows, columns) of the vehicle table after the age columns were dropped.
vehicles.shape

(705, 13)

In [357]:
# Peek at the first two rows to check the cleaned column names.
vehicles.head(2)

Unnamed: 0,agency,city,state,legacy_ntd_id,ntd_id,organization_type,reporter_type,primary_uza_population,agency_voms,vehicle_type,total_vehicles,average_age_of_fleet__in_years_,average_lifetime_miles_per_vehicle
25,Los Angeles County Metropolitan Transportation...,Los Angeles,CA,9154,90154,Independent Public Agency or Authority of Tran...,Full Reporter,12150996,3482,Van,1208,1.440397,31456.6134
26,Los Angeles County Metropolitan Transportation...,Los Angeles,CA,9154,90154,Independent Public Agency or Authority of Tran...,Full Reporter,12150996,3482,Trucks and other Rubber Tire Vehicles (Service),961,,


In [358]:
# Keep only the agency name and NTD ID; these two columns form the vehicle crosswalk.
vehicles_crosswalk = vehicles.loc[:, ['agency', 'ntd_id']]

In [359]:
# Drop repeated (agency, ntd_id) rows so each distinct pair appears once.
# The head() output above shows the same agency on more than one row.
vehicles_crosswalk = vehicles_crosswalk.drop_duplicates()

# Agency Data
* Our source of truth

In [360]:
# Build the agency crosswalk from the agency name and NTD ID columns.
agency_info_crosswalk = agency_info.loc[:, ['agency_name', 'ntd_id']]

In [361]:
# Drop duplicate (agency_name, ntd_id) rows.
agency_info_crosswalk = agency_info_crosswalk.drop_duplicates()

In [362]:
# Rename agency_name to crosswalk_agency so it cannot be confused with the
# vehicle data's own 'agency' column after the two tables are merged.
agency_info_crosswalk = agency_info_crosswalk.rename(
    {'agency_name': 'crosswalk_agency'},
    axis='columns',
)

# Crosswalk Agency and Vehicle Data
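* Comparing the Agency Info & Vehicle Data sets to check that they have the same agency names & NTD IDs.
* Merging with `how='left'` so that every row of the vehicle crosswalk is kept.
* Every vehicle agency found a match on `ntd_id`: we get 0 for `left_only`. `right_only` is always 0 in a left merge, so it does not tell us whether Agency Info contains agencies that are missing from the Vehicle Data. Names can still differ slightly even when the NTD ID matches (e.g. the San Francisco row).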

In [363]:
# Left-join the vehicle crosswalk to the agency crosswalk on NTD ID.
# indicator=True adds a _merge column; because this is a left join it can only
# be 'both' or 'left_only' — 'right_only' never occurs.
df_all = vehicles_crosswalk.merge(
    agency_info_crosswalk.drop_duplicates(),
    how='left',
    on=['ntd_id'],
    indicator=True,
)
df_all

Unnamed: 0,agency,ntd_id,crosswalk_agency,_merge
0,Los Angeles County Metropolitan Transportation...,90154,Los Angeles County Metropolitan Transportation...,both
1,Orange County Transportation Authority,90036,Orange County Transportation Authority,both
2,Access Services,90157,Access Services,both
3,"City and County of San Francisco, dba: San Fra...",90015,City and County of San Francisco,both
4,San Diego Metropolitan Transit System,90026,San Diego Metropolitan Transit System,both
...,...,...,...,...
213,Playa Vista Parks and Landscape Corp.,A0003-99446,Playa Vista Parks and Landscape Corp.,both
214,City of Sierra Madre,A0003-99447,City of Sierra Madre,both
215,City of El Segundo,A0003-99449,City of El Segundo,both
216,City of Hawaiian Gardens,A0003-99450,City of Hawaiian Gardens,both


In [364]:

# Tally rows per _merge category (siuba count writes the tally to column n),
# largest first.
df_all >> count(_._merge) >> arrange(-_.n)

Unnamed: 0,_merge,n
2,both,218
0,left_only,0
1,right_only,0


# 5311 Data Crosswalk
* Matching the 5311 organization names to the agency names in the Vehicle Data with fuzzymatcher (https://pbpython.com/record-linking.html) to check that they line up.

In [365]:
# Location of the cleaned 5311 BlackCat CSV in Google Cloud Storage.
# NOTE(review): the folder name has a space before the trailing slash
# ("5311 /") — confirm this matches the actual bucket path.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/5311 /"
FILE_NAME = "5311_blackcat_clean.csv"
# NOTE(review): reading a gs:// URL with pandas presumably relies on gcsfs
# and valid credentials in the environment — TODO confirm.
df_5311 = pd.read_csv(f"{GCS_FILE_PATH}{FILE_NAME}")

In [366]:
# One row per distinct 5311 organization name; this is the left side of the fuzzy match.
crosswalk_5311 = df_5311[['Organization_Name']].drop_duplicates()

In [367]:
# Columns compared by the fuzzy join: the 5311 organization name (left table)
# against the vehicle data's agency name (right table).
left_on = ["Organization_Name"]
right_on = ["agency"]

In [368]:
# Fuzzy-match each distinct 5311 organization name to an agency in the vehicle
# crosswalk; the result carries a best_match_score column alongside the match.
matched_results = fuzzymatcher.fuzzy_left_join(
    crosswalk_5311,
    vehicles_crosswalk,
    left_on=left_on,
    right_on=right_on,
)

In [369]:
# Spot-check four random matches. No random_state is passed, so the rows shown
# change on every run.
matched_results.sample(4)

Unnamed: 0,best_match_score,__id_left,__id_right,Organization_Name,agency,ntd_id
472,0.902053,61_left,52_right,San Luis Obispo Regional Transit Authority,San Luis Obispo Regional Transit Authority,90206
447,0.167093,41_left,40_right,Glenn County Transportation Commission,Riverside County Transportation Commission,90218
380,0.85132,35_left,210_right,County of Shasta Department of Public Works,County of Shasta Department of Public Works,9R02-99438
577,0.586271,74_left,13_right,Victor Valley Transit Authority,Victor Valley Transit Authority,90148


In [370]:
# Number of distinct 5311 organization names present in the fuzzy-match result.
f'we have a total of {matched_results.Organization_Name.nunique()} unique organization names...'

'we have a total of 89 unique organization names...'

# Actually joining Vehicles & 5311 Blackcat Data

In [371]:
# Fuzzy-join the full 5311 grant table to the full vehicle table on
# Organization_Name ~ agency.
# NOTE(review): a low best_match_score can still yield a match — the sample of
# matched_results above pairs 'Glenn County Transportation Commission' with
# 'Riverside County Transportation Commission' at 0.167. Consider checking or
# filtering on best_match_score before trusting the joined vehicle columns.
# NOTE(review): the right side is the full `vehicles` table, not the
# deduplicated vehicles_crosswalk, so an agency has several candidate rows
# (one per vehicle_type) — TODO confirm the row count against df_5311.
df_join = fuzzymatcher.fuzzy_left_join(df_5311,vehicles, left_on, right_on)

In [375]:
# List the joined columns: fuzzymatcher's score/id columns, then the 5311
# fields, then the vehicle fields.
df_join.columns

Index(['best_match_score', '__id_left', '__id_right', 'Grant_Fiscal_Year',
       'Funding_Program', 'Grant_Number', 'Project_Year', 'Organization_Name',
       'UPIN', 'Description', 'ALI', 'Contract_Number', 'AllocationAmount',
       'Encumbered_Amount', 'ExpendedAmount', 'ActiveBalance',
       'ClosedOutBalance', 'Project_Status', 'agency', 'city', 'state',
       'legacy_ntd_id', 'ntd_id', 'organization_type', 'reporter_type',
       'primary_uza_population', 'agency_voms', 'vehicle_type',
       'total_vehicles', 'average_age_of_fleet__in_years_',
       'average_lifetime_miles_per_vehicle'],
      dtype='object')

In [380]:
# Keep the matched agency identifiers, the 5311 grant fields and the vehicle
# fields, in that order. Left out: best_match_score, __id_left, __id_right,
# Organization_Name, UPIN, city, state and legacy_ntd_id.
# NOTE(review): without best_match_score and Organization_Name a weak fuzzy
# match can no longer be spotted in df_join1 — confirm that is intended.
agency_columns = ['agency', 'ntd_id', 'organization_type', 'reporter_type']
grant_columns = [
    'Grant_Fiscal_Year',
    'Funding_Program',
    'Grant_Number',
    'Project_Year',
    'Description',
    'ALI',
    'Contract_Number',
    'AllocationAmount',
    'Encumbered_Amount',
    'ExpendedAmount',
    'ActiveBalance',
    'ClosedOutBalance',
    'Project_Status',
]
vehicle_columns = [
    'primary_uza_population',
    'agency_voms',
    'vehicle_type',
    'total_vehicles',
    'average_age_of_fleet__in_years_',
    'average_lifetime_miles_per_vehicle',
]
df_join1 = df_join[agency_columns + grant_columns + vehicle_columns]

In [381]:
# Spot-check three random joined rows. No random_state is passed, so the rows
# shown change on every run.
df_join1.sample(3)

Unnamed: 0,agency,ntd_id,organization_type,reporter_type,Grant_Fiscal_Year,Funding_Program,Grant_Number,Project_Year,Description,ALI,Contract_Number,AllocationAmount,Encumbered_Amount,ExpendedAmount,ActiveBalance,ClosedOutBalance,Project_Status,primary_uza_population,agency_voms,vehicle_type,total_vehicles,average_age_of_fleet__in_years_,average_lifetime_miles_per_vehicle
6134,Redwood Coast Transit Authority,9R02-91097,Independent Public Agency or Authority of Tran...,Rural Reporter,2019,5339 (National),CA-2021-034 | 0021000216,2019,Rehab/Rebuild STD 35 Ft Bus,111402,64GC19-01465,31777.0,31777.0,0.0,31777.0,0,Open,0,8,Trucks and other Rubber Tire Vehicles (Service),1,,
211,City of Arvin,9R02-91027,"City, County or Local Government Unit or Depar...",Rural Reporter,2016,Section 5311,CA-2017-025 | 0017000158,2016,Operating Assistance Sliding Scale,300902,64BO17-00365,62188.0,62188.0,62188.0,0.0,0,Open,0,6,Cutaway,3,9.333333,0.0
6819,El Dorado County Transit Authority,90229,Independent Public Agency or Authority of Tran...,Reduced Reporter,2019,Section 5311,CA-2020-015 | 0020000057,2019,Operating Assistance Sliding Scale,300902,64BO19-00950,509322.0,509322.0,509322.0,0.0,0,Open,1723634,29,Trucks and other Rubber Tire Vehicles (Service),3,,
