In [251]:
import numpy as np
import pandas as pd
from siuba import *
from calitp import *
import intake
import data_prep
import fuzzymatcher
pd.options.display.max_rows = 250

In [252]:
# Load the three inputs through the project-local data_prep helpers.
# NOTE(review): gtfs_status is not referenced again in the visible cells — confirm it is still needed.
df = data_prep.load_grantprojects()
vehicles = data_prep.load_vehiclesdata()
gtfs_status = data_prep.load_catalog_gtfs()

# Cleaning up Vehicle Data: stuff to put into the script
* Filter the vehicle data down to rural reporters only.

In [253]:
# Collapse the thirteen single-year age columns (labelled 0 through 12) into one "0-12" bin.
vehicles['0-12'] = vehicles[list(range(13))].sum(axis=1)

In [254]:
# The single-year columns 0 through 12 are now summarised by the "0-12" bin, so remove them.
vehicles = vehicles.drop(columns=list(range(13)))

In [255]:
# Keep only the rows reported by rural reporters.
vehicles = vehicles[vehicles['Reporter Type'].eq('Rural Reporter')]

In [256]:
# Convert the column names to snake_case.
# to_snakecase is not defined in this notebook; presumably it comes from one of the star imports — TODO confirm.
vehicles = to_snakecase(vehicles)

In [257]:
# List the distinct vehicle types so they can be grouped into broader categories.
vehicles['vehicle_type'].unique().tolist()

['Van',
 'Trucks and other Rubber Tire Vehicles (Service)',
 'Minivan',
 'Cutaway',
 'Bus',
 'Automobile',
 'Automobiles (Service)',
 'Over-the-road Bus',
 'Sports Utility Vehicle']

In [258]:
# Vehicle types belonging to each broader group.
# replace_modes reads Automobiles and Bus; everything else falls through to "Vans".
Automobiles = [
    'Automobile',
    'Automobiles (Service)',
    'Sports Utility Vehicle',
]
Bus = [
    'Bus',
    'Over-the-road Bus',
]
Vans = [
    'Van',
    'Trucks and other Rubber Tire Vehicles (Service)',
    'Minivan',
    'Cutaway',
]

In [259]:
def replace_modes(row):
    """Return the broad vehicle group for one row of the vehicles frame.

    Parameters
    ----------
    row : object exposing a ``vehicle_type`` attribute
        Typically a row handed over by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    str
        ``"Automobiles"`` when ``vehicle_type`` is in the notebook-level
        ``Automobiles`` list, ``"Bus"`` when it is in ``Bus``, and ``"Vans"``
        for every other value.
    """
    if row.vehicle_type in Automobiles:
        # Previously returned "Rail", which mislabelled every automobile/SUV row;
        # none of the vehicle types handled here are rail vehicles.
        return "Automobiles"
    if row.vehicle_type in Bus:
        return "Bus"
    # Anything not explicitly listed above is treated as a van.
    return "Vans"

In [260]:
# Label every row with its vehicle group; the function is passed directly
# because the lambda wrapper only forwarded its argument.
vehicles["vehicle_groups"] = vehicles.apply(replace_modes, axis=1)

In [261]:
# Sanity check: list the reporter types that remain after the rural filter.
vehicles['reporter_type'].unique().tolist()

['Rural Reporter']

In [262]:
# Preview the first two rows of the cleaned vehicles frame.
vehicles.head(2)

Unnamed: 0,agency,city,state,legacy_ntd_id,ntd_id,organization_type,reporter_type,primary_uza_population,agency_voms,vehicle_type,years_old:,_13_15,_16_20,_21_25,_26_30,_31_60,_60+,total_vehicles,average_age_of_fleet__in_years_,average_lifetime_miles_per_vehicle,_0_12,vehicle_groups
1362,Fresno County Rural Transit Agency,Fresno,CA,9R02-025,9R02-91007,Independent Public Agency or Authority of Tran...,Rural Reporter,0,77,Van,,0,0,0,0,0,0,18,2.555556,0.0,18,Vans
1363,Fresno County Rural Transit Agency,Fresno,CA,9R02-025,9R02-91007,Independent Public Agency or Authority of Tran...,Rural Reporter,0,77,Trucks and other Rubber Tire Vehicles (Service),,0,0,0,0,0,0,3,,,3,Vans


### Looking at unique rural agencies

In [263]:
# Keep only the agency name and NTD ID columns for the crosswalk.
vehicles_crosswalk = vehicles.loc[:, ['agency', 'ntd_id']]

In [264]:
# Drop repeated (agency, ntd_id) pairs so each combination appears once in the crosswalk.
vehicles_crosswalk = vehicles_crosswalk.drop_duplicates()

In [265]:
# Show the crosswalk ordered by agency name (display only; the result is not assigned).
vehicles_crosswalk.sort_values(by='agency')

Unnamed: 0,agency,ntd_id
6668,Alpine County Local Transportation Commission,9R02-91116
4637,Amador Regional Transit System,9R02-91000
5797,Calaveras Transit Agency,9R02-99442
6451,City of Arcata,9R02-91018
5788,City of Arvin,9R02-91027
6001,City of Auburn,9R02-91032
6460,City of California City,9R02-91111
6665,"City of Chowchilla, dba: Chowchilla Area Transit",9R02-91071
5786,"City of Corcoran, dba: Corcoran Area Transit",9R02-91002
6004,City of Dinuba,9R02-91040


# Using Organizations CSV (Put in Script Later)
* Do we only need these two columns, along with the NTD ID and ITP ID?
* `Complete static GTFS coverage (1=yes)` and `Complete RT coverage`

In [266]:
# Read the organizations export from GCS.
# NOTE(review): the folder name contains a space before the trailing slash ("5311 /") —
# confirm it matches the bucket layout.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/5311 /"
FILE_NAME = "organizations-all_organizations.csv"
organizations = pd.read_csv(GCS_FILE_PATH + FILE_NAME)

In [267]:
# Preview the first two rows of the raw organizations export.
organizations.head(2)

Unnamed: 0,Name,Organization Type,Roles,Record Creation Time,NTP ID,ITP ID,Brand,Alias',Details,Website,Parent Organization,Administrating Organization,Mobility Services Managed,Missing Static,Funding Sources for Managed Transportation,Mobility Services Operated,GTFS Datasets Produced,Service Type (from Mobility Services Managed),Currently Operating (from Mobility Services Managed),Currently Operating (from Mobility Services Operated),Service Type (from Mobility Services Operated),Headquarters,funding programs,Total VOMS (NTD),Service Area Sq Miles (NTD),Service Area Population (NTD),Caltrans District,MPO/RTPA,Planning Authority,OPM_ID_DRMT,Tracking Category,Reporting Category,Assist Category,eligibility programs,gtfs datasets,GTFS Dataset (from Mobility Services Managed),Fare Systems,GTFS Schedule Status,Fares v2 Status,Flex Status,Services Needing Alerts,Services Needing TripUpdates or VehiclePositions,# of Fixed-Route Services,# Services w/ Complete RT Status,# Fixed-Route Services w/ Static GTFS,Complete static GTFS coverage (1=yes),Complete RT coverage,>=1 GTFS feed for any service (1=yes),>= 1 complete RT set (1=yes)
0,A-Paratransit,,,9/21/2021 7:51pm,,,,,,,,,,,,East Bay Paratransit,,,,1 checked out of 1,ADA paratransit,,,,,,,,,,,,,,,,,,,,,,0,0,0,1,1,0,0
1,ABC Shuttle,Company,,10/11/2021 9:43pm,,,,,Airport Shuttle to/from John Wayne Airport.,http://www.abcshuttleatjohnwayne.com/,,,ABC Airport Shuttle,,private,ABC Airport Shuttle,,on-demand,1 checked out of 1,1 checked out of 1,on-demand,,,,,,,,,,,,,,,,,needed,,Needed - Non Priority,ABC Airport Shuttle,ABC Airport Shuttle,0,0,0,1,1,0,0


In [268]:
#put in script
# Convert the organizations column names to snake_case.
# to_snakecase is not defined in this notebook; presumably it comes from one of the star imports — TODO confirm.
organizations = to_snakecase(organizations)

In [269]:
# Inspect the renamed columns to pick the ones worth keeping.
organizations.columns

Index(['name', 'organization_type', 'roles', 'record_creation_time', 'ntp_id',
       'itp_id', 'brand', 'alias', 'details', 'website', 'parent_organization',
       'administrating_organization', 'mobility_services_managed',
       'missing_static', 'funding_sources_for_managed_transportation',
       'mobility_services_operated', 'gtfs_datasets_produced',
       'service_type__from_mobility_services_managed_',
       'currently_operating__from_mobility_services_managed_',
       'currently_operating__from_mobility_services_operated_',
       'service_type__from_mobility_services_operated_', 'headquarters',
       'funding_programs', 'total_voms__ntd_', 'service_area_sq_miles__ntd_',
       'service_area_population__ntd_', 'caltrans_district', 'mpo_rtpa',
       'planning_authority', 'opm_id_drmt', 'tracking_category',
       'reporting_category', 'assist_category', 'eligibility_programs',
       'gtfs_datasets', 'gtfs_dataset__from_mobility_services_managed_',
       'fare_systems', 

In [270]:
# put in script: keep only the identifier columns and the GTFS status/coverage columns
organizations = organizations.loc[:, [
    'name',
    'ntp_id',
    'itp_id',
    'gtfs_schedule_status',
    '#_services_w__complete_rt_status',
    '#_fixed_route_services_w__static_gtfs',
    'complete_static_gtfs_coverage__1=yes_',
    'complete_rt_coverage',
    '>=1_gtfs_feed_for_any_service__1=yes_',
    '>=_1_complete_rt_set__1=yes_',
]]

In [271]:
# Count missing values per retained column.
organizations.isna().sum()

name                                       0
ntp_id                                   582
itp_id                                   409
gtfs_schedule_status                     361
#_services_w__complete_rt_status           0
#_fixed_route_services_w__static_gtfs      0
complete_static_gtfs_coverage__1=yes_      0
complete_rt_coverage                       0
>=1_gtfs_feed_for_any_service__1=yes_      0
>=_1_complete_rt_set__1=yes_               0
dtype: int64

In [272]:
# Row and column counts of the trimmed organizations frame.
organizations.shape

(781, 10)

In [273]:
# Crosswalk view of organizations: name plus the two ID columns.
organizations_crosswalk = organizations.loc[:, ['name', 'ntp_id', 'itp_id']]

In [274]:
# put in script: rename to the column names used by vehicles_crosswalk so the
# two frames can be merged on ['agency', 'ntd_id'].
organizations_crosswalk = organizations_crosswalk.rename(
    columns={
        'name': 'agency',
        'ntp_id': 'ntd_id',
    }
)

# Help

* Only 26 records match between the organizations crosswalk and the vehicles crosswalk on agency & NTD ID.

In [275]:
# Left join organizations onto the vehicle crosswalk; the `_merge` indicator
# column records whether each organization found a vehicle match.
df_joined1 = pd.merge(
    organizations_crosswalk,
    vehicles_crosswalk.drop_duplicates(),
    how='left',
    on=['agency', 'ntd_id'],
    indicator=True,
)

In [276]:
# Tally how many organizations matched a vehicle record versus left-only rows.
df_joined1['_merge'].value_counts()

left_only     755
both           26
right_only      0
Name: _merge, dtype: int64

In [277]:
# Keep only the agencies present in both crosswalks.
# The mask is built from df_joined1 itself: the previous version indexed with
# `df_joined`, a name no cell in this notebook defines, so it either raised
# NameError on a fresh kernel or silently used a stale frame left in memory.
df_joined_both = df_joined1.loc[df_joined1['_merge'] == 'both']

In [278]:
# Remove any fully duplicated rows from the matched set.
df_joined_both = df_joined_both.drop_duplicates()

In [279]:
# Display the agencies that matched in both crosswalks.
df_joined_both

Unnamed: 0,agency,ntd_id,itp_id,_merge
14,Amador Regional Transit System,9R02-91000,11.0,both
95,City of Arcata,9R02-91018,18.0,both
98,City of Arvin,9R02-91027,21.0,both
101,City of Auburn,9R02-91032,23.0,both
122,City of California City,9R02-91111,51.0,both
155,City of Dinuba,9R02-91040,93.0,both
191,City of Guadalupe,9R02-91043,129.0,both
237,City of McFarland,9R02-91110,197.0,both
258,City of Ojai,9R02-91058,231.0,both
324,City of Solvang,9R02-91028,312.0,both


# How many agencies do not have ITP ID..
* 26 rows in the vehicle crosswalk have no ITP ID after the join (see the `isna()` count below).

In [280]:
# Same join from the other direction: start from the rural vehicle agencies and
# attach organization IDs, so left-only rows are agencies without an organizations match.
df_joined2 = pd.merge(
    vehicles_crosswalk,
    organizations_crosswalk.drop_duplicates(),
    how='left',
    on=['agency', 'ntd_id'],
    indicator=True,
)
df_joined2['_merge'].value_counts()

left_only     26
both          26
right_only     0
Name: _merge, dtype: int64

In [281]:
# Display the vehicle agencies with their ITP ID (NaN where no organization matched).
df_joined2

Unnamed: 0,agency,ntd_id,itp_id,_merge
0,Fresno County Rural Transit Agency,9R02-91007,117.0,both
1,Eastern Sierra Transit Authority,9R02-91062,99.0,both
2,Kern Regional Transit,9R02-91059,,left_only
3,Lake Transit Authority,9R02-91053,159.0,both
4,Mendocino Transit Authority,9R02-91047,198.0,both
5,Tulare County Area Transit,9R02-91055,346.0,both
6,Tuolumne County Transit,9R02-91057,,left_only
7,San Benito County LTA,9R02-91009,,left_only
8,"Mountain Area Regional Transit Authority, dba:...",9R02-91012,,left_only
9,Nevada County Transit Services,9R02-91095,,left_only


In [282]:
# Count missing values per column; the itp_id count is the number of rows without an ITP ID.
df_joined2.isna().sum()

agency     0
ntd_id     0
itp_id    26
_merge     0
dtype: int64

# 5311 Data Crosswalk Test
* Comparing the agency names in 5311 against Vehicle Data to ensure everything is the same using fuzzy matcher https://pbpython.com/record-linking.html

In [283]:
# Preview the first two rows of the grant projects frame loaded at the top of the notebook.
df.head(2)

Unnamed: 0,grant_fiscal_year,funding_program,grant_number,project_year,organization_name,upin,description,ali,contract_number,allocationamount,encumbered_amount,expendedamount,activebalance,closedoutbalance,project_status
0,2011,Section 5311,CA-18-X047 | 0012000083,2016,City of Chowchilla,BCG0000228,Operating Assistance,300902,64BO17-00368,53221.0,114511.0,53221.0,0.0,0,Open
1,2011,Section 5311,CA-18-X047 | 0012000083,2016,Madera County,BCG0000283,Buy <30-Ft Bus For Expansion,111304,64BC17-00408,110663.0,110663.0,101352.02,9310.98,0,Open


In [284]:
# put in script: snake_case the columns, then keep only the listed funding programs
df = to_snakecase(df)
subset = [
    'Section 5311',
    '5310 Exp',
    '5310 Trad',
    '5311(f) Cont',
    'CMAQ (FTA 5311)',
    'Section 5311(f)',
    '5311(f) Round 2',
]
df = df.loc[df['funding_program'].isin(subset)]

In [285]:
# Crosswalk of grant recipients: one row per distinct organization name.
crosswalk_5311 = df[['organization_name']].drop_duplicates()

### Testing join on our crosswalk subsets
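* Fuzzy left join of the matched organizations/vehicles crosswalk (`df_joined_both`) against the 5311 organization names (`crosswalk_5311`).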

In [286]:
# Join keys for the fuzzy match: crosswalk_5311 carries `organization_name`,
# df_joined_both carries `agency`.
# NOTE(review): these names describe the opposite side from how the frames are passed
# to fuzzy_left_join (df_joined_both is the left frame), so the join call swaps them.
# Consider renaming so each name matches the side it is used on.
left_on = ["organization_name"]
right_on = ["agency"]

In [287]:
# Fuzzy left join: every matched agency row is paired with its best-scoring 5311
# organization name. The key variables are named for the opposite side, so they
# are deliberately crossed: df_joined_both (left) joins on `right_on` (["agency"]),
# crosswalk_5311 (right) joins on `left_on` (["organization_name"]).
matched_results = fuzzymatcher.fuzzy_left_join(
    df_joined_both,
    crosswalk_5311,
    left_on=right_on,
    right_on=left_on,
)

### Looking at best match score...
* Tulare, Amador, and Trinity need to be filtered out.


In [288]:
# Weakest matches first, so questionable pairings are easy to spot.
matched_results.sort_values(by='best_match_score')

Unnamed: 0,best_match_score,__id_left,__id_right,agency,ntd_id,itp_id,_merge,organization_name
28,0.011467,24_left,44_right,Tulare County Area Transit,9R02-91055,346.0,both,Kings County Area Public Transit Agency
0,0.042136,0_left,43_right,Amador Regional Transit System,9R02-91000,11.0,both,Kern Regional Transit
27,0.107495,23_left,72_right,Trinity County,9R02-91035,344.0,both,Trinity County Department of Transportation
25,0.187038,21_left,69_right,Tehama County,9R02-91089,334.0,both,Tehama County Transit Agency
8,0.339274,4_left,14_right,City of California City,9R02-91111,51.0,both,City of California City
5,0.371082,1_left,4_right,City of Arcata,9R02-91018,18.0,both,City of Arcata
7,0.371082,3_left,13_right,City of Auburn,9R02-91032,23.0,both,City of Auburn
13,0.371082,9_left,25_right,City of Solvang,9R02-91028,312.0,both,City of Solvang
6,0.371082,2_left,12_right,City of Arvin,9R02-91027,21.0,both,City of Arvin
11,0.371082,7_left,19_right,City of McFarland,9R02-91110,197.0,both,City of McFarland


In [250]:
# Count the distinct `agency` values in the fuzzy-match result.
# NOTE(review): the message says "organization names" but the count is taken over the
# `agency` column — confirm which one is intended.
f'we have a total of {matched_results.agency.nunique()} unique organization names...'

'we have a total of 26 unique organization names...'