In [156]:
import numpy as np
import pandas as pd
from siuba import *
from calitp import *
import intake
import data_prep
import fuzzymatcher
pd.options.display.max_rows = 250

# Script Testing
## Vehicles - Help
* Can't get grouping to work as an inner function

In [157]:
# Load the three inputs through the project's data_prep helpers.
# NOTE(review): the recorded output of this cell shows pandas SettingWithCopyWarning
# messages — presumably raised inside data_prep; confirm and fix there with .loc.
df = data_prep.load_grantprojects()
vehicles = data_prep.load_vehiclesdata()
organizations= data_prep.load_organizations_data()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [158]:
# Vehicle-type groupings, defined at module level: replace_modes looks up
# Automobiles and Bus as globals.
Automobiles = [
    'Automobile',
    'Automobiles (Service)',
    'Sports Utility Vehicle',
]
Bus = [
    'Bus',
    'Over-the-road Bus',
]
Vans = [
    'Van',
    'Trucks and other Rubber Tire Vehicles (Service)',
    'Minivan',
    'Cutaway',
]

In [159]:
def replace_modes(row):
    """Map one vehicle row to its coarse vehicle group.

    Parameters
    ----------
    row : object exposing a ``vehicle_type`` attribute, e.g. the row handed
        over by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    str
        "Automobiles" when the type is listed in the module-level
        ``Automobiles`` list, "Bus" when it is listed in ``Bus``, and "Vans"
        for everything else.
    """
    if row.vehicle_type in Automobiles:
        # This branch used to return "Rail", which mislabeled every automobile.
        return "Automobiles"
    if row.vehicle_type in Bus:
        return "Bus"
    # Catch-all: the ``Vans`` list is not consulted, so any type outside the
    # two lists above (including unexpected values) is labeled "Vans".
    return "Vans"
# Label every vehicle row with its group via a row-wise apply.
vehicles["vehicle_groups"] = vehicles.apply(replace_modes, axis=1)

# Basic Facts about our data

### Vehicles 

In [160]:
f'{vehicles.agency.nunique()} total unique agencies under the rural reporter category.'

'52 total unique agencies under the rural reporter category.'

In [161]:
# Crosswalk input: keep only the agency / NTD ID columns, one row per distinct pair.
vehicles_crosswalk = vehicles[['agency', 'ntd_id']].drop_duplicates()

### Black Cat data for only 5311 programs
* Most records list plain "Section 5311" as the funding program
* What differentiates the other program variants?

In [162]:
df.funding_program.value_counts()

Section 5311       590
Section 5311(f)    109
5311(f) Cont        41
CMAQ (FTA 5311)     31
5311(f) Round 2     27
Name: funding_program, dtype: int64

In [163]:
f'a total of {df.organization_name.nunique()} unique agencies after filtering for 5311'

'a total of 88 unique agencies after filtering for 5311'

In [164]:
# Crosswalk input: the distinct organization names from the 5311 data.
crosswalk_5311 = df[['organization_name']].drop_duplicates()

### GTFS Data
* Has NTD ID, ITP ID, and GTFS information
* After dropping all rows that do not have NTD IDs, the data went from 781 rows to 199
* Not every agency has an ITP ID.
* So only 199 agencies have NTD IDs in the organizations data set.

In [165]:
# Keep only the organizations that have an NTD ID (rows with a missing ntd_id are dropped).
organizations2 = organizations.dropna(subset=['ntd_id'])

In [166]:
organizations2.isna().sum()

agency                                    0
ntd_id                                    0
itp_id                                   12
gtfs_schedule_status                     18
#_services_w__complete_rt_status          0
#_fixed_route_services_w__static_gtfs     0
complete_static_gtfs_coverage__1=yes_     0
complete_rt_coverage                      0
>=1_gtfs_feed_for_any_service__1=yes_     0
>=_1_complete_rt_set__1=yes_              0
dtype: int64

In [167]:
f'a total of {organizations.agency.nunique()} unique agencies'

'a total of 781 unique agencies'

In [168]:
f'a total of {organizations2.agency.nunique()} unique agencies after only keeping agencies with NTD IDS'

'a total of 199 unique agencies after only keeping agencies with NTD IDS'

In [169]:
# Crosswalk input: agency name plus the NTD and ITP identifiers, taken from the
# organizations that have an NTD ID.
organizations_crosswalk = organizations2[['agency','ntd_id', 'itp_id']]

# Crosswalk Cal ITP with NTD
### Left join on Cal ITP's "organizations data set" with GTFS with vehicles
* Using a left join: organizations crosswalk on the left, vehicles on the right.
* Only half (26) of the 52 agencies in vehicles appear in Cal ITP
* 173 of the records in organizations have Cal ITP IDs but aren't in the vehicles dataset, which makes sense since there are tons of non-rural reporters in the organizations data set

In [170]:
# Left join: organizations on the left, de-duplicated vehicles crosswalk on the right.
# validate="1:1" makes pandas raise if the merge keys are not unique on both sides;
# indicator=True adds the _merge column used in the cells that follow.
df_joined1 = organizations_crosswalk.merge(
    vehicles_crosswalk.drop_duplicates(),
    how='left',
    on=['agency', 'ntd_id'],
    validate="1:1",
    indicator=True,
)

In [171]:
df_joined1._merge.value_counts()

left_only     173
both           26
right_only      0
Name: _merge, dtype: int64

### Filtering out for only agencies with both...

In [172]:
# Rows whose (agency, ntd_id) pair was found in both inputs, with duplicates removed.
df_joined_both = df_joined1.loc[df_joined1['_merge'] == 'both'].drop_duplicates()

### Filtering out for agencies that appeared in organizations but not vehicles...

In [174]:
# Rows flagged 'left_only' by the merge indicator, i.e. present in the
# organizations crosswalk with no matching (agency, ntd_id) pair in vehicles.
df_organizations_only = df_joined1.loc[df_joined1['_merge'] == 'left_only']

In [175]:
df_organizations_only

Unnamed: 0,agency,ntd_id,itp_id,_merge
0,Access Services,90157,1.0,left_only
1,Alameda-Contra Costa Transit District,90014,4.0,left_only
2,Alpine County,9R02-91116,9.0,left_only
4,Anaheim Transportation Network,90211,14.0,left_only
5,Antelope Valley Transit Authority,90121,16.0,left_only
6,Bishop Paiute Tribe,99268,40.0,left_only
7,Blue Lake Rancheria,99292,42.0,left_only
8,Butte County Association of Governments,90208,47.0,left_only
9,Calaveras County,9R02-91063,50.0,left_only
10,California Department of Transportation,9R02,,left_only


## Joining using vehicles on the left this time
* Still getting a strangely even split: 26 agencies are found only in vehicles and 26 are found in both?
* The agencies in df_joined_both (organizations on the L and vehicles on the R) and df_joined_both2 (vehicles on the L and organizations on the R) match.

In [176]:
# Same join with the sides swapped: vehicles on the left, organizations on the right.
# validate="m:1" makes pandas raise if the merge keys repeat on the right-hand side.
df_joined2 = vehicles_crosswalk.merge(
    organizations_crosswalk.drop_duplicates(),
    how='left',
    on=['agency', 'ntd_id'],
    validate="m:1",
    indicator=True,
)
df_joined2._merge.value_counts()

left_only     26
both          26
right_only     0
Name: _merge, dtype: int64

In [177]:
# Rows flagged 'both' by the merge indicator of the vehicles-on-the-left join.
df_joined_both2 = df_joined2.loc[df_joined2['_merge'] == 'both']

In [178]:
df_joined_both2.sort_values('agency')

Unnamed: 0,agency,ntd_id,itp_id,_merge
14,Amador Regional Transit System,9R02-91000,11.0,both
39,City of Arcata,9R02-91018,18.0,both
24,City of Arvin,9R02-91027,21.0,both
31,City of Auburn,9R02-91032,23.0,both
43,City of California City,9R02-91111,51.0,both
32,City of Dinuba,9R02-91040,93.0,both
40,City of Guadalupe,9R02-91043,129.0,both
51,City of McFarland,9R02-91110,197.0,both
45,City of Ojai,9R02-91058,231.0,both
30,City of Solvang,9R02-91028,312.0,both


## Going to try to use fuzzy matcher on this..
* 38 matches...out of the 52 rural agencies!

* Now I can look at NTD ID to make sure these really are the same agencies.

In [246]:
left_on = ["ntd_id"]
right_on = ["ntd_id"]

In [247]:
# Fuzzy left join of the vehicles crosswalk against the organizations crosswalk,
# using the columns named in left_on / right_on (both set to ["ntd_id"] just above).
# NOTE(review): left_on / right_on are module-level names that later cells reassign
# to other columns, so re-running this cell out of order would match on whatever
# they last held.
vehicles_organization = fuzzymatcher.fuzzy_left_join(vehicles_crosswalk, organizations_crosswalk, left_on, right_on)

In [248]:
vehicles_organization.sort_values('best_match_score')

Unnamed: 0,best_match_score,__id_left,__id_right,agency_left,ntd_id_left,agency_right,ntd_id_right,itp_id
211,0.023902,27_left,172_right,Calaveras Transit Agency,9R02-99442,Shasta County,9R02-99438,
418,0.023902,49_left,172_right,"Wasco, City of",9R02-99426,Shasta County,9R02-99438,
305,0.023902,29_left,2_right,"City of Ridgecrest, dba: Ridgerunner",9R02-91006,Alpine County,9R02-91116,9.0
159,0.023902,21_left,2_right,"Mariposa County Transit, dba: Mari-Go",9R02-91082,Alpine County,9R02-91116,9.0
7,0.023902,7_left,2_right,San Benito County LTA,9R02-91009,Alpine County,9R02-91116,9.0
58,0.023902,12_left,2_right,County of Sacramento Municipal Services Agency...,9R02-90216,Alpine County,9R02-91116,9.0
258,0.023902,28_left,172_right,Palo Verde Valley Transit Agency,9R02-99454,Shasta County,9R02-99438,
108,0.023902,16_left,2_right,"City of Dixon, dba: Readi-Ride",9R02-91041,Alpine County,9R02-91116,9.0
360,0.023902,37_left,2_right,Modoc Transportation Agency,9R02-91008,Alpine County,9R02-91116,9.0
354,0.093507,32_left,45_right,City of Dinuba,9R02-91040,City of Dinuba,9R02-91040,93.0


In [266]:
# Keeps the rows of vehicles_organization2 whose best_match_score is above 0.09.
# NOTE(review): this cell reads vehicles_organization2, but in file order that name
# is first assigned further down (the cell that applies the `subset` exclusion), so
# a fresh top-to-bottom run hits an undefined name here — the cell order needs fixing.
# (original note) cut off is City of Shafter dba Shafter Dial-A-Ride
vehicles_organization2 = vehicles_organization2.loc[vehicles_organization2['best_match_score'] > 0.09]

In [267]:
len(vehicles_organization2)

37

In [260]:
# Outer join on ntd_id only (the siuba-style spelling would be
# full_join(vehicles_crosswalk, organizations_crosswalk, on = "ntd_id")).
# Both inputs carry an `agency` column, so the result has agency_x / agency_y.
# The last line counts rows per merge-indicator value.
test_join = vehicles_crosswalk.merge(organizations_crosswalk, on="ntd_id", how="outer", indicator=True)
test_join>>count(_._merge)

Unnamed: 0,_merge,n
0,left_only,9
1,right_only,155
2,both,44


In [263]:
test_join>>filter(_._merge=='left_only')

Unnamed: 0,agency_x,ntd_id,agency_y,itp_id,_merge
7,San Benito County LTA,9R02-91009,,,left_only
12,County of Sacramento Municipal Services Agency...,9R02-90216,,,left_only
16,"City of Dixon, dba: Readi-Ride",9R02-91041,,,left_only
21,"Mariposa County Transit, dba: Mari-Go",9R02-91082,,,left_only
27,Calaveras Transit Agency,9R02-99442,,,left_only
28,Palo Verde Valley Transit Agency,9R02-99454,,,left_only
29,"City of Ridgecrest, dba: Ridgerunner",9R02-91006,,,left_only
38,Modoc Transportation Agency,9R02-91008,,,left_only
50,"Wasco, City of",9R02-99426,,,left_only


In [265]:
vehicles_crosswalk>>filter(_.agency.str.contains("City of Dixon"))

Unnamed: 0,agency,ntd_id
5149,"City of Dixon, dba: Readi-Ride",9R02-91041


In [264]:
organizations_crosswalk>>filter(_.agency.str.contains('City of Dixon'))

Unnamed: 0,agency,ntd_id,itp_id
156,City of Dixon,9R02-91040,94.0


In [269]:
# Keep the rows matched on both sides and only the vehicles-side agency name
# plus the two identifiers.
test_crosswalk_both = (
    test_join
    >> filter(_._merge == 'both')
    >> select(_.agency_x, _.ntd_id, _.itp_id)
)

In [271]:
len(test_crosswalk_both)

44

In [272]:
test_crosswalk_both.to_parquet("test_crosswalk_both.parquet")

In [273]:
testpq = pd.read_parquet("test_crosswalk_both.parquet")

In [275]:
len(testpq)

44

### Filtering out for bad match scores (stuff lower than 0.02) & clearly incorrect agencies with 'high' match scores.

In [182]:
# Agencies excluded by name before the score cut-off is applied (per the section
# heading: clearly incorrect matches despite 'high' scores).
# 'Kern Regional Transit' used to be listed twice; the duplicate is removed.
subset = [
    'Plumas County Transportation Commission',
    'Calaveras Transit Agency',
    'Alpine County Local Transportation Commission',
    'Kern Regional Transit',
    'City of Dixon, dba: Readi-Ride',
]

In [183]:
# Drop every row whose left-hand agency name appears in the manual exclusion list.
vehicles_organization2 = vehicles_organization.loc[
    ~vehicles_organization['agency_left'].isin(subset)
]

In [199]:
# Keep only rows with best_match_score above 0.02; per the original note the
# cut-off sits at City of Shafter dba Shafter Dial-A-Ride.
vehicles_organization2 = vehicles_organization2.loc[vehicles_organization2['best_match_score'] > 0.02]

In [185]:
vehicles_organization2.sort_values('best_match_score')

Unnamed: 0,best_match_score,__id_left,__id_right,agency_left,ntd_id_left,agency_right,ntd_id_right,itp_id
304,0.036304,19_left,97_right,"City of Shafter, dba: Shafter Dial-A-Ride",9R02-91120,City of Shafter,9R02-91120,303.0
305,0.099742,20_left,136_right,"Madera County, dba: Madera County Connection T...",9R02-91005,Madera County,9R02-91005,188.0
254,0.11753,18_left,51_right,"City of Eureka, dba: Eureka Transit Service",9R02-91093,City of Eureka,9R02-91093,108.0
57,0.14435,9_left,144_right,Nevada County Transit Services,9R02-91095,Nevada County,9R02-91095,221.0
420,0.157668,23_left,39_right,"City of Corcoran, dba: Corcoran Area Transit",9R02-91002,City of Corcoran,9R02-91002,78.0
831,0.157668,50_left,79_right,"City of Needles, dba: Needles Area Transit",9R02-91020,City of Needles,9R02-91020,220.0
613,0.157668,41_left,102_right,"City of Taft, dba: Taft Area Transit",9R02-91066,City of Taft,9R02-91066,330.0
203,0.171887,17_left,174_right,"County of Siskiyou, dba: Siskiyou County Transit",9R02-91048,Siskiyou County,9R02-91048,83.0
667,0.252084,43_left,31_right,City of California City,9R02-91111,City of California City,9R02-91111,51.0
673,0.254328,46_left,35_right,"City of Chowchilla, dba: Chowchilla Area Transit",9R02-91071,City of Chowchilla,9R02-91071,65.0


In [216]:
len(vehicles_organization2)

38

### List of agencies that are totally wrong
* Agencies in this list:
  1. 'Plumas County Transportation Commission'
  2. 'Calaveras Transit Agency'
  3. 'Alpine County Local Transportation Commission'
  4. 'Kern Regional Transit'
  5. 'City of Dixon, dba: Readi-Ride'
  6. 'Kern Regional Transit' (duplicate of item 4)
  7. 'San Benito County LTA'
  8. 'County of Sacramento Municipal Services Agency Department of Transportation'
  9. 'Mariposa County Transit, dba: Mari-Go'
  10. 'Palo Verde Valley Transit Agency'
  11. 'City of Ridgecrest, dba: Ridgerunner'
  12. 'Glenn Transit Service'
  13. 'Modoc Transportation Agency'
  14. 'City of Escalon, dba: eTrans'
  15. 'Wasco, City of'

In [203]:
# Rows of the unfiltered fuzzy-match result whose best_match_score is below 0.02.
vehicles_organizations_exclude = vehicles_organization.loc[vehicles_organization['best_match_score'] < 0.02]

In [218]:
vehicles_organizations_exclude.agency_left.tolist()

['San Benito County LTA',
 'County of Sacramento Municipal Services Agency Department of Transportation',
 'Mariposa County Transit, dba: Mari-Go',
 'Palo Verde Valley Transit Agency',
 'City of Ridgecrest, dba: Ridgerunner',
 'Glenn Transit Service',
 'Modoc Transportation Agency',
 'City of Escalon, dba: eTrans',
 'Wasco, City of']

### Prepping Crosswalk (Vehicles on L, GTFS on R using a L join) to look at 5311 Black Cat

In [186]:
# Keep only the left-hand (vehicles) columns plus itp_id: drop the match score,
# the fuzzymatcher row ids and the right-hand copies of agency / ntd_id.
# 'best_match_score' used to appear twice in this list; each label is now listed once.
vehicles_organization3 = vehicles_organization2.drop(
    columns=[
        'best_match_score',
        '__id_left',
        '__id_right',
        'agency_right',
        'ntd_id_right',
    ]
)

In [187]:
# Rename agency_left to agency.
# NOTE(review): ntd_id_left keeps its suffix (visible in the head() output below) —
# rename it as well if a plain ntd_id column is expected downstream.
vehicles_organization3 = vehicles_organization3.rename(columns = {'agency_left':'agency'})

In [219]:
len(vehicles_organization3)

38

In [200]:
vehicles_organization3.head(2)

Unnamed: 0,agency,ntd_id_left,itp_id
0,Fresno County Rural Transit Agency,9R02-91007,117.0
1,Eastern Sierra Transit Authority,9R02-91062,99.0


# 5311 Data Crosswalk Test with the stuff that matched between Cal ITP & NTD
* Comparing the agency names in 5311 against Vehicle Data to ensure everything is the same using fuzzy matcher https://pbpython.com/record-linking.html

In [189]:
left_on = ["organization_name"]
right_on = ["agency"]

### Looking at best match score...
* Only 35 matches....out of 88? 
* Either use dictionary or merge black cat back in
* Filter out for left only in organization crosswalk

In [191]:
# Fuzzy left join: 5311 organization names on the left, the cleaned vehicles / Cal ITP
# crosswalk on the right, matched on the columns held in left_on / right_on
# (["organization_name"] and ["agency"] as set in the cell above).
blackcat_vehicles_calitp = fuzzymatcher.fuzzy_left_join(crosswalk_5311, vehicles_organization3, left_on, right_on)

In [None]:
# Organization names removed from the fuzzy-match result before the score cut-off is applied.
subset2 = ['Plumas County Transportation Commission', 'Imperial County Transportation Commission', 'Glenn County Transportation Commission','Tehama County Transit Agency','Tuolumne County Transit Agency (TCTA)']

In [None]:
# Drop every row whose organization_name appears in subset2.
matched_results2 = blackcat_vehicles_calitp[~blackcat_vehicles_calitp.organization_name.isin(subset2)]

In [194]:
# Keep only rows with best_match_score above 0.06.
# NOTE(review): the earlier comment here described a 0.208 cut-off at Eureka Transit
# Service, which does not match the 0.06 used in the code — confirm the intended threshold.
matched_results2 = matched_results2.loc[matched_results2['best_match_score'] > 0.06 ]

In [195]:
len(matched_results2)

35

In [196]:
matched_results2.sort_values('organization_name')

Unnamed: 0,best_match_score,__id_left,__id_right,organization_name,agency,ntd_id_left,itp_id
837,0.0879,77_left,11_right,Amador Transit,Amador Regional Transit System,9R02-91000,11.0
24,0.281294,4_left,29_right,City of Arcata,City of Arcata,9R02-91018,18.0
103,0.281294,12_left,19_right,City of Arvin,City of Arvin,9R02-91027,21.0
104,0.281294,13_left,23_right,City of Auburn,City of Auburn,9R02-91032,23.0
105,0.281294,14_left,33_right,City of California City,City of California City,9R02-91111,51.0
0,0.210125,0_left,35_right,City of Chowchilla,"City of Chowchilla, dba: Chowchilla Area Transit",9R02-91071,65.0
838,0.374528,78_left,18_right,City of Corcoran - Corcoran Area Transit,"City of Corcoran, dba: Corcoran Area Transit",9R02-91002,78.0
25,0.281294,5_left,24_right,City of Dinuba,City of Dinuba,9R02-91040,93.0
163,0.281294,18_left,30_right,City of Guadalupe,City of Guadalupe,9R02-91043,129.0
164,0.281294,19_left,37_right,City of McFarland,City of McFarland,9R02-91110,197.0


### Looking at the agencies that had super low match scores
* In addition to the 5 agencies in subset2,
* 47 agencies were cut off due to very low scores.


In [224]:
# Count the name-filtered fuzzy matches that score below the 0.06 cut-off.
# `matched_results` is never assigned in the visible notebook, so this cell would
# raise NameError on a fresh run; the frame is rebuilt here from its inputs
# (subset2 exclusions applied, no score filter yet).
# NOTE(review): assumes that is what `matched_results` held — confirm.
len(
    blackcat_vehicles_calitp[
        ~blackcat_vehicles_calitp.organization_name.isin(subset2)
        & (blackcat_vehicles_calitp['best_match_score'] < 0.06)
    ]
)

47

In [225]:
# Show the name-filtered fuzzy matches that score below the 0.06 cut-off.
# `matched_results` is never assigned in the visible notebook, so this cell would
# raise NameError on a fresh run; the frame is rebuilt here from its inputs
# (subset2 exclusions applied, no score filter yet).
# NOTE(review): assumes that is what `matched_results` held — confirm.
blackcat_vehicles_calitp[
    ~blackcat_vehicles_calitp.organization_name.isin(subset2)
    & (blackcat_vehicles_calitp['best_match_score'] < 0.06)
]

Unnamed: 0,best_match_score,__id_left,__id_right,organization_name,agency,ntd_id_left,itp_id
3,-0.042892,3_left,2_right,Yuba-Sutter Transit Authority,Lake Transit Authority,9R02-91053,159.0
27,-0.113757,7_left,21_right,Modoc Transportation Agency,Colusa County Transit Agency,9R02-91112,74.0
32,-0.1645,8_left,11_right,San Diego Metropolitan Transit System,Amador Regional Transit System,9R02-91000,11.0
55,-0.195796,10_left,13_right,Butte County Association of Governments/ Butte...,"County of Siskiyou, dba: Siskiyou County Transit",9R02-91048,83.0
93,-0.187705,11_left,9_right,Calaveras County Public Works,Tehama County,9R02-91089,334.0
106,-0.028341,15_left,19_right,City of Dixon,City of Arvin,9R02-91027,21.0
125,-0.028341,16_left,19_right,City of Escalon,City of Arvin,9R02-91027,21.0
144,-0.028341,17_left,19_right,City of Fairfield,City of Arvin,9R02-91027,21.0
166,-0.028341,21_left,19_right,City of Porterville,City of Arvin,9R02-91027,21.0
185,-0.028341,22_left,19_right,City of Ridgecrest,City of Arvin,9R02-91027,21.0


### Looking at joining 5311 Black Cat with NTD data directly...
* Fair number of right matches with low match scores...
* only 38 records with good match scores...

In [243]:
left_on = ["agency"]
right_on = ["organization_name"]

In [244]:
# Fuzzy left join of the vehicles crosswalk (left) directly against the 5311
# organization names (right), matched on the columns held in left_on / right_on
# (["agency"] and ["organization_name"] as set in the cell above).
blackcat_ntd = fuzzymatcher.fuzzy_left_join(vehicles_crosswalk,crosswalk_5311, left_on, right_on)

In [235]:
blackcat_ntd.sort_values('best_match_score')

Unnamed: 0,best_match_score,__id_left,__id_right,organization_name,agency,ntd_id
562,-0.373641,55_left,11_right,Placer County Public Works (TART & PCT),Tehama County,9R02-91089
702,-0.342016,64_left,6_right,Santa Cruz Metropolitan Transit District,Tuolumne County Transit,9R02-91057
217,-0.31477,31_left,12_right,County of Los Angeles - Department of Public W...,County of Sacramento Municipal Services Agency...,9R02-90216
688,-0.241635,63_left,3_right,Santa Clara Valley Transportation Authority,Lake Transit Authority,9R02-91053
803,-0.238766,71_left,3_right,Transit Joint Powers Authority for Merced County,Lake Transit Authority,9R02-91053
899,-0.232455,79_left,0_right,Fresno Council of Governments,Fresno County Rural Transit Agency,9R02-91007
318,-0.22956,35_left,12_right,County of Shasta Department of Public Works,County of Sacramento Municipal Services Agency...,9R02-90216
487,-0.223472,50_left,6_right,Monterey-Salinas Transit,Tuolumne County Transit,9R02-91057
762,-0.214516,67_left,6_right,Stanislaus County Public Works - Transit Division,Tuolumne County Transit,9R02-91057
924,-0.18732,80_left,34_right,Klamath Trinity Non-Emergency Transportation​,Trinity County,9R02-91035


In [245]:
len(blackcat_ntd.loc[blackcat_ntd['best_match_score'] >  0.17 ])

38