In [1]:
import numpy as np
import pandas as pd
from siuba import *
from calitp import *
import intake
import data_prep
import fuzzymatcher
pd.options.display.max_rows = 250



# Script Testing
## Vehicles - Help
* Can't get grouping to work as an inner function

In [2]:
df = data_prep.load_grantprojects()
vehicles = data_prep.load_vehiclesdata()
organizations= data_prep.load_organizations_data()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [3]:
# Grouping vehicle types together: not working in function
Automobiles = ['Automobile','Automobiles (Service)','Sports Utility Vehicle']
Bus = ['Bus','Over-the-road Bus']
Vans = ['Van','Trucks and other Rubber Tire Vehicles (Service)','Minivan','Cutaway',]

In [4]:
def replace_modes(row):
        if row.vehicle_type in Automobiles:
            return "Rail"
        elif row.vehicle_type in Bus:
            return "Bus"
        else:
            return "Vans"
vehicles["vehicle_groups"] = vehicles.apply(lambda x: replace_modes(x), axis=1)

# Basic Facts about our data

### Vehicles 

In [5]:
f'{vehicles.agency.nunique()} total unique agencies'

'218 total unique agencies'

In [6]:
#subset for crosswalk
vehicles_crosswalk = vehicles[['agency','ntd_id']]
#drop duplicates for crosswalk
vehicles_crosswalk = vehicles_crosswalk.drop_duplicates()

### Black Cat data for only 5311 programs
* Most funding program is just section 5311
* What differentiates them?

In [7]:
df.funding_program.value_counts()

Section 5311       590
Section 5311(f)    109
5311(f) Cont        41
CMAQ (FTA 5311)     31
5311(f) Round 2     27
Name: funding_program, dtype: int64

In [8]:
f'a total of {df.organization_name.nunique()} unique agencies after filtering for 5311'

'a total of 88 unique agencies after filtering for 5311'

In [9]:
#subset for crosswalk
crosswalk_5311 = df[['organization_name']]
crosswalk_5311 = crosswalk_5311.drop_duplicates()

### GTFS Data
* Has NTD ID, ITP ID, and GTFS information
* After deleting all rows that do not have NTP IDs, rows went from 781 rows to 199
* Not every agency has a ITP id.
* So only 199 agencies have NTD IDS in the organizations data set.

In [10]:
organizations2 = organizations.dropna(subset=['ntd_id'])

In [11]:
organizations2.isna().sum()

agency                                    0
ntd_id                                    0
itp_id                                   12
gtfs_schedule_status                     18
#_services_w__complete_rt_status          0
#_fixed_route_services_w__static_gtfs     0
complete_static_gtfs_coverage__1=yes_     0
complete_rt_coverage                      0
>=1_gtfs_feed_for_any_service__1=yes_     0
>=_1_complete_rt_set__1=yes_              0
dtype: int64

In [12]:
f'a total of {organizations.agency.nunique()} unique agencies'

'a total of 781 unique agencies'

In [13]:
f'a total of {organizations2.agency.nunique()} unique agencies after only keeping agencies with NTD IDS'

'a total of 199 unique agencies after only keeping agencies with NTD IDS'

In [14]:
#subset for crosswalk
organizations_crosswalk = organizations2[['agency','ntd_id', 'itp_id']]

# Crosswalk Cal ITP with NTD
### Left join on Cal ITP's "organizations data set" with GTFS with vehicles
* Using left join, organization crosswalk on the L. Vehicles on the R. 
* Only 1/2 (26) of agencies in vehicles (total 52) appears in Cal ITP
* 173 of the records in organizations have Cal ITP ids but aren't in the vehicles dataset, which makes sense since these are tons of non-rural reporters in the organizations data set

In [15]:
df_joined1 = organizations_crosswalk.merge(vehicles_crosswalk.drop_duplicates(), on=['agency', 'ntd_id'],  how='left', validate = "1:1", indicator=True)

In [16]:
df_joined1._merge.value_counts()

left_only     173
both           26
right_only      0
Name: _merge, dtype: int64

### Filtering out for only agencies with both...

In [17]:
df_joined_both = df_joined1.loc[df_joined1['_merge'] == 'both']
df_joined_both=df_joined_both.drop_duplicates()

### Filtering out for agencies that appeared in organizations but not vehicles...

In [18]:
df_organizations_only = df_joined1.loc[df_joined1['_merge'] == 'left_only']

In [19]:
df_organizations_only

Unnamed: 0,agency,ntd_id,itp_id,_merge
0,Access Services,90157,1.0,left_only
1,Alameda-Contra Costa Transit District,90014,4.0,left_only
2,Alpine County,9R02-91116,9.0,left_only
4,Anaheim Transportation Network,90211,14.0,left_only
5,Antelope Valley Transit Authority,90121,16.0,left_only
6,Bishop Paiute Tribe,99268,40.0,left_only
7,Blue Lake Rancheria,99292,42.0,left_only
8,Butte County Association of Governments,90208,47.0,left_only
9,Calaveras County,9R02-91063,50.0,left_only
10,California Department of Transportation,9R02,,left_only


## Joining using vehicles on the left this time
* Still getting a strange even split of 26 agencies are only found in vehicles, 26 are found in both?
* The agencies in df_joined_both (organizations on the L and vehicles on the R) and df_joined_both2 (vehicles on the L and organizations on the R) match.

In [20]:
df_joined2 = vehicles_crosswalk.merge(organizations_crosswalk.drop_duplicates(), on=['agency','ntd_id'], validate = "m:1", how='left', indicator=True)
df_joined2._merge.value_counts()

left_only     192
both           26
right_only      0
Name: _merge, dtype: int64

In [21]:
df_joined_both2 = df_joined2.loc[df_joined2['_merge'] == 'both']

In [22]:
df_joined_both2.sort_values('agency')

Unnamed: 0,agency,ntd_id,itp_id,_merge
125,Amador Regional Transit System,9R02-91000,11.0,both
184,City of Arcata,9R02-91018,18.0,both
152,City of Arvin,9R02-91027,21.0,both
162,City of Auburn,9R02-91032,23.0,both
188,City of California City,9R02-91111,51.0,both
163,City of Dinuba,9R02-91040,93.0,both
185,City of Guadalupe,9R02-91043,129.0,both
209,City of McFarland,9R02-91110,197.0,both
200,City of Ojai,9R02-91058,231.0,both
161,City of Solvang,9R02-91028,312.0,both


## Going to try to use fuzzy matcher on this..
* 38 matches...out of the 52 rural agencies!

* Now I can look at NTD ID to make sure these really are the same agencies.

In [23]:
left_on = ["ntd_id"]
right_on = ["ntd_id"]

In [24]:
vehicles_organization = fuzzymatcher.fuzzy_left_join(vehicles_crosswalk, organizations_crosswalk, left_on, right_on)

In [25]:
vehicles_organization.sort_values('best_match_score')

Unnamed: 0,best_match_score,__id_left,__id_right,agency_left,ntd_id_left,agency_right,ntd_id_right,itp_id
573,0.023902,204_left,172_right,"Wasco, City of",9R02-99426,Shasta County,9R02-99438,
436,0.023902,160_left,2_right,"City of Ridgecrest, dba: Ridgerunner",9R02-91006,Alpine County,9R02-91116,9.0
166,0.023902,120_left,2_right,County of Sacramento Municipal Services Agency...,9R02-90216,Alpine County,9R02-91116,9.0
386,0.023902,156_left,172_right,Palo Verde Valley Transit Agency,9R02-99454,Shasta County,9R02-99438,
339,0.023902,155_left,172_right,Calaveras Transit Agency,9R02-99442,Shasta County,9R02-99438,
505,0.023902,182_left,2_right,Modoc Transportation Agency,9R02-91008,Alpine County,9R02-91116,9.0
281,0.023902,143_left,2_right,"Mariposa County Transit, dba: Mari-Go",9R02-91082,Alpine County,9R02-91116,9.0
227,0.023902,135_left,2_right,"City of Dixon, dba: Readi-Ride",9R02-91041,Alpine County,9R02-91116,9.0
91,0.023902,91_left,2_right,San Benito County LTA,9R02-91009,Alpine County,9R02-91116,9.0
149,0.079639,103_left,33_right,City of Carson,90258,City of Carson,90258,57.0


In [26]:
#cut off is City of Shafter dba Shafter Dial-A-Ride
#vehicles_organization2 = vehicles_organization2.loc[vehicles_organization2['best_match_score'] > 0.09]

In [27]:
#len(vehicles_organization2)

In [28]:
#full_join(vehicles_crosswalk, organizations_crosswalk, on = "ntd_id")
test_join = vehicles_crosswalk.merge(organizations_crosswalk, on="ntd_id", how="outer", indicator=True)
test_join>>count(_._merge)

Unnamed: 0,_merge,n
0,left_only,174
1,right_only,154
2,both,45


In [29]:
test_join>>filter(_._merge=='left_only')

Unnamed: 0,agency_x,ntd_id,agency_y,itp_id,_merge
0,Los Angeles County Metropolitan Transportation...,90154,,,left_only
1,Orange County Transportation Authority,90036,,,left_only
2,Access Services,90157,,,left_only
3,"City and County of San Francisco, dba: San Fra...",90015,,,left_only
4,San Diego Metropolitan Transit System,90026,,,left_only
5,"California Vanpool Authority, dba: CalVans",90230,,,left_only
6,"Alameda-Contra Costa Transit District, dba: AC...",90014,,,left_only
7,San Diego Association of Governments,90095,,,left_only
8,Santa Clara Valley Transportation Authority,90013,,,left_only
9,San Francisco Bay Area Rapid Transit District,90003,,,left_only


In [30]:
vehicles_crosswalk>>filter(_.agency.str.contains("City of Dixon"))

Unnamed: 0,agency,ntd_id
5149,"City of Dixon, dba: Readi-Ride",9R02-91041


In [31]:
organizations_crosswalk>>filter(_.agency.str.contains('City of Dixon'))

Unnamed: 0,agency,ntd_id,itp_id
156,City of Dixon,9R02-91040,94.0


In [32]:
test_crosswalk_both = (test_join>>filter(_._merge=='both')>>select(_.agency_x, _.ntd_id, _.itp_id))

In [33]:
len(test_crosswalk_both)

45

In [34]:
test_crosswalk_both.to_parquet("test_crosswalk_both.parquet")

In [35]:
testpq = pd.read_parquet("test_crosswalk_both.parquet")

In [36]:
len(testpq)

45

### Filtering out for bad match scores (stuff lower than 0.02) & clearly incorrect agencies with 'high' match scores.

In [37]:
#stuff to filter out before cutting off at scores below 0.02
subset = ['Plumas County Transportation Commission', 'Calaveras Transit Agency', 'Alpine County Local Transportation Commission','Kern Regional Transit','City of Dixon, dba: Readi-Ride','Kern Regional Transit']

In [38]:
#filter out a bunch of stuff all at once
vehicles_organization2 = vehicles_organization[~vehicles_organization.agency_left.isin(subset)]

In [39]:
#cut off is City of Shafter dba Shafter Dial-A-Ride
vehicles_organization2 = vehicles_organization2.loc[vehicles_organization2['best_match_score'] > 0.02]

In [40]:
vehicles_organization2.sort_values('best_match_score')

Unnamed: 0,best_match_score,__id_left,__id_right,agency_left,ntd_id_left,agency_right,ntd_id_right,itp_id
573,0.023902,204_left,172_right,"Wasco, City of",9R02-99426,Shasta County,9R02-99438,
281,0.023902,143_left,2_right,"Mariposa County Transit, dba: Mari-Go",9R02-91082,Alpine County,9R02-91116,9.0
91,0.023902,91_left,2_right,San Benito County LTA,9R02-91009,Alpine County,9R02-91116,9.0
436,0.023902,160_left,2_right,"City of Ridgecrest, dba: Ridgerunner",9R02-91006,Alpine County,9R02-91116,9.0
386,0.023902,156_left,172_right,Palo Verde Valley Transit Agency,9R02-99454,Shasta County,9R02-99438,
166,0.023902,120_left,2_right,County of Sacramento Municipal Services Agency...,9R02-90216,Alpine County,9R02-91116,9.0
505,0.023902,182_left,2_right,Modoc Transportation Agency,9R02-91008,Alpine County,9R02-91116,9.0
0,0.079639,0_left,135_right,Los Angeles County Metropolitan Transportation...,90154,Los Angeles County Metropolitan Transportation...,90154,182.0
161,0.079639,115_left,85_right,City of Porterville,90198,City of Porterville,90198,256.0
162,0.079639,116_left,49_right,"City of El Monte, dba: City of El Monte Transp...",90265,City of El Monte,90265,102.0


In [41]:
len(vehicles_organization2)

189

### List of agencies that are totally wrong
* Agencies in this list:
1. ['Plumas County Transportation Commission', 
  2. 'Calaveras Transit Agency',
  3.'Alpine County Local Transportation Commission',
  4. 'Kern Regional Transit'
  5.'City of Dixon, dba: Readi-Ride',
  6.'Kern Regional Transit']
  7.'San Benito County LTA',
  8. 'County of Sacramento Municipal Services Agency Department of Transportation',
  9.'Mariposa County Transit, dba: Mari-Go',
 10. 'Palo Verde Valley Transit Agency',
 11. 'City of Ridgecrest, dba: Ridgerunner',
 12. 'Glenn Transit Service',
 13. 'Modoc Transportation Agency',
 14. 'City of Escalon, dba: eTrans',
 15. 'Wasco, City of'

In [42]:
vehicles_organizations_exclude = vehicles_organization.loc[vehicles_organization['best_match_score'] < 0.02]

In [43]:
vehicles_organizations_exclude.agency_left.tolist()

[]

### Prepping Crosswalk (Vehicles on L, GTFS on R using a L join) to look at 5311 Black Cat

In [44]:
#cleaning up
vehicles_organization3 = vehicles_organization2.drop(columns=['best_match_score', 'agency_right', 'ntd_id_right','__id_left','__id_right','best_match_score'])

In [45]:
#rename columns
vehicles_organization3 = vehicles_organization3.rename(columns = {'agency_left':'agency'})

In [46]:
len(vehicles_organization3)

189

In [47]:
vehicles_organization3.head(2)

Unnamed: 0,agency,ntd_id_left,itp_id
0,Los Angeles County Metropolitan Transportation...,90154,182.0
1,Orange County Transportation Authority,90036,235.0


# 5311 Data Crosswalk Test with the stuff that matched between Cal ITP & NTD
* Comparing the agency names in 5311 against Vehicle Data to ensure everything is the same using fuzzy matcher https://pbpython.com/record-linking.html

In [48]:
left_on = ["organization_name"]
right_on = ["agency"]

### Looking at best match score...
* Only 35 matches....out of 88? 
* Either use dictionary or merge black cat back in
* Filter out for left only in organization crosswalk

In [49]:
blackcat_vehicles_calitp = fuzzymatcher.fuzzy_left_join(crosswalk_5311, vehicles_organization3, left_on, right_on)

In [50]:
#stuff to filter out before cutting off at scores...
subset2 = ['Plumas County Transportation Commission', 'Imperial County Transportation Commission', 'Glenn County Transportation Commission','Tehama County Transit Agency','Tuolumne County Transit Agency (TCTA)']

In [51]:
#filter out a bunch of stuff all at once
matched_results2 = blackcat_vehicles_calitp[~blackcat_vehicles_calitp.organization_name.isin(subset2)]

In [52]:
#filter out for anything with a score below 0.208 which cuts off at Eureka Transit Service
matched_results2 = matched_results2.loc[matched_results2['best_match_score'] > 0.06 ]

In [53]:
len(matched_results2)

71

In [54]:
matched_results2.sort_values('organization_name')

Unnamed: 0,best_match_score,__id_left,__id_right,organization_name,agency,ntd_id_left,itp_id
617,0.104681,77_left,114_right,Amador Transit,Amador Regional Transit System,9R02-91000,11.0
10,0.689259,10_left,51_right,Butte County Association of Governments/ Butte...,Butte County Association of Governments,90208,47.0
74,0.249936,11_left,188_right,Calaveras County Public Works,County of Shasta Department of Public Works,9R02-99438,
700,0.113967,84_left,15_right,Calaveras Transit Agency,Riverside Transit Agency,90031,269.0
4,0.3111,4_left,168_right,City of Arcata,City of Arcata,9R02-91018,18.0
76,0.327004,12_left,139_right,City of Arvin,City of Arvin,9R02-91027,21.0
77,0.327004,13_left,147_right,City of Auburn,City of Auburn,9R02-91032,23.0
78,0.275127,14_left,172_right,City of California City,City of California City,9R02-91111,51.0
0,0.264441,0_left,182_right,City of Chowchilla,"City of Chowchilla, dba: Chowchilla Area Transit",9R02-91071,65.0
619,0.460159,78_left,138_right,City of Corcoran - Corcoran Area Transit,"City of Corcoran, dba: Corcoran Area Transit",9R02-91002,78.0


### Looking at the agencies that had super low match scores
* In addition to stuff in subset2 (5)
* 47 agencies were cut off due to super low scores. 


In [55]:
#filter out for anything with a score below 0.208 which cuts off at Eureka Transit Service
len(matched_results.loc[matched_results['best_match_score'] <  0.06 ])

NameError: name 'matched_results' is not defined

In [None]:
matched_results.loc[matched_results['best_match_score'] <  0.06 ]

### Looking at joining 5311 Black Cat with NTD data directly...
* Fair number of right matches with low match scores...
* only 38 records with good match scores...

In [None]:
left_on = ["agency"]
right_on = ["organization_name"]

In [None]:
blackcat_ntd = fuzzymatcher.fuzzy_left_join(vehicles_crosswalk,crosswalk_5311, left_on, right_on)

In [None]:
blackcat_ntd.sort_values('best_match_score')

In [None]:
len(blackcat_ntd.loc[blackcat_ntd['best_match_score'] >  0.17 ])