In [1]:
import pandas as pd
import numpy as np
from calitp import to_snakecase

from siuba import *

from IPython.display import display, Markdown

In [2]:
# Read the organizations export from the GCS bucket and pass it
# through to_snakecase before any further work.
df = to_snakecase(
    pd.read_csv(
        "gs://calitp-analytics-data/data-analyses/gtfs_compliance/organizations-AllOrganizations.csv"
    )
)

In [3]:
# Show the column names of the loaded table (after to_snakecase) for reference
df.columns

Index(['name', 'organization_type', 'roles', 'record_creation_time', 'ntp_id',
       'itp_id', 'opm_id_drmt', 'dotid', 'brand', 'alias', 'details',
       'website', 'parent_organization', 'administrating_organization',
       'mobility_services_managed', 'missing_static',
       'funding_sources_for_managed_transportation',
       'mobility_services_operated', 'gtfs_datasets_produced',
       'service_type__from_mobility_services_managed_',
       'currently_operating__from_mobility_services_managed_',
       'currently_operating__from_mobility_services_operated_',
       'service_type__from_mobility_services_operated_', 'headquarters_place',
       'funding_programs', 'total_voms__ntd_', 'service_area_sq_miles__ntd_',
       'service_area_population__ntd_', 'caltrans_district', 'mpo_rtpa',
       'planning_authority', 'tracking_category', 'reporting_category',
       'assist_category', 'eligibility_programs', 'gtfs_datasets',
       'gtfs_dataset__from_mobility_services_managed_', '

In [4]:
# Columns describing the services an organization manages.
# 'currently_operating__from_mobility_services_managed_' is left out for now;
# open question: do you need to keep info such as "check 3 out of 3"?
managed_service_cols = [
    "mobility_services_managed",
    "service_type__from_mobility_services_managed_",
]

# Columns describing the services an organization operates.
# 'currently_operating__from_mobility_services_operated_' is left out as well.
operated_service_cols = [
    "mobility_services_operated",
    "service_type__from_mobility_services_operated_",
]

In [5]:
def make_long(df, keep_cols = None, category_name = "managed"):
    """
    Reshape the organizations table to one row per (organization, mobility service).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "name", "gtfs_schedule_status" and every column
        listed in `keep_cols`.
    keep_cols : list-like of str, optional
        Extra columns to carry along. It is expected to contain one column
        whose name starts with "mobility_services" and one whose name starts
        with "service_type"; they are renamed to those short names.
    category_name : str
        Value written to the new "category" column, used to tell the
        managed / operated sub-categories apart.

    Returns
    -------
    pandas.DataFrame
        Columns: name, gtfs_schedule_status, mobility_services,
        service_type, category. The input index is preserved, so it repeats
        for organizations that have several services. Missing
        mobility_services values become the string "None".
    """
    id_cols = ["name", "gtfs_schedule_status"]
    # None instead of a mutable [] default; list() also lets callers pass
    # a tuple or an Index.
    extra_cols = [] if keep_cols is None else list(keep_cols)

    subset = df[id_cols + extra_cols]

    # Collapse the long source column names to a common short name so the
    # managed and operated outputs share the same schema.
    for short_name in ["mobility_services", "service_type"]:
        subset = subset.rename(
            columns=lambda c: short_name if c.startswith(short_name) else c)

    subset = subset[id_cols + ["mobility_services", "service_type"]]

    # Turn the comma-separated string into a list of service names.
    # Strip each piece so "A, B" yields "B" rather than " B".
    with_lists = subset.assign(
        mobility_services = (subset.mobility_services.fillna("None")
                             .apply(lambda x: [s.strip() for s in x.split(",")])
                            )
    )

    # Cannot add service_type to explode
    # Service types do not match mobility_services (1 service can provide both flex and fixed route)
    # That's fine, maybe service type is not too important to keep anyway
    long_df = with_lists.explode("mobility_services")

    # New variable to track the category name.
    # Within mobility services, there are 2 sub-categories, managed / operated
    long_df = long_df.assign(
        category = category_name
    )

    return long_df

In [6]:
# Long table of the services each organization manages
managed = make_long(
    df,
    keep_cols=managed_service_cols,
    category_name="managed",
)
managed

Unnamed: 0,name,gtfs_schedule_status,mobility_services,service_type,category
0,A-Paratransit,,,,managed
1,ABC Shuttle,needed,ABC Airport Shuttle,on-demand,managed
2,Able Inc.,,Able Inc.,on-demand,managed
3,Abrazar Inc.,needed,Abrazar,NEMT,managed
4,Access Services,needed,LA Access Services,ADA paratransit,managed
...,...,...,...,...,...
849,Yuba-Sutter Transit Authority,"ok,needed,needed",Sacramento Express,"fixed-route,on-demand,deviated fixed-route,res...",managed
849,Yuba-Sutter Transit Authority,"ok,needed,needed",Yuba College Sutter Campus Shuttle,"fixed-route,on-demand,deviated fixed-route,res...",managed
850,Yuma County Intergovernmental Public Transport...,ok,Yuma County Area Transit,fixed-route,managed
851,Yurok Tribe,needed,Yurok Tribe Transit Service,"on-demand,reservations",managed


In [7]:
# Long table of the services each organization operates
operated = make_long(
    df,
    keep_cols=operated_service_cols,
    category_name="operated",
)
operated

Unnamed: 0,name,gtfs_schedule_status,mobility_services,service_type,category
0,A-Paratransit,,East Bay Paratransit,ADA paratransit,operated
1,ABC Shuttle,needed,ABC Airport Shuttle,on-demand,operated
2,Able Inc.,,Able Inc.,on-demand,operated
3,Abrazar Inc.,needed,Abrazar,NEMT,operated
4,Access Services,needed,LA Access Services,ADA paratransit,operated
...,...,...,...,...,...
849,Yuba-Sutter Transit Authority,"ok,needed,needed",Yuba-Sutter Dial-A-Ride,"fixed-route,on-demand,deviated fixed-route,res...",operated
849,Yuba-Sutter Transit Authority,"ok,needed,needed",Sacramento Express,"fixed-route,on-demand,deviated fixed-route,res...",operated
850,Yuma County Intergovernmental Public Transport...,ok,Yuma County Area Transit,fixed-route,operated
851,Yurok Tribe,needed,Yurok Tribe Transit Service,"on-demand,reservations",operated


In [8]:
# Spot-check: operated rows for organizations whose name contains "Anaheim"
operated.loc[operated["name"].str.contains("Anaheim")]

Unnamed: 0,name,gtfs_schedule_status,mobility_services,service_type,category
25,Anaheim Transportation Network,"ok,needed,needed",Anaheim Resort Transportation,"fixed-route,on-demand,ADA paratransit",operated
25,Anaheim Transportation Network,"ok,needed,needed",FRAN,"fixed-route,on-demand,ADA paratransit",operated
25,Anaheim Transportation Network,"ok,needed,needed",Anaheim Transportation Network Paratransit,"fixed-route,on-demand,ADA paratransit",operated
106,City of Anaheim,needed,Senior Wheels,"reservations,on-demand",operated


In [9]:
# Spot-check: operated rows for organizations whose name contains "Yuba"
operated.loc[operated["name"].str.contains("Yuba")]

Unnamed: 0,name,gtfs_schedule_status,mobility_services,service_type,category
849,Yuba-Sutter Transit Authority,"ok,needed,needed",Yuba-Sutter Transit,"fixed-route,on-demand,deviated fixed-route,res...",operated
849,Yuba-Sutter Transit Authority,"ok,needed,needed",Yuba-Sutter Rural Transit,"fixed-route,on-demand,deviated fixed-route,res...",operated
849,Yuba-Sutter Transit Authority,"ok,needed,needed",Yuba-Sutter Dial-A-Ride,"fixed-route,on-demand,deviated fixed-route,res...",operated
849,Yuba-Sutter Transit Authority,"ok,needed,needed",Sacramento Express,"fixed-route,on-demand,deviated fixed-route,res...",operated


In [10]:
# An organization can produce just 1 GTFS dataset, and it seems that
# gtfs_schedule_status also does not move closely with
# service_type_mobility_services_managed or mobility_services_managed,
# so gtfs_schedule_status is left unexploded.

df.loc[
    df["name"].str.contains("Anaheim"),
    ["name", "gtfs_schedule_status", "gtfs_dataset__from_mobility_services_managed_"],
]

Unnamed: 0,name,gtfs_schedule_status,gtfs_dataset__from_mobility_services_managed_
25,Anaheim Transportation Network,"ok,needed,needed",Anaheim Resort Schedule
106,City of Anaheim,needed,
