# 5311 and 5310 Applicant Payment First Look
* An agency in this analysis is a recipient of 5311/5310/or both funds

In [1]:
import calitp.magics
import pandas as pd
import utils
from calitp import *
from calitp.tables import tbl
from siuba import *

pd.options.display.max_columns = 100
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)



## Black Cat

In [2]:
# Read in Black Cat: 5311 and 5310 recipients with projects in past 2 years
df_bc = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/5311-5310/5311_5310_applicants.parquet",
    engine="auto",
)

In [3]:
# Clean up organization names
df_bc = utils.organization_cleaning(df_bc, "organization_name")

In [4]:
df_bc["organization_name"].sort_values().unique().tolist()

['ARC Imperial Valley',
 'Alegria Community Living',
 'Alpine County Community Development',
 'Amador Transit',
 'Angel View',
 'Area 1 Agency on Aging',
 'Asian Community Center of Sacramento Valley',
 'Bay Area Outreach & Recreation Program',
 'Butte County Association of Governments Butte Regional Transit',
 'Calaveras Transit Agency',
 'Care-A-Van Transit Systems',
 'Casa Allegra Community Services',
 'Catholic Charities of the Diocese of Stockton',
 "Center for Elders' Independence",
 'Choice in Aging',
 'City of Arcata',
 'City of Arvin',
 'City of Auburn',
 'City of California City',
 'City of Chowchilla',
 'City of Corcoran - Corcoran Area Transit',
 'City of Dinuba',
 'City of Dixon',
 'City of Escalon',
 'City of Guadalupe',
 'City of Lafayette: Lamorinda Sprit Van Program',
 'City of McFarland',
 'City of Needles',
 'City of Ojai',
 'City of Porterville',
 'City of Ridgecrest',
 'City of Rio Vista',
 'City of Santa Maria',
 'City of Shafter',
 'City of Solvang',
 'City of Ta

## Airtable - Transit Stacks, Service Components

In [5]:
%%sql -o df_service_components
SELECT 
  service_name,
  product_name,
  component_name,
  service_key
FROM cal-itp-data-infra.mart_transit_database.dim_service_components


Unnamed: 0,service_name,product_name,component_name,service_key
0,Nevada County Connects,Swiftly Metronome,,recgbVdL7La5rdtjn
1,Lassen Rural Bus,The Routing Company: Unspecified,On-Demand Dispatch,recnOfaSplJ7hsriw
2,Caltrain,ARINC PADS,AVL Software,recFFwRfkXJ2BaUp5
3,Caltrain Shuttles,ARINC PADS,AVL Software,reciQaJ0Ki731oW3k
4,Clovis Transit Stageline,Zonar (Unspecified Model),AVL Software,recGYl7CoYON7V2o7
5,Clovis Transit Roundup,Zonar (Unspecified Model),AVL Software,recN2hQWPIGxvlNZs
6,Muni Metro,Conduent (Unspecified Model),AVL Software,recl5UWS5A6LS8cbP
7,NCTD Sprinter,Conduent (Unspecified Model),AVL Software,rec6leA1oWCXp58pt
8,NCTD BREEZE,Conduent (Unspecified Model),AVL Software,recEh6oiAEbuqi6EP
9,NCTD Coaster,Conduent (Unspecified Model),AVL Software,recR6xLGedA7z4YUF


In [6]:
# df_service_components["service_components_service_name"].sort_values().unique()

In [7]:
# Add prefix
df_service_components = df_service_components.add_prefix("service_components_")

In [8]:
# Clean organization name
df_service_components = utils.organization_cleaning(
    df_service_components, "service_components_service_name"
)

In [9]:
# df_service_components["service_components_service_name"].sort_values().unique().tolist()

In [10]:
# Clean up Transit Stacks names to match blackcat
df_service_components["service_components_service_name"] = df_service_components[
    "service_components_service_name"
].replace(
    {
        "Solano Express": "Solano Tranportation Authority",
        "Santa Cruz METRO": "Santa Cruz Metropolitan Transit District",
        "MTS Bus": "San Diego Metropolitan Transit System",
        "VTA Bus": "Santa Clara Valley Transportation Authority",
        "Redwood Coast Transit": "Redwood Coast Transit Authority",
        "Mountain Transit": "Mountain Area Regional Transit Authority",
        "Marin Transit": "Marin County Transit District",
        "Visalia Transit": "City of Visalia",
        "Lassen Rural Bus": "Lassen Transit Service Agency",
        "Ojai Trolley": "City of Ojai",
        "Siskiyou Transit and General Express": "County of Siskiyou",
        "Madera County Connection": "Madera County",
        "Tulare County Area Transit": "Tulare County Regional Transportation Agency",
        "Trinity County Department of Transportation": "Trinity County Department of Transportation",
        "Santa Maria Area Transit": "City of Santa Maria",
        "Tuolumne County Transit Agency": "Tuolumne County Transit Agency",
        "OmniTrans": "Omnitrans",
        "Eastern Sierra Transit Authority Community Routes": "Eastern Sierra Transit Authority",
        "Fresno County Rural Transit": "Fresno County Rural Transit Agency",
        "El Dorado Transit": "El Dorado County Transit Authority",
        "Palos Verdes Peninsula Transit Authority": "Palo Verde Valley Transit Agency",
        "Kern Transit": "Kern Regional Transit",
        "Riverside Transit Agency": "Riverside Transit",
        "Tuolumne County Transit": "Tuolumne County Transit Agency",
        "Yuba-Sutter Transit": "Yuba-Sutter Transit Authority",
        "County of Sonoma": "Sonoma County Transit",
        "SunLine Transit": "Sunline Transit Agency",
        "Trinity Transit": "Trinity County Department of Transportation",
        "Yuba-Sutter Transit": "Yuba-Sutter Transit Authority",
    }
)

#### To make things a little clearer: have 2 columns for fare related products/comps and 2 columns for CAD/AVL related products/comps

In [11]:
# List of fare
fare_comps = [
    "Mobile ticketing",
    "Cash Farebox",
    "Fare card system",
    "Ticket Vending Machines",
    "Contactless Payment Validators",
    "Payment processor",
]

In [12]:
# Subset dataframe for fare related components
# This function grabs all the different components/products a company purchases and places it on one line.
df_fare_comps = utils.service_comps_summarize(df_service_components, fare_comps)

In [13]:
df_fare_comps = df_fare_comps.rename(
    columns={
        "service_components_component_name": "service_components_fare_component",
        "service_components_product_name": "service_components_fare_product",
    }
)

In [14]:
# Subset out for CAD/AVL
cad_avl_comps = [
    "AVL Software",
    "Location Sensors",
    "AVL On-board Computer",
]

In [15]:
# Subset dataframe for cad/avl
df_cad_avl = utils.service_comps_summarize(df_service_components, cad_avl_comps)

In [16]:
df_cad_avl = df_cad_avl.rename(
    columns={
        "service_components_component_name": "service_components_AVL_CAD_component",
        "service_components_product_name": "service_components_AVL_CAD_product",
    }
)

In [17]:
df_cad_avl["service_components_service_name"].nunique(), df_fare_comps[
    "service_components_service_name"
].nunique()

(111, 132)

### Merge Fare Components with Black Cat: M1

In [18]:
# Merge transit stacks with BC
m1 = pd.merge(
    df_bc,
    df_fare_comps,
    how="left",
    left_on=["organization_name"],
    right_on=["service_components_service_name"],
)

In [19]:
m1.shape

(177, 5)

In [20]:
len(m1)

177

In [21]:
m1 = m1.drop(columns=["service_components_service_name"])

### Merge AVL/CAD Components 

In [22]:
# Merge BC with fare info along with AVL/CAD info
m1 = pd.merge(
    m1,
    df_cad_avl,
    how="left",
    left_on=["organization_name"],
    right_on=["service_components_service_name"],
)

In [23]:
# Merge in service keys which got lost somewhere...
m1 = pd.merge(
    m1,
    df_service_components,
    how="left",
    on="service_components_service_name",
)

In [24]:
m1 = m1.drop(
    columns=[
        "service_components_service_name",
        "service_components_component_name",
        "service_components_product_name",
    ]
)

In [25]:
m1 = m1.drop_duplicates()

In [26]:
m1.shape

(177, 7)

## Airtable - Fare Systems 
* Using CSV for now since the table isn't in the warehouse yet.

In [27]:
df_fare = to_snakecase(
    pd.read_csv(
        "gs://calitp-analytics-data/data-analyses/5311-5310/fare systems-Grid view.csv"
    )
)

In [28]:
"""
%%sql -o df_fare
SELECT 
  service_name,
  product_name,
  component_name
FROM cal-itp-data-infra.mart_transit_database.dim_service_components
"""

'\n%%sql -o df_fare\nSELECT \n  service_name,\n  product_name,\n  component_name\nFROM cal-itp-data-infra.mart_transit_database.dim_service_components\n'

In [29]:
# Only grab columns I'm interested in
cols_wanted = [
    "fare_system",
    "electronic_fare_program",
    "ticket_media",
    "payment_accepted",
]

In [30]:
# Subset dataframe
df_fare2 = df_fare[cols_wanted]

In [31]:
# Keep only the rows with at least 2 non-NA values.
df_fare3 = df_fare2.dropna(thresh=2)

In [32]:
# Compare the two dataframes
df_fare2.shape, df_fare3.shape

((394, 4), (166, 4))

In [33]:
# Add prefix
df_fare3 = df_fare3.add_prefix("fare_systems_")

In [34]:
# Clean organization name
df_fare3 = utils.organization_cleaning(df_fare3, "fare_systems_fare_system")

In [35]:
# df_fare3["fare_systems_fare_system"].sort_values().unique().tolist()

In [36]:
# Clean up names to match Fare Systems
df_fare3["fare_systems_fare_system"] = df_fare3["fare_systems_fare_system"].replace(
    {
        "Arcata and Mad River Transit System": "City of Arcata",
        "Arvin Transit": "City of Arvin",
        "Auburn Transit": "City of Arvin",
        "Guadalupe Flyer": "City of Guadalupe",
        "Plumas Transit Systems": "Plumas County Transportation Commission",
        "Ojai Trolley": "City of Ojai",
        "Needles Area Transit": "City of Needles",
        "Porterville Transit": "City of Porterville",
        "Ridgecrest Transit": "City of Ridgecrest",
        "Rio Vista Delta Breeze": "City of Rio Vista",
        "Madera County Connection": "Madera County",
        "SolTrans": "Solano Tranportation Authority",
        "Siskiyou Transit and General Express": "County of Siskiyou",
        "Mountain Transit": "Mountain Area Regional Transit Authority",
        "Plumas Transit Systems": "Plumas County Transportation Commission",
        "San Benito County Express": "San Benito County Local Transportation Authority",
        "Sage Stage": "Modoc Transportation Agency",
        "Vine Transit": "Napa Valley Transportation Authority",
        "San Diego Metropolitan Transit System": "San Diego Metropolitan Transit System",
        "Stanislaus Regional Transit": "Stanislaus Regional Transit Authority",
        "Amador Regional Transit System": "Amador Transit",
        "Calaveras Transit Agency": "Calaveras Transit",
        "Eastern Sierra Transit Authority": "Eastern Sierra Transit Authority Community Routes",
        "Fresno County Rural Transit Agency": "Fresno County Rural Transit",
        "Marin Transit": "Marin County Transit District",
        "Redding Area Bus Authority": "Redwood Coast Transit Authority",
        "SunLine Transit Agency": "Sunline Transit Agency",
        "Victor Valley Transit": "Victor Valley Transit Authority",
        "Yolobus": "Yolo County Transportation District",
        "Corona Cruiser": "City of Corcoran - Corcoran Area Transit",
        "Dinuba Area Regional Transit": "City of Dinuba",
        "Eastern Sierra Transit Authority Community Routes": "El Dorado County Transit Authority",
        "El Dorado Transit": "El Dorado County Transit Authority",
        "Fresno County Rural Transit": "Fresno County Rural Transit Agency",
        "Glenn Transit Service": "Glenn County Transportation Commission",
        "Kern Transit": "Kern Regional Transit",
        "Lake Transit": "Lake Transit Authority",
        "Merced The Bus": "Transit Joint Powers Authority for Merced County",
        "OmniTrans": "Omnitrans",
        "Yolo County Transportation District": "Yolo County Transportation District",
        "Visalia Transit": "City of Visalia",
    }
)

#### Merge 2: Fare-Systems with BlackCat/Transit Stacks

In [37]:
# Merge faresystems with m1
m2 = pd.merge(
    m1,
    df_fare3,
    how="left",
    left_on=["organization_name"],
    right_on=["fare_systems_fare_system"],
    indicator=True,
)

In [38]:
m2["_merge"].value_counts()

left_only     130
both           48
right_only      0
Name: _merge, dtype: int64

In [39]:
# Drop unwanted cols
m2 = m2.drop(columns=["fare_systems_fare_system", "_merge"])

In [40]:
len(m2)

178

In [41]:
# m2.sort_values("organization_name")

## Airtable - Service Types & GTFS Status
* Bring in organizations table
* Merge it with bridge table
* Take the merge from above (Black Cat + transit stacks + fare systems) and merge it with services table
* Now the merge has multiple rows for the same organization, as an organization can run multiple services
* Take the merge and aggregate, so only one organization will have one row


### Airtable - Organizations

In [42]:
%%sql -o df_orgs
SELECT 
  CAST(itp_id AS INT) AS itp_id,
  name,
  gtfs_realtime_status,
  key
FROM cal-itp-data-infra.mart_transit_database.dim_organizations

Unnamed: 0,itp_id,name,gtfs_realtime_status,key
0,,Trinity County Transportation Commission,RT Incomplete,recZlzHonm4IQQLW8
1,385.0,San Bernardino County Transportation Authority,RT Incomplete,recQp6uQAUFcenmQ6
2,,Mono County Local Transportation Commission,RT Incomplete,recgOXCxXWr2RENC2
3,,"American Logistics Company, LLC",RT Incomplete,recBki7jgVpdxL6VE
4,,Westminster on Wheels Senior Transportation Program,RT Incomplete,recJkPUGIejdSZEXE
5,,Humboldt County Association of Governments,RT Incomplete,recgynh3mWqnrE0Ot
6,,Castle Rock Associates,RT Incomplete,recWhEjREaQZIZbQr
7,,ACE Parking,RT Incomplete,rec5iC2tEK0ynsSRj
8,,Vivalon Inc.,RT Incomplete,recnwvbbkSUbld9Lk
9,242.0,Paratransit Inc.,RT Incomplete,recAwVdogtIsOLN9Q


In [43]:
# Clean up org names
df_orgs = utils.organization_cleaning(df_orgs, "name")

In [44]:
# Add prefix
df_orgs = df_orgs.add_prefix("airtable_orgs_")

In [45]:
# df_orgs["airtable_orgs_name"].sort_values().unique().tolist()

In [46]:
# Clean up names to match m2
df_orgs["airtable_orgs_name"] = df_orgs["airtable_orgs_name"].replace(
    {
        "City of Tulare": "Tulare County Regional Transportation Agency",
        "Glenn County": "Glenn County Transportation Commission",
        "Plumas County": "Plumas County Transportation Commission",
        "Solano County Transit": "Solano Tranportation Authority",
        "Sonoma County": "Sonoma County Transit",
        "Trinity County": "Trinity County Department of Transportation",
        "SunLine Transit Agency": "Sunline Transit Agency",
        "OmniTrans": "Omnitrans",
        "Lutheran Social Services": "Lutheran Social Services of Southern California",
        "Marin County Transit District": "Marin Transit",
        "Greyhound": "Greyhound Lines",
        "Calaveras County": "Calaveras Transit Agency",
        "Dignity Health": "Dignity Health Connected Living",
        "Butte County Association of Governments": "Butte County Association of Governments Butte Regional Transit",
        "City of Corcoran": "City of Corcoran - Corcoran Area Transit",
        "City of Lafayette": "City of Lafayette: Lamorinda Sprit Van Program",
        "Sonoma County": "County of Sonoma",
        "Siskiyou County": "County of Siskiyou",
        "Tulare County": "Tulare County Regional Transportation Agency",
        "Nevada County": "County of Nevada Public Works",
        "Shasta County": "County of Shasta Department of Public Works",
        "El Dorado County Transit Authority": "El Dorado Transit",
        "Klamath Trinity Non-Emergency Transportation": "Klamath Trinity Non-Emergency Transportation\u200b",
        "Livermore  Amador\n  Valley Transit Authority": "Livermore Amador Valley Transit Authority",
        "Marin County Transit District": "Marin Transit",
        "Modoc Transportation Authority": "Modoc Transportation Agency",
        "On Lok": "On Lok Senior Health Services",
        "Placer County": "Placer County Public Works ",
        "Stanislaus County": "Stanislaus County Public Works - Transit Division",
        "Tehama County": "Tehama County Transit Agency",
        "Tuolumne County Transit Agency": "Tuolumne County Transit",
        "Victor Valley Transit Authority": "Victor Valley Transit",
        "Vivalon Inc.": "Vivalon",
        "Yurok Tribe": "Yurok Tribe Transit",
        "Amador County": "Amador Transit",
    }
)

In [47]:
df_orgs.head(2)

Unnamed: 0,airtable_orgs_itp_id,airtable_orgs_name,airtable_orgs_gtfs_realtime_status,airtable_orgs_key
0,,Trinity County Transportation Commission,RT Incomplete,recZlzHonm4IQQLW8
1,385.0,San Bernardino County Transportation Authority,RT Incomplete,recQp6uQAUFcenmQ6


In [48]:
# Make sure each row is unique
len(df_orgs), df_orgs["airtable_orgs_key"].nunique()

(858, 858)

### Airtable - Bridge

In [49]:
%%sql -o df_bridge
SELECT 
  service_key,
  service_name,
  organization_name,
  organization_key
FROM cal-itp-data-infra.mart_transit_database.bridge_organizations_x_services_managed

Unnamed: 0,service_key,service_name,organization_name,organization_key
0,recnctrE21QdLyEgp,CPMC Shuttle,California Pacific Medical Center,rec00M4y4HSwqPuwg
1,recdvjj3dJVM7g7K7,Long Beach Transit,Long Beach Transit,rec00qSzZL8KqiXAo
2,recpOMMly6rOyqvNc,Long Beach Dial-A-Lift,Long Beach Transit,rec00qSzZL8KqiXAo
3,reckICimEwqMRVP5k,UCLA/Westwood Commuter Express,Long Beach Transit,rec00qSzZL8KqiXAo
4,rec4xLfuGtCQ5kwrI,AquaLink,Long Beach Transit,rec00qSzZL8KqiXAo
5,recvEneClXbNj0HU6,Galaxy Express,Long Beach Transit,rec00qSzZL8KqiXAo
6,recWY1teRovHVinOo,Lynwood Breeze,City of Lynwood,rec0FfOvKIMZu1Qjs
7,recZ62py4jtDtdknV,Angel Island-Tiburon Ferry,Angel Island-Tiburon Ferry Company,rec0HI0gloltUftYg
8,recMHS8KoM0OQoygM,Central Coast LAX Shuttle,Central Coast Shuttle Services Inc.,rec0JkxuORpfgmFci
9,recEwt7QmVNf5Oqe1,Porterville Sheltered Workshop Transportation Services,Porterville Sheltered Workshop,rec0MHEjEQ4MLINEG


In [50]:
# Add prefix
df_bridge = df_bridge.add_prefix("airtable_bridge_")

#### Merge df_orgs with df_bridge 

In [51]:
# 1:m since one org can have many services
airtable1 = pd.merge(
    df_orgs,
    df_bridge,
    how="left",
    left_on=["airtable_orgs_key"],
    right_on=["airtable_bridge_organization_key"],
    validate="1:m",
)

In [52]:
len(airtable1)

1256

In [53]:
airtable1.head(2)

Unnamed: 0,airtable_orgs_itp_id,airtable_orgs_name,airtable_orgs_gtfs_realtime_status,airtable_orgs_key,airtable_bridge_service_key,airtable_bridge_service_name,airtable_bridge_organization_name,airtable_bridge_organization_key
0,,Trinity County Transportation Commission,RT Incomplete,recZlzHonm4IQQLW8,,,,
1,385.0,San Bernardino County Transportation Authority,RT Incomplete,recQp6uQAUFcenmQ6,,,,


### Airtable - California Services

In [54]:
%%sql -o df_services
SELECT 
  name,
  service_type,
  key
FROM cal-itp-data-infra.mart_transit_database.dim_services

Unnamed: 0,name,service_type,key
0,Mariposa Medi-Trans,[NEMT],recfXH5N8eFMV8MrB
1,West Los Angeles DAV Van,[NEMT],recrk2eRi8EKdvyzZ
2,Community Activities and Rehabilitation Transportation,"[NEMT, on-demand, reservations]",recNbrdkmpqQWKXCe
3,Alpine County Transit,[on-demand],recfDW6TcHhQoIRJT
4,The Arc of Amador and Calaveras,[on-demand],recTJcVi30TStxrMh
5,Regional Center for the East Bay,[on-demand],recpgkJUsTaJ5yDmt
6,Inyo-Mono Association for the Handicapped,[on-demand],recpfBPIPPGeEEnaO
7,Self-Help for the Elderly,[on-demand],rec2xKEgiFewj0AKH
8,Casa Allegra,[on-demand],rec80AID7KnWD4muK
9,Friends of Children with Special Needs,[on-demand],recoK36zDqpFz1ERo


In [55]:
# Add prefix
df_services = df_services.add_prefix("airtable_services_")

In [56]:
# https://stackoverflow.com/questions/64795187/pandas-dataframe-possible-to-remove-list-formatting-from-values-inside
# Turn service_type column into just regular values, not a list
df_services[
    "airtable_services_service_type"
] = df_services.airtable_services_service_type.str.join(",")

In [57]:
df_services.head(2)

Unnamed: 0,airtable_services_name,airtable_services_service_type,airtable_services_key
0,Mariposa Medi-Trans,NEMT,recfXH5N8eFMV8MrB
1,West Los Angeles DAV Van,NEMT,recrk2eRi8EKdvyzZ


#### Merge table with services type with bridge-organization dataframe

In [58]:
# Merge
airtable2 = pd.merge(
    airtable1,
    df_services,
    how="left",
    left_on=["airtable_bridge_service_key"],
    right_on=["airtable_services_key"],
)

In [59]:
airtable2.shape

(1256, 11)

In [60]:
# Group the merged dataframe to summarize it down
airtable_group = (
    airtable2.groupby(
        [
            "airtable_orgs_name",
            "airtable_orgs_gtfs_realtime_status",
            "airtable_services_service_type",
        ]
    )
    .agg(
        {
            "airtable_services_key": "max",
            "airtable_orgs_itp_id": "max",
            "airtable_orgs_key": "max",
        }
    )
    .reset_index()
)

In [61]:
airtable_group.shape

(699, 6)

In [62]:
# An organization can have many different services, thus have many dfiferent services types but these are all split among different rows
# Put all elements onto one line, so each org will only have one row instead of multiple ones
airtable_group2 = utils.summarize_rows(
    airtable_group, "airtable_orgs_name", "airtable_services_service_type"
)

In [63]:
# Merge grouped df together to get service key again
airtable_group3 = pd.merge(
    airtable_group,
    airtable_group2,
    how="left",
    left_on=["airtable_orgs_name"],
    right_on=["airtable_orgs_name"],
)

In [64]:
# Drop the disaggregated service type
airtable_group3 = airtable_group3.drop(columns=["airtable_services_service_type_x"])

In [65]:
airtable_group3 = airtable_group3.drop_duplicates(subset=["airtable_orgs_name"])

In [66]:
airtable_group3.shape, airtable_group3["airtable_orgs_name"].nunique()

((445, 6), 445)

In [67]:
airtable_group3.head()

Unnamed: 0,airtable_orgs_name,airtable_orgs_gtfs_realtime_status,airtable_services_key,airtable_orgs_itp_id,airtable_orgs_key,airtable_services_service_type_y
0,ABC Shuttle,RT Incomplete,recH3LvSBaPU5ziZb,,recPvvcm2Ux33tVCU,on-demand
1,ARC Imperial Valley,RT Incomplete,reczYtybeOtlor81j,398.0,recKItoMN0y9EIsvR,"ADA paratransit,NEMT,on-demand"
2,Able Inc.,RT Incomplete,recG3OV7C6lBbK0oa,,rec1jnQxF4U8ElLtn,on-demand
3,Abrazar Inc.,RT Incomplete,reczYPXKwVK0OcgLd,,recXhL9MKpEktKEXU,NEMT
4,Access Services,RT Incomplete,recKWjRmhK1KSKWwx,1.0,recJtH0Ae8YNo01aj,ADA paratransit


## Merge Airtable DF from above with M2 (BlackCat & Fare-Systems)
* Maybe try with service key later too?

In [68]:
m3 = pd.merge(
    m2,
    airtable_group3,
    how="left",
    left_on=["organization_name"],
    right_on=["airtable_orgs_name"],
    indicator=True,
)

In [69]:
m3["_merge"].value_counts()

both          110
left_only      68
right_only      0
Name: _merge, dtype: int64

In [70]:
# left_only = m3.loc[m3["_merge"] == "left_only"]
# left_only["organization_name"].sort_values().unique().tolist()

In [71]:
m3[["organization_name", "airtable_orgs_name", "_merge"]].sort_values(
    "organization_name"
)

Unnamed: 0,organization_name,airtable_orgs_name,_merge
93,ARC Imperial Valley,ARC Imperial Valley,both
132,Alegria Community Living,,left_only
25,Alpine County Community Development,,left_only
26,Amador Transit,,left_only
92,Angel View,,left_only
174,Area 1 Agency on Aging,,left_only
94,Asian Community Center of Sacramento Valley,Asian Community Center of Sacramento Valley,both
133,Bay Area Outreach & Recreation Program,Bay Area Outreach & Recreation Program,both
6,Butte County Association of Governments Butte Regional Transit,Butte County Association of Governments Butte Regional Transit,both
2,Calaveras Transit Agency,Calaveras Transit Agency,both


## NTD 

In [72]:
df_vehicles = utils.clean_vehicles_data()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [73]:
# Add prefix
df_vehicles = df_vehicles.add_prefix("ntd_")

In [74]:
# Keep certain cols
df_vehicles2 = df_vehicles[["ntd_agency", "ntd_total_vehicles"]]

In [75]:
# Clean organization name
df_vehicles2 = utils.organization_cleaning(df_vehicles2, "ntd_agency")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [76]:
# Keep everything before dba
df_vehicles2["ntd_agency"] = df_vehicles2["ntd_agency"].str.split("dba: ").str[-1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [77]:
# Clean up names to match M3
df_vehicles2["ntd_agency"] = df_vehicles2["ntd_agency"].replace(
    {
        "Butte County Association of Governments": "Butte County Association of Governments Butte Regional Transit",
        "Calaveras Transit Agency": "Calaveras Transit",
        "Wasco": "City of Wasco",
        "Tulare County Area Transit": "Tulare County Regional Transportation Agency",
        "El Dorado County Transit Authority": "El Dorado Transit",
        "Glenn Transit Service": "Glenn County Transportation Commission",
        "Kern Transit": "Kern Regional Transit",
        "Livermore  Amador Valley Transit Authority": "Livermore Amador Valley Transit Authority",
        "Madera County, dba: Madera County Connection Transit": "Madera County",
        "Marin Transit": "Marin County Transit District",
        "Mountain Area Regional Transit Authority, dba: Mountain Transit": "Mountain Area Regional Transit Authority",
        "OmniTrans": "Omnitrans",
        "Riverside Transit": "Riverside Transit Agency",
        "San Mateo County Transit District, dba: SamTrans": "San Mateo County Transit District",
        "Stanislaus County": "Stanislaus County Public Works - Transit Division",
        "Tehama County": "Tehama County Transit Agency",
        "Transit Joint Powers Authority for Merced County": "Transit Joint Powers Authority for Merced County",
        "Trinity County": "Trinity County Department of Transportation",
        "Tulare County Area Transit": "Tulare County Regional Transportation Agency",
        "Victor Valley Transit": "Victor Valley Transit Authority",
        "Yolo County Transportation District": "Yolobus",
        "Yurok Tribe": "Yurok Tribe Transit",
        "Tuolumne County Transit": "Tuolumne County Transit Agency ",
        "SunLine Transit Agency": "Sunline Transit Agency",
        "Mariposa County Transit": "County of Mariposa",
        "City of Visalia ": "City of Visalia",
        "City of Corcoran": "City of Corcoran - Corcoran Area Transit",
        "Calaveras Transit": "Calaveras Transit Agency",
        "Amador Regional Transit System": "Amador Transit",
    }
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [78]:
m4 = pd.merge(
    m3,
    df_vehicles2,
    how="left",
    left_on=["organization_name"],
    right_on=["ntd_agency"],
)

In [79]:
# df_vehicles2["ntd_agency"].sort_values().unique().tolist()

## Export/Final Clean Up

In [80]:
# Percentage of null values in each column
m4.isnull().sum() * 100 / len(m4)

5311_5310_overlap                        0.000000
organization_name                        0.000000
service_components_fare_component       88.202247
service_components_fare_product         88.202247
service_components_AVL_CAD_component    91.011236
service_components_AVL_CAD_product      91.011236
service_components_service_key          91.011236
fare_systems_electronic_fare_program    94.382022
fare_systems_ticket_media               76.966292
fare_systems_payment_accepted           78.651685
airtable_orgs_name                      38.202247
airtable_orgs_gtfs_realtime_status      38.202247
airtable_services_key                   38.202247
airtable_orgs_itp_id                    45.505618
airtable_orgs_key                       38.202247
airtable_services_service_type_y        38.202247
_merge                                   0.000000
ntd_agency                              56.179775
ntd_total_vehicles                      56.179775
dtype: float64

In [81]:
# Columns to fill in empty rows with N/A
columns_for_na = [
    "service_components_fare_component",
    "service_components_fare_product",
    "service_components_AVL_CAD_component",
    "service_components_AVL_CAD_product",
    "service_components_service_key",
    "fare_systems_electronic_fare_program",
    "fare_systems_ticket_media",
    "airtable_orgs_name",
    "airtable_orgs_gtfs_realtime_status",
    "airtable_orgs_itp_id",
    "airtable_orgs_key",
    "airtable_services_service_type_y",
    "ntd_agency",
    "ntd_total_vehicles",
]

In [82]:
m4[columns_for_na] = m4[columns_for_na].fillna("N/A")

In [84]:
# Some of the service types can be duplicated a few times, delete the duplicates
m4["airtable_services_service_type_y"] = m4["airtable_services_service_type_y"].apply(
   lambda x: ", ".join(set([y.strip() for y in x.split(",")])) )

In [85]:
# Clean up column names
m4 = utils.clean_up_columns(m4)

In [86]:
# Drop columns that aren't relevant
m4 = m4.drop(
    columns=[
        "Airtable Orgs Key",
        "Merge",
        "Airtable Orgs Name",
        "Ntd Agency",
        "Airtable Services Key",
        "Service Components Service Key",
    ]
)

In [87]:
m4.sort_values("Organization Name")

Unnamed: 0,5311 5310 Overlap,Organization Name,Service Components Fare Component,Service Components Fare Product,Service Components Avl Cad Component,Service Components Avl Cad Product,Fare Systems Electronic Fare Program,Fare Systems Ticket Media,Fare Systems Payment Accepted,Airtable Orgs Gtfs Realtime Status,Airtable Orgs Itp Id,Airtable Services Service Type Y,Ntd Total Vehicles
93,5310 only,ARC Imperial Valley,,,,,,,,RT Incomplete,398.0,"on-demand, ADA paratransit, NEMT",
132,5310 only,Alegria Community Living,,,,,,,,,,,
25,5311 only,Alpine County Community Development,,,,,,,,,,,
26,Both 5311 and 5310,Amador Transit,Cash Farebox,Cashbox,AVL Software,Trackit,,,cash,,,,24.0
92,5310 only,Angel View,,,,,,,,,,,
174,5310 only,Area 1 Agency on Aging,,,,,,,,,,,
94,5310 only,Asian Community Center of Sacramento Valley,,,,,,,,RT Incomplete,400.0,"on-demand, reservations, NEMT",
133,5310 only,Bay Area Outreach & Recreation Program,,,,,,,,RT Incomplete,402.0,event,
6,Both 5311 and 5310,Butte County Association of Governments Butte Regional Transit,,,,,,,,RT Incomplete,47.0,"reservations, on-demand, fixed-route, ADA paratransit",53.0
2,5311 only,Calaveras Transit Agency,,,,,,,,RT Incomplete,50.0,"on-demand, deviated fixed-route, reservations",


In [88]:

with pd.ExcelWriter(
    "gs://calitp-analytics-data/data-analyses/5311-5310/5311_5310_payments.xlsx"
) as writer:
    m3.to_excel(writer, sheet_name="main", index=True)
