# 5311 and 5310 Applicant Payment First Look
* An agency in this analysis is a recipient of 5311 funds, 5310 funds, or both

In [1]:
import calitp_data_analysis.magics
import pandas as pd
import utils
from calitp_data_analysis.sql import *
from calitp_data_analysis.tables import tbls
from siuba import *

# Notebook display settings: show up to 100 columns, and never truncate
# the number of rows or the width of a cell's text.
pd.set_option("display.max_columns", 100)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None



## Black Cat

In [None]:
# Read in Black Cat: 5311 and 5310 recipients with projects in past 2 years
# (a parquet file stored in the calitp-analytics-data bucket).
df_bc = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/5311-5310/5311_5310_applicants.parquet",
    engine="auto",
)

In [None]:
# Clean up organization names
# organization_name is the left-hand key of every merge further down, so it is
# normalized first. utils.organization_cleaning is defined outside this notebook --
# NOTE(review): confirm exactly which normalizations it applies (whitespace, casing, ...).
df_bc = utils.organization_cleaning(df_bc, "organization_name")

In [None]:
# df_bc["organization_name"].sort_values().unique().tolist()

## Airtable - Transit Stacks, Service Components

In [None]:
%%sql -o df_service_components
-- Service, product and component names plus the service key from dim_service_components
-- The result is used below as df_service_components
SELECT 
  service_name,
  product_name,
  component_name,
  service_key
FROM cal-itp-data-infra.mart_transit_database.dim_service_components


In [None]:
# df_service_components["service_components_service_name"].sort_values().unique()

In [None]:
# Add prefix
# Every column gets a "service_components_" prefix so its origin stays visible
# after the merges below.
df_service_components = df_service_components.add_prefix("service_components_")

In [None]:
# Clean organization name
# The service name is the column later matched against Black Cat's organization_name.
df_service_components = utils.organization_cleaning(
    df_service_components, "service_components_service_name"
)

In [None]:
# df_service_components["service_components_service_name"].sort_values().unique().tolist()

In [None]:
# Clean up Transit Stacks names to match blackcat
# Keys are service names as they appear in Transit Stacks; values are the spelling
# used for the same agency in Black Cat's organization_name (the merge key below).
# Each key is listed exactly once (a repeated key in a dict literal is silently
# overwritten by its last occurrence) and names that already match are omitted,
# since replacing a value with itself does nothing.
# NOTE(review): "Solano Tranportation Authority" (sic) is spelled this way in every
# name map in this notebook, presumably mirroring Black Cat -- check the data
# before correcting the spelling.
# NOTE(review): a mapping from "Palos Verdes Peninsula Transit Authority" to
# "Palo Verde Valley Transit Agency" was removed here. The two names appear to
# belong to different agencies, so the mapping would attach one agency's
# components to the other -- restore it only if the data says otherwise.
df_service_components["service_components_service_name"] = df_service_components[
    "service_components_service_name"
].replace(
    {
        "Solano Express": "Solano Tranportation Authority",
        "Santa Cruz METRO": "Santa Cruz Metropolitan Transit District",
        "MTS Bus": "San Diego Metropolitan Transit System",
        "VTA Bus": "Santa Clara Valley Transportation Authority",
        "Redwood Coast Transit": "Redwood Coast Transit Authority",
        "Mountain Transit": "Mountain Area Regional Transit Authority",
        "Marin Transit": "Marin County Transit District",
        "Visalia Transit": "City of Visalia",
        "Lassen Rural Bus": "Lassen Transit Service Agency",
        "Ojai Trolley": "City of Ojai",
        "Siskiyou Transit and General Express": "County of Siskiyou",
        "Madera County Connection": "Madera County",
        "Tulare County Area Transit": "Tulare County Regional Transportation Agency",
        "Santa Maria Area Transit": "City of Santa Maria",
        "OmniTrans": "Omnitrans",
        "Eastern Sierra Transit Authority Community Routes": "Eastern Sierra Transit Authority",
        "Fresno County Rural Transit": "Fresno County Rural Transit Agency",
        "El Dorado Transit": "El Dorado County Transit Authority",
        "Kern Transit": "Kern Regional Transit",
        "Riverside Transit Agency": "Riverside Transit",
        "Tuolumne County Transit": "Tuolumne County Transit Agency",
        "Yuba-Sutter Transit": "Yuba-Sutter Transit Authority",
        "County of Sonoma": "Sonoma County Transit",
        "SunLine Transit": "Sunline Transit Agency",
        "Trinity Transit": "Trinity County Department of Transportation",
    }
)

#### To make things a little clearer: have 2 columns for fare related products/comps and 2 columns for CAD/AVL related products/comps

In [None]:
# List of fare
fare_comps = [
    "Mobile ticketing",
    "Cash Farebox",
    "Fare card system",
    "Ticket Vending Machines",
    "Contactless Payment Validators",
    "Payment processor",
]

In [None]:
# Subset dataframe for fare related components
# This function grabs all the different components/products a company purchases and places it on one line.
df_fare_comps = utils.service_comps_summarize(df_service_components, fare_comps)

In [None]:
df_fare_comps = df_fare_comps.rename(
    columns={
        "service_components_component_name": "service_components_fare_component",
        "service_components_product_name": "service_components_fare_product",
    }
)

In [None]:
# Subset out for CAD/AVL
cad_avl_comps = [
    "AVL Software",
    "Location Sensors",
    "AVL On-board Computer",
]

In [None]:
# Subset dataframe for cad/avl
df_cad_avl = utils.service_comps_summarize(df_service_components, cad_avl_comps)

In [None]:
df_cad_avl = df_cad_avl.rename(
    columns={
        "service_components_component_name": "service_components_AVL_CAD_component",
        "service_components_product_name": "service_components_AVL_CAD_product",
    }
)

In [None]:
df_cad_avl["service_components_service_name"].nunique(), df_fare_comps[
    "service_components_service_name"
].nunique()

### Merge Fare Components with Black Cat: M1

In [None]:
# Merge transit stacks with BC
m1 = pd.merge(
    df_bc,
    df_fare_comps,
    how="left",
    left_on=["organization_name"],
    right_on=["service_components_service_name"],
)

In [None]:
m1.shape

In [None]:
len(m1)

In [None]:
m1 = m1.drop(columns=["service_components_service_name"])

### Merge AVL/CAD Components 

In [None]:
# Merge BC with fare info along with AVL/CAD info
# Left join again, so organizations without CAD/AVL components are kept with
# empty CAD/AVL columns.
m1 = m1.merge(
    df_cad_avl,
    how="left",
    left_on="organization_name",
    right_on="service_components_service_name",
)

In [None]:
# Merge in service keys which got lost somewhere...
m1 = pd.merge(
    m1,
    df_service_components,
    how="left",
    on="service_components_service_name",
)

In [None]:
m1 = m1.drop(
    columns=[
        "service_components_service_name",
        "service_components_component_name",
        "service_components_product_name",
    ]
)

In [None]:
m1 = m1.drop_duplicates()

In [None]:
m1.shape

## Airtable - Fare Systems 
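* Previously read from a CSV export; the cells below now query the warehouse table (`mart_transit_database.dim_fare_systems`), and the old CSV read is kept only as a disabled snippet.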

In [None]:
"""
df_fare = to_snakecase(
    pd.read_csv(
        "gs://calitp-analytics-data/data-analyses/5311-5310/fare systems-Grid view.csv"
    )
)
"""

In [None]:
%%sql -o df_fare
-- Fare system name with its electronic fare program, ticket media and accepted payment types
-- The result is used below as df_fare
SELECT 
  fare_system,
  electronic_fare_program,
  ticket_media, 
  payment_accepted
FROM cal-itp-data-infra.mart_transit_database.dim_fare_systems

In [None]:
# Only grab columns I'm interested in
# These are the same four columns the SQL query above selects, so the subset
# below currently keeps every column.
cols_wanted = [
    "fare_system",
    "electronic_fare_program",
    "ticket_media",
    "payment_accepted",
]

In [None]:
# Subset dataframe
df_fare2 = df_fare[cols_wanted]

In [None]:
# Keep only the rows with at least 2 non-NA values.
# With four columns, rows that have fewer than two of them filled in are dropped.
df_fare3 = df_fare2.dropna(thresh=2)

In [None]:
# Compare the two dataframes
# The difference in row counts is the number of rows removed by dropna.
df_fare2.shape, df_fare3.shape

In [None]:
# Add prefix
# Marks these columns as coming from the fare systems table after merging.
df_fare3 = df_fare3.add_prefix("fare_systems_")

In [None]:
# Clean organization name
# The fare system name is the column later matched against organization_name.
df_fare3 = utils.organization_cleaning(df_fare3, "fare_systems_fare_system")

In [None]:
# df_fare3["fare_systems_fare_system"].sort_values().unique().tolist()

In [None]:
# Clean up Fare Systems names to match Black Cat
# Keys are fare system names as they appear in the fare systems table; values are
# the organization_name spelling that the merge below joins on.
# Each key is listed exactly once (a repeated key in a dict literal is silently
# overwritten by its last occurrence) and names mapped to themselves are omitted.
#
# NOTE(review): the following entries were corrected because they pointed one
# agency's fare system at what appears to be a different agency, which attaches
# the wrong fare data to that organization after the left merge. Please confirm
# the target spellings against Black Cat:
#   * "Auburn Transit" pointed at "City of Arvin" (the target of "Arvin Transit");
#     now "City of Auburn".
#   * "Corona Cruiser" pointed at "City of Corcoran - Corcoran Area Transit";
#     now "City of Corona".
#   * "Eastern Sierra Transit Authority Community Routes" pointed at
#     "El Dorado County Transit Authority"; now "Eastern Sierra Transit Authority",
#     the same target the Transit Stacks name map in this notebook uses.
#   * "Redding Area Bus Authority" pointed at "Redwood Coast Transit Authority";
#     the entry is removed so the name is left as-is.
# Two entries that renamed the Black Cat spelling away from itself were also
# removed ("Eastern Sierra Transit Authority" and "Fresno County Rural Transit
# Agency"); the Transit Stacks name map treats both as the Black Cat names.
df_fare3["fare_systems_fare_system"] = df_fare3["fare_systems_fare_system"].replace(
    {
        "Arcata and Mad River Transit System": "City of Arcata",
        "Arvin Transit": "City of Arvin",
        "Auburn Transit": "City of Auburn",
        "Guadalupe Flyer": "City of Guadalupe",
        "Plumas Transit Systems": "Plumas County Transportation Commission",
        "Ojai Trolley": "City of Ojai",
        "Needles Area Transit": "City of Needles",
        "Porterville Transit": "City of Porterville",
        "Ridgecrest Transit": "City of Ridgecrest",
        "Rio Vista Delta Breeze": "City of Rio Vista",
        "Madera County Connection": "Madera County",
        "SolTrans": "Solano Tranportation Authority",
        "Siskiyou Transit and General Express": "County of Siskiyou",
        "Mountain Transit": "Mountain Area Regional Transit Authority",
        "San Benito County Express": "San Benito County Local Transportation Authority",
        "Sage Stage": "Modoc Transportation Agency",
        "Vine Transit": "Napa Valley Transportation Authority",
        "Stanislaus Regional Transit": "Stanislaus Regional Transit Authority",
        "Amador Regional Transit System": "Amador Transit",
        "Calaveras Transit Agency": "Calaveras Transit",
        "Marin Transit": "Marin County Transit District",
        "SunLine Transit Agency": "Sunline Transit Agency",
        "Victor Valley Transit": "Victor Valley Transit Authority",
        "Yolobus": "Yolo County Transportation District",
        "Corona Cruiser": "City of Corona",
        "Dinuba Area Regional Transit": "City of Dinuba",
        "Eastern Sierra Transit Authority Community Routes": "Eastern Sierra Transit Authority",
        "El Dorado Transit": "El Dorado County Transit Authority",
        "Fresno County Rural Transit": "Fresno County Rural Transit Agency",
        "Glenn Transit Service": "Glenn County Transportation Commission",
        "Kern Transit": "Kern Regional Transit",
        "Lake Transit": "Lake Transit Authority",
        "Merced The Bus": "Transit Joint Powers Authority for Merced County",
        "OmniTrans": "Omnitrans",
        "Visalia Transit": "City of Visalia",
    }
)

In [None]:
# df_fare3.sort_values('fare_systems_fare_system')

#### Merge 2: Fare-Systems with BlackCat/Transit Stacks

In [None]:
# Merge faresystems with m1
m2 = pd.merge(
    m1,
    df_fare3,
    how="left",
    left_on=["organization_name"],
    right_on=["fare_systems_fare_system"],
    indicator=True,
)

In [None]:
m2["_merge"].value_counts()

In [None]:
# Drop unwanted cols
m2 = m2.drop(columns=["fare_systems_fare_system", "_merge"])

In [None]:
len(m2)

In [None]:
# m2.sort_values("organization_name")

## Airtable - Service Types & GTFS Status
* Bring in organizations table
* Merge it with bridge table
* Take the merge from above (Black Cat + transit stacks + fare systems) and merge it with services table
* Now the merge has multiple rows for the same organization, as an organization can run multiple services
* Aggregate the merged result so that each organization ends up with exactly one row


### Airtable - Organizations

In [None]:
%%sql -o df_orgs
-- Organization name, key and GTFS realtime status, with itp_id cast to an integer
-- The result is used below as df_orgs
SELECT 
  CAST(itp_id AS INT) AS itp_id,
  name,
  gtfs_realtime_status,
  key
FROM cal-itp-data-infra.mart_transit_database.dim_organizations

In [None]:
# Clean up org names
# Cleaning runs on the un-prefixed "name" column, i.e. before the prefix is added.
df_orgs = utils.organization_cleaning(df_orgs, "name")

In [None]:
# Add prefix
# Marks these columns as coming from the Airtable organizations table.
df_orgs = df_orgs.add_prefix("airtable_orgs_")

In [None]:
# df_orgs["airtable_orgs_name"].sort_values().unique().tolist()

In [None]:
# Clean up names to match m2
# Keys are Airtable organization names; values are the organization_name spelling
# that the merge with m2 joins on. Each key is listed exactly once.
#
# "Sonoma County" used to appear twice ("Sonoma County Transit", then
# "County of Sonoma"). A dict literal keeps only the last occurrence, so the
# effective mapping was always "County of Sonoma"; the shadowed entry is removed
# and the effective one kept.
# NOTE(review): the Transit Stacks name map in this notebook treats
# "Sonoma County Transit" as the Black Cat spelling -- confirm which target is right.
# NOTE(review): "Marin County Transit District" and "Marin Transit" are mapped to
# each other below, which swaps the two names instead of unifying them; one of
# the two entries is probably unintended.
# NOTE(review): two target values carry invisible characters (a trailing zero-width
# space on the Klamath Trinity name, a trailing space on "Placer County Public
# Works "). They are kept byte-for-byte on the assumption that they mirror the raw
# Black Cat names -- verify against the data.
df_orgs["airtable_orgs_name"] = df_orgs["airtable_orgs_name"].replace(
    {
        "City of Tulare": "Tulare County Regional Transportation Agency",
        "Glenn County": "Glenn County Transportation Commission",
        "Plumas County": "Plumas County Transportation Commission",
        "Solano County Transit": "Solano Tranportation Authority",
        "Trinity County": "Trinity County Department of Transportation",
        "SunLine Transit Agency": "Sunline Transit Agency",
        "OmniTrans": "Omnitrans",
        "Lutheran Social Services": "Lutheran Social Services of Southern California",
        "Marin County Transit District": "Marin Transit",
        "Greyhound": "Greyhound Lines",
        "Calaveras County": "Calaveras Transit Agency",
        "Dignity Health": "Dignity Health Connected Living",
        "Butte County Association of Governments": "Butte County Association of Governments Butte Regional Transit",
        "City of Corcoran": "City of Corcoran - Corcoran Area Transit",
        "City of Lafayette": "City of Lafayette: Lamorinda Sprit Van Program",
        "Sonoma County": "County of Sonoma",
        "Siskiyou County": "County of Siskiyou",
        "Tulare County": "Tulare County Regional Transportation Agency",
        "Nevada County": "County of Nevada Public Works",
        "Shasta County": "County of Shasta Department of Public Works",
        "El Dorado County Transit Authority": "El Dorado Transit",
        "Klamath Trinity Non-Emergency Transportation": "Klamath Trinity Non-Emergency Transportation\u200b",
        "Livermore  Amador\n  Valley Transit Authority": "Livermore Amador Valley Transit Authority",
        "Marin Transit": "Marin County Transit District",
        "Modoc Transportation Authority": "Modoc Transportation Agency",
        "On Lok": "On Lok Senior Health Services",
        "Placer County": "Placer County Public Works ",
        "Stanislaus County": "Stanislaus County Public Works - Transit Division",
        "Tehama County": "Tehama County Transit Agency",
        "Tuolumne County Transit Agency": "Tuolumne County Transit",
        "Victor Valley Transit Authority": "Victor Valley Transit",
        "Vivalon Inc.": "Vivalon",
        "Yurok Tribe": "Yurok Tribe Transit",
        "Amador County": "Amador Transit",
    }
)

In [None]:
# df_orgs.sort_values('airtable_orgs_name')

In [None]:
# Make sure each row is unique
len(df_orgs), df_orgs["airtable_orgs_key"].nunique()

### Airtable - Bridge

In [None]:
%%sql -o df_bridge
-- Organization to service pairs, with keys and names for both sides
-- The result is used below as df_bridge
SELECT 
  service_key,
  service_name,
  organization_name,
  organization_key
FROM cal-itp-data-infra.mart_transit_database.bridge_organizations_x_services_managed

In [None]:
# Add prefix
# Marks these columns as coming from the organizations-x-services bridge table.
df_bridge = df_bridge.add_prefix("airtable_bridge_")

#### Merge df_orgs with df_bridge 

In [None]:
# 1:m since one org can have many services
# validate="1:m" makes pandas raise if an organization key is not unique on the
# df_orgs side. The left join keeps organizations that have no bridge rows.
airtable1 = df_orgs.merge(
    df_bridge,
    how="left",
    left_on="airtable_orgs_key",
    right_on="airtable_bridge_organization_key",
    validate="1:m",
)

In [None]:
len(airtable1)

In [None]:
airtable1.head(2)

### Airtable - California Services

In [None]:
%%sql -o df_services
-- Service name, service type and key from dim_services
-- The result is used below as df_services
SELECT 
  name,
  service_type,
  key
FROM cal-itp-data-infra.mart_transit_database.dim_services

In [None]:
# Add prefix
df_services = df_services.add_prefix("airtable_services_")

In [None]:
# https://stackoverflow.com/questions/64795187/pandas-dataframe-possible-to-remove-list-formatting-from-values-inside
# Turn service_type column into just regular values, not a list
df_services[
    "airtable_services_service_type"
] = df_services.airtable_services_service_type.str.join(",")

In [None]:
df_services.head(2)

#### Merge table with services type with bridge-organization dataframe

In [None]:
# Merge
# Attach the service type to every organization/service pair via the service key.
airtable2 = airtable1.merge(
    df_services,
    how="left",
    left_on="airtable_bridge_service_key",
    right_on="airtable_services_key",
)

In [None]:
airtable2.shape

In [None]:
# Group the merged dataframe to summarize it down
# One row per (organization name, realtime status, service type); the service
# key, ITP id and organization key are each collapsed with max.
# NOTE(review): groupby drops rows whose grouping values are null by default, so
# organizations with no GTFS realtime status or no service type do not appear in
# the result -- confirm that is intended.
airtable_group = airtable2.groupby(
    [
        "airtable_orgs_name",
        "airtable_orgs_gtfs_realtime_status",
        "airtable_services_service_type",
    ],
    as_index=False,
).agg(
    {
        "airtable_services_key": "max",
        "airtable_orgs_itp_id": "max",
        "airtable_orgs_key": "max",
    }
)

In [None]:
airtable_group.shape

In [None]:
# An organization can have many different services, thus have many dfiferent services types but these are all split among different rows
# Put all elements onto one line, so each org will only have one row instead of multiple ones
airtable_group2 = utils.summarize_rows(
    airtable_group, "airtable_orgs_name", "airtable_services_service_type"
)

In [None]:
# Merge grouped df together to get service key again
airtable_group3 = pd.merge(
    airtable_group,
    airtable_group2,
    how="left",
    left_on=["airtable_orgs_name"],
    right_on=["airtable_orgs_name"],
)

In [None]:
# Drop the disaggregated service type
airtable_group3 = airtable_group3.drop(columns=["airtable_services_service_type_x"])

In [None]:
airtable_group3 = airtable_group3.drop_duplicates(subset=["airtable_orgs_name"])

In [None]:
airtable_group3.shape, airtable_group3["airtable_orgs_name"].nunique()

In [None]:
airtable_group3[
    [
        "airtable_orgs_name",
        "airtable_orgs_gtfs_realtime_status",
        "airtable_orgs_itp_id",
        "airtable_services_service_type_y",
    ]
].sort_values("airtable_orgs_name")

## Merge Airtable DF from above with M2 (BlackCat & Fare-Systems)
* Maybe try with service key later too?

In [None]:
m3 = pd.merge(
    m2,
    airtable_group3,
    how="left",
    left_on=["organization_name"],
    right_on=["airtable_orgs_name"],
    indicator=True,
)

In [None]:
m3["_merge"].value_counts()

In [None]:
# left_only = m3.loc[m3["_merge"] == "left_only"]
# left_only["organization_name"].sort_values().unique().tolist()

In [None]:
"""
m3[["organization_name", "airtable_orgs_name", "_merge"]].sort_values(
    "organization_name"
)
"""

## NTD 

In [None]:
df_vehicles = utils.clean_vehicles_data()

In [None]:
# Add prefix
df_vehicles = df_vehicles.add_prefix("ntd_")

In [None]:
# Keep certain cols
df_vehicles2 = df_vehicles[["ntd_agency", "ntd_total_vehicles"]]

In [None]:
# Clean organization name
df_vehicles2 = utils.organization_cleaning(df_vehicles2, "ntd_agency")

In [None]:
# Keep everything before dba
df_vehicles2["ntd_agency"] = df_vehicles2["ntd_agency"].str.split("dba: ").str[-1]

In [None]:
# Clean up names to match M3
# Keys are NTD agency names (after the "dba: " split); values are the
# organization_name spelling that the merge below joins on.
# Each key is listed exactly once (a repeated key in a dict literal is silently
# overwritten by its last occurrence) and names mapped to themselves are omitted.
# NOTE(review): "Calaveras Transit Agency" and "Calaveras Transit" are mapped to
# each other, which swaps the two names instead of unifying them; one of the two
# entries is probably unintended.
# NOTE(review): "Riverside Transit" -> "Riverside Transit Agency" and
# "El Dorado County Transit Authority" -> "El Dorado Transit" go in the opposite
# direction to the Transit Stacks name map in this notebook -- confirm which
# spelling Black Cat uses.
# NOTE(review): the target "Tuolumne County Transit Agency " ends with a space.
# It is kept byte-for-byte on the assumption that it mirrors the raw Black Cat
# name -- verify against the data.
df_vehicles2["ntd_agency"] = df_vehicles2["ntd_agency"].replace(
    {
        "Butte County Association of Governments": "Butte County Association of Governments Butte Regional Transit",
        "Calaveras Transit Agency": "Calaveras Transit",
        "Wasco": "City of Wasco",
        "Tulare County Area Transit": "Tulare County Regional Transportation Agency",
        "El Dorado County Transit Authority": "El Dorado Transit",
        "Glenn Transit Service": "Glenn County Transportation Commission",
        "Kern Transit": "Kern Regional Transit",
        "Marin Transit": "Marin County Transit District",
        "OmniTrans": "Omnitrans",
        "Riverside Transit": "Riverside Transit Agency",
        "Stanislaus County": "Stanislaus County Public Works - Transit Division",
        "Tehama County": "Tehama County Transit Agency",
        "Trinity County": "Trinity County Department of Transportation",
        "Victor Valley Transit": "Victor Valley Transit Authority",
        "Yolobus": "Yolo County Transportation District",
        "Yurok Tribe": "Yurok Tribe Transit",
        "Tuolumne County Transit": "Tuolumne County Transit Agency ",
        "SunLine Transit Agency": "Sunline Transit Agency",
        "Mariposa County Transit": "County of Mariposa",
        "City of Visalia ": "City of Visalia",
        "City of Corcoran": "City of Corcoran - Corcoran Area Transit",
        "Calaveras Transit": "Calaveras Transit Agency",
        "Amador Regional Transit System": "Amador Transit",
    }
)

In [None]:
m4 = pd.merge(
    m3,
    df_vehicles2,
    how="left",
    left_on=["organization_name"],
    right_on=["ntd_agency"],
)

In [None]:
df_vehicles2.sort_values("ntd_agency")

## Export/Final Clean Up

In [None]:
# Percentage of null values in each column
m4.isnull().sum() * 100 / len(m4)

In [None]:
# Columns to fill in empty rows with N/A
# These are the columns added by the Transit Stacks, fare systems, Airtable and
# NTD merges; the columns that came from Black Cat are not in the list.
columns_for_na = [
    "service_components_fare_component",
    "service_components_fare_product",
    "service_components_AVL_CAD_component",
    "service_components_AVL_CAD_product",
    "service_components_service_key",
    "fare_systems_electronic_fare_program",
    "fare_systems_ticket_media",
    "airtable_orgs_name",
    "airtable_orgs_gtfs_realtime_status",
    "airtable_orgs_itp_id",
    "airtable_orgs_key",
    "airtable_services_service_type_y",
    "ntd_agency",
    "ntd_total_vehicles",
    "fare_systems_payment_accepted",
]

In [None]:
# Missing values become the literal string "N/A". This also applies to the ITP id
# and vehicle count columns, which therefore hold a mix of values and strings.
m4[columns_for_na] = m4[columns_for_na].fillna("N/A")

In [None]:
# Some of the service types can be duplicated a few times, delete the duplicates.
# dict.fromkeys keeps the first occurrence of each type in its original order.
# The earlier set()-based version could order the types differently from one run
# to the next, because iteration order of a set of strings is not stable across
# Python processes, which made the exported column change between runs.
m4["airtable_services_service_type_y"] = m4["airtable_services_service_type_y"].apply(
    lambda x: ", ".join(dict.fromkeys(y.strip() for y in x.split(",")))
)

In [None]:
# Clean up column names
# utils.clean_up_columns is defined outside this notebook. NOTE(review): the
# names dropped below assume it turns e.g. "airtable_orgs_key" into
# "Airtable Orgs Key" and the "_merge" indicator into "Merge" -- confirm.
m4 = utils.clean_up_columns(m4)

In [None]:
# Drop columns that aren't relevant
# These are join keys, the merge indicator and the duplicate name columns that
# were only needed while combining the sources.
m4 = m4.drop(
    columns=[
        "Airtable Orgs Key",
        "Merge",
        "Airtable Orgs Name",
        "Ntd Agency",
        "Airtable Services Key",
        "Service Components Service Key",
    ]
)

In [None]:
# m4[["Organization Name", "Ntd Total Vehicles"]].sort_values("Organization Name")

In [None]:
"""
with pd.ExcelWriter(
    "gs://calitp-analytics-data/data-analyses/5311-5310/5311_5310_payments.xlsx"
) as writer:
    m4.to_excel(writer, sheet_name="main", index=True)
    """