In [1]:
import pandas as pd
from calitp_data_analysis.sql import to_snakecase

In [2]:
# Notebook-wide pandas display settings: up to 100 columns, floats rendered
# with two decimals, and no truncation of rows or of long cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

## Catastrophic errors (expired feeds)

In [8]:
def load_catastrophic_errors(excel_file: str) -> pd.DataFrame:
    """Load the expired-feed export and trim it to the columns used below.

    The workbook is read with ``pd.read_excel``, its column names are
    normalized with ``to_snakecase``, rows whose dataset URI contains
    "LACMTA" are dropped, and the remaining rows are sorted by service name
    and then date, both descending.

    Args:
        excel_file: Path of the Excel export to read.

    Returns:
        A DataFrame with the columns ``service_name``, ``district``, ``uri``
        and ``date``. The original row index is kept.
    """
    # Long export column names (after snake-casing) -> short names.
    # The dict order is also the output column order.
    short_names = {
        "dim_provider_gtfs_data_→_service_name": "service_name",
        "dim_county_geography_→_caltrans_district": "district",
        "dim_gtfs_datasets_→_uri": "uri",
    }

    expired = to_snakecase(pd.read_excel(excel_file))

    # Don't look at LACMTA stuff.
    # na=False: a blank URI cell would otherwise put NaN into the mask and
    # make the `~` below raise; rows without a URI are treated as not-LACMTA
    # and therefore kept.
    is_lacmta = expired["dim_gtfs_datasets_→_uri"].str.contains("LACMTA", na=False)

    return (
        expired.loc[~is_lacmta, [*short_names, "date"]]
        .sort_values(
            ["dim_provider_gtfs_data_→_service_name", "date"],
            ascending=[False, False],
        )
        .rename(columns=short_names)
    )

In [9]:
# Expired-feed rows from the Excel export, with LACMTA datasets removed.
cat_df = load_catastrophic_errors(
    excel_file="./feed_info_expired_feeds_2023-09-29T15_22_09.307845Z.xlsx"
)

  warn("Workbook contains no default style, apply openpyxl's default")


In [10]:
# Show the loaded expired-feed rows (sorted by service name, then date, both descending).
cat_df

Unnamed: 0,service_name,district,uri,date
8,TRACER,10,https://data.trilliumtransit.com/gtfs/tracy-ca-us/tracy-ca-us.zip,2023-09-28
18,TRACER,10,https://data.trilliumtransit.com/gtfs/tracy-ca-us/tracy-ca-us.zip,2023-09-27
28,TRACER,10,https://data.trilliumtransit.com/gtfs/tracy-ca-us/tracy-ca-us.zip,2023-09-26
38,TRACER,10,https://data.trilliumtransit.com/gtfs/tracy-ca-us/tracy-ca-us.zip,2023-09-25
48,TRACER,10,https://data.trilliumtransit.com/gtfs/tracy-ca-us/tracy-ca-us.zip,2023-09-24
58,TRACER,10,https://data.trilliumtransit.com/gtfs/tracy-ca-us/tracy-ca-us.zip,2023-09-23
68,TRACER,10,https://data.trilliumtransit.com/gtfs/tracy-ca-us/tracy-ca-us.zip,2023-09-22
78,TRACER,10,https://data.trilliumtransit.com/gtfs/tracy-ca-us/tracy-ca-us.zip,2023-09-21
88,TRACER,10,https://data.trilliumtransit.com/gtfs/tracy-ca-us/tracy-ca-us.zip,2023-09-20
98,TRACER,10,https://data.trilliumtransit.com/gtfs/tracy-ca-us/tracy-ca-us.zip,2023-09-19


In [37]:
def load_airtable(csv_file: str):
    """Read the Airtable issues CSV and keep the columns used for matching.

    Column names are normalized with ``to_snakecase`` and every missing value
    is replaced by the string "None" before the frame is narrowed, sorted by
    ``gtfs_datasets`` then ``services``, and re-indexed from zero.

    Args:
        csv_file: Path (or file-like object) handed to ``pd.read_csv``.

    Returns:
        A DataFrame with ``gtfs_datasets``, ``services``, ``issue_type``,
        ``description`` and a constant ``airtable_ticket`` column set to
        "Yes", which marks rows that came from Airtable once the frame is
        left-joined onto other data.
    """
    kept_columns = ["gtfs_datasets", "services", "issue_type", "description"]

    tickets = to_snakecase(pd.read_csv(csv_file)).fillna("None")

    return (
        tickets[kept_columns]
        .sort_values(["gtfs_datasets", "services"])
        .reset_index(drop=True)
        .assign(airtable_ticket="Yes")
    )

In [38]:
# Open Airtable data-quality issues, used below to check for existing tickets.
airtable = load_airtable(
    csv_file="./Transit Data Quality Issues-Open Issues by Distrcit (1).csv"
)

In [39]:
def summarize_cat(
    catastrophic_data: pd.DataFrame, airtable_data: pd.DataFrame
) -> None:
    """Display expired-feed day counts, alone and joined to Airtable tickets.

    Two tables are rendered with ``display``:

    1. one row per (``service_name``, ``uri``) with the number of ``date``
       entries, labelled "# of days with expired feed";
    2. that table left-joined to ``airtable_data`` on
       ``service_name == services``, so services without a matching Airtable
       row are still listed (with missing values in the Airtable columns).

    Args:
        catastrophic_data: Frame with ``service_name``, ``uri`` and ``date``
            columns.
        airtable_data: Frame with a ``services`` column to match on.

    Returns:
        None. The function only displays; it does not return the merged
        frame.
    """
    days_expired = (
        catastrophic_data.groupby(["service_name", "uri"])
        .agg({"date": "count"})
        .reset_index()
        .rename(columns={"date": "# of days with expired feed"})
    )
    display(days_expired)

    # Left join keeps every expired feed even when nobody has opened a ticket.
    with_tickets = days_expired.merge(
        airtable_data,
        left_on=["service_name"],
        right_on=["services"],
        how="left",
    )
    display(with_tickets)

In [40]:
# Expired-feed day counts per service, then the same table matched to Airtable tickets.
summarize_cat(catastrophic_data=cat_df, airtable_data=airtable)

Unnamed: 0,service_name,uri,# of days with expired feed
0,Morro Bay Transit,https://mjcaction.com/MJC_GTFS_Public/morrobay_google_transit.zip,14
1,Plumas Transit Systems,https://data.trilliumtransit.com/gtfs/plumas-ca-us/plumas-ca-us.zip,14
2,TRACER,https://data.trilliumtransit.com/gtfs/tracy-ca-us/tracy-ca-us.zip,14


Unnamed: 0,service_name,uri,# of days with expired feed,gtfs_datasets,services,issue_type,description,airtable_ticket
0,Morro Bay Transit,https://mjcaction.com/MJC_GTFS_Public/morrobay_google_transit.zip,14,Morro Bay Schedule,Morro Bay Transit,Expired Schedule Feed,City of Morro Bay: Follow-up about GTFS Schedule,Yes
1,Plumas Transit Systems,https://data.trilliumtransit.com/gtfs/plumas-ca-us/plumas-ca-us.zip,14,Plumas Schedule,Plumas Transit Systems,Expired Schedule Feed,"Based on the catastrophic errors dashboard, Plumas has not produced data since September 2. Per Evan, ""Plumas has a feed_info.txt file that says the feed was only valid until September 1 while the calendar says the feed is valid all the way until December 1. A strict interpretation of the GTFS spec says that any service beyond the feed end date is merely advisory, but some trip planners may still chose to display the service up until the end date."" This is the second ticket opened for Plumas.",Yes
2,TRACER,https://data.trilliumtransit.com/gtfs/tracy-ca-us/tracy-ca-us.zip,14,Tracy Schedule,TRACER,Expired Schedule Feed,City of Tracy: GTFS Schedule Feed Expired,Yes


## Incompleteness (GTFS Realtime trip updates and vehicle positions)

In [41]:
def load_tu_or_vp(
    excel_file: str, column_to_filter: str, threshold: float = 41
) -> pd.DataFrame:
    """Load a completeness export and keep only the rows below a cut-off.

    Args:
        excel_file: Path of the Excel export to read.
        column_to_filter: Name (after ``to_snakecase``) of the numeric column
            to filter and sort on, e.g. ``"%_of_trips_with_tu_messages"``.
        threshold: Rows are kept when their ``column_to_filter`` value is
            strictly less than this. Defaults to 41, the cut-off this
            notebook has always used.

    Returns:
        The matching rows sorted ascending by ``column_to_filter`` with a
        fresh zero-based index.
    """
    completeness = to_snakecase(pd.read_excel(excel_file))
    below_cutoff = completeness[completeness[column_to_filter] < threshold]
    return below_cutoff.sort_values([column_to_filter]).reset_index(drop=True)

In [42]:
def api_511(excel_file: str) -> None:
    """Print and display the rows that directly follow a "true" row.

    The first column of the workbook is filtered to cells mentioning "Name"
    or "Monitored", stripped of the tag names and the ``<``, ``>``, ``/``
    characters, and then every row whose preceding row is exactly "true" is
    displayed. NOTE(review): this presumably relies on each operator's name
    row coming right after its Monitored row in the export; confirm against
    the source file.

    Args:
        excel_file: Path of the 511 Excel export to read.

    Returns:
        None. The result is printed/displayed, not returned.
    """
    # Read in 511; whatever the first column is called, refer to it as "new".
    raw = to_snakecase(pd.read_excel(excel_file))
    raw = raw.rename(columns={raw.columns[0]: "new"})

    # Only keep rows that have the string Name or Monitored.
    # na=False: blank cells count as non-matching instead of putting NaN in
    # the mask, which pandas refuses to index with.
    tagged = raw[raw["new"].str.contains("Name|Monitored", na=False)].reset_index(
        drop=True
    )

    # Get rid of the markup around each value. Order matters: "ShortName"
    # must go before "Name", otherwise a stray "Short" would be left behind.
    for fragment in (">", "ShortName", "Name", "Monitored", "<", "/"):
        tagged["new"] = tagged["new"].str.replace(fragment, "", regex=False)

    # True for a row when the row just above it is exactly "true".
    # fill_value=False keeps the mask boolean (no NaN for the first row).
    follows_true = tagged["new"].eq("true").shift(1, fill_value=False)

    print("These are Bay Area feeds that keep track of RT")
    display(tagged[follows_true])

In [43]:
# Show the rows of the 511 export that follow a Monitored value of "true".
api_511(excel_file="API511.xlsx")

These are Bay Area feeds that keep track of RT


Unnamed: 0,new
13,AC TRANSIT
22,Bay Area Rapid Transit
25,Caltrain
37,County Connection
40,Dumbarton Express Consortium
43,Emery Go-Round
46,FAST
52,Golden Gate Transit
55,Livermore Amador Valley Transit Authority
58,Marin Transit


In [55]:
def incomplete(tu_excel_file: str, vp_excel_file: str, airtable: pd.DataFrame):
    """Display feeds with low trip-update or vehicle-position coverage.

    Both exports are loaded through ``load_tu_or_vp`` and outer-joined on
    ``name``. A feed that appears in only one of them gets "OK" in the
    other percentage column. The combined table is then left-joined to
    ``airtable`` on ``name == gtfs_datasets``; feeds with no matching row get
    "NA" in the Airtable columns.

    Args:
        tu_excel_file: Path of the trip-updates completeness export.
        vp_excel_file: Path of the vehicle-positions completeness export.
        airtable: Frame with a ``gtfs_datasets`` column to match on.

    Returns:
        None. The final table is shown with ``display``.
    """
    trip_updates = load_tu_or_vp(tu_excel_file, "%_of_trips_with_tu_messages")
    vehicle_positions = load_tu_or_vp(vp_excel_file, "%_of_trips_with_vp_messages")

    # Outer join: a feed only has to be flagged in one of the two exports.
    low_coverage = (
        trip_updates.merge(vehicle_positions, on="name", how="outer")
        .sort_values(["name"])
        .reset_index(drop=True)
        .fillna("OK")
    )

    # Names are matched as-is (no stripping of a "Schedule" suffix).
    with_tickets = (
        low_coverage.merge(
            airtable, left_on="name", right_on="gtfs_datasets", how="left"
        )
        .sort_values("name")
        .fillna("NA")
    )
    display(with_tickets)

In [56]:
# airtable.loc[airtable.issue_type.str.contains('Completeness')]

In [57]:
# Compare the two 14-day completeness exports against the open Airtable issues.
incomplete(
    tu_excel_file="./gtfs_rt_trip_updates_completeness__last_14_days__2023-09-29T15_22_26.671152Z.xlsx",
    vp_excel_file="./gtfs_rt_vehicle_positions_completeness__last_14_days__2023-09-29T15_22_29.375218Z.xlsx",
    airtable=airtable,
)

  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")


Unnamed: 0,name,%_of_trips_with_tu_messages,%_of_trips_with_vp_messages,gtfs_datasets,services,issue_type,description,airtable_ticket
0,Anaheim Resort Schedule,20.80,17.65,Anaheim Resort Schedule,Anaheim Resort Transportation,Service Accuracy,Anaheim Transportation Network: GTFS-Schedule issue: thousands of duplicate trips on Toy Story Line,Yes
1,Bay Area 511 ACE Schedule,0.00,0.00,,,,,
2,Bay Area 511 Angel Island-Tiburon Ferry Schedule,0.00,0.00,,,,,
3,Bay Area 511 BART Schedule,OK,0.00,,,,,
4,Bay Area 511 Capitol Corridor Schedule,0.00,0.00,,,,,
5,Bay Area 511 Commute.org Schedule,0.00,0.00,,,,,
6,Bay Area 511 Golden Gate Ferry Schedule,0.00,0.00,,,,,
7,Bay Area 511 MVGO Schedule,0.00,0.00,,,,,
8,Bay Area 511 Mission Bay Schedule,0.00,0.00,,,,,
9,Bay Area 511 Rio Vista Delta Breeze Schedule,0.00,0.00,,,,,
