In [1]:
import json

import pandas as pd
import requests
from calitp_data_analysis.sql import to_snakecase

In [2]:
# Notebook-wide pandas display settings, all set through the same API.
# Rows and column width are unlimited, so displayed frames are never truncated.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)

## Catastrophic

In [8]:
def load_catastrophic_errors(excel_file: str) -> pd.DataFrame:
    """Load an expired-feeds Excel export and drop the LACMTA rows.

    Args:
        excel_file: Path to the Excel workbook to read.

    Returns:
        A frame with the service name, Caltrans district and date columns
        only, sorted by service name and then date, both descending. Rows
        whose dataset URI contains "LACMTA" are removed; rows with a missing
        URI are kept.
    """
    uri_col = "dim_gtfs_datasets_→_uri"
    service_col = "dim_provider_gtfs_data_→_service_name"
    district_col = "dim_county_geography_→_caltrans_district"

    df = to_snakecase(pd.read_excel(excel_file))

    # Don't look at LACMTA stuff.
    # na=False: a blank URI would otherwise put NaN in the mask and make the
    # `~` negation below fail; a blank URI is simply "not LACMTA".
    is_lacmta = df[uri_col].str.contains("LACMTA", regex=False, na=False)

    return df.loc[~is_lacmta, [service_col, district_col, "date"]].sort_values(
        [service_col, "date"], ascending=[False, False]
    )

In [9]:
# Expired-feed rows (LACMTA excluded) from the 2023-09-21 export.
cat_df = load_catastrophic_errors(
    excel_file="./feed_info_expired_feeds_2023-09-21T19_13_29.263825Z.xlsx"
)

  warn("Workbook contains no default style, apply openpyxl's default")


In [14]:
def load_airtable(csv_file: str) -> pd.DataFrame:
    """Read an Airtable CSV export of open issues into a small lookup frame.

    Missing cells are replaced with the string "None". Only the description,
    dataset and service columns are kept, ordered by dataset then service,
    and every row is tagged with ``airtable_ticket == "Yes"``.

    Args:
        csv_file: Path to the CSV file to read.

    Returns:
        The trimmed, sorted frame with a fresh 0..n-1 index.
    """
    wanted_columns = ["description", "gtfs_datasets", "services"]

    tickets = to_snakecase(pd.read_csv(csv_file)).fillna("None")
    tickets = tickets[wanted_columns].sort_values(["gtfs_datasets", "services"])
    tickets = tickets.reset_index(drop=True)

    # Marker column: after a left merge, rows without a ticket have no "Yes".
    return tickets.assign(airtable_ticket="Yes")

In [15]:
# Open Airtable issues, used below to check which problem feeds already have a ticket.
airtable = load_airtable(
    csv_file="./Transit Data Quality Issues-Open Issues by Distrcit (1).csv"
)

In [16]:
def summarize_cat(catastrophic_data: pd.DataFrame, airtable_data: pd.DataFrame) -> pd.DataFrame:
    """Count expired-feed days per service and attach any Airtable ticket.

    Both the per-service counts and the merged table are rendered with
    ``display``; the merged table is also returned so it can be reused.
    When calling this as the last line of a cell, assign the result or end
    the line with ``;`` to avoid rendering the merged table twice.

    Args:
        catastrophic_data: Frame with a service-name column and a ``date``
            column, one row per service per day.
        airtable_data: Frame with a ``services`` column to join on.

    Returns:
        The per-service counts left-merged with ``airtable_data``. Services
        without a matching ticket keep their row, with missing ticket columns.
    """
    service_col = "dim_provider_gtfs_data_→_service_name"

    cat_summary = (
        catastrophic_data.groupby([service_col])
        .agg({"date": "count"})
        .reset_index()
        .rename(columns={"date": "# of days with expired feed"})
    )
    display(cat_summary)

    # Left merge so services with no ticket still appear in the summary.
    merged = pd.merge(
        cat_summary,
        airtable_data,
        left_on=[service_col],
        right_on=["services"],
        how="left",
    )
    display(merged)

    return merged

In [17]:
# summarize_cat renders its own tables via display(); the trailing semicolon
# keeps the cell from echoing anything further.
summarize_cat(catastrophic_data=cat_df, airtable_data=airtable);

Unnamed: 0,dim_provider_gtfs_data_→_service_name,# of days with expired feed
0,Morro Bay Transit,14
1,Plumas Transit Systems,14
2,Sage Stage Intercity,2
3,TRACER,14


Unnamed: 0,dim_provider_gtfs_data_→_service_name,# of days with expired feed,description,gtfs_datasets,services,airtable_ticket
0,Morro Bay Transit,14,City of Morro Bay: Follow-up about GTFS Schedule,Morro Bay Schedule,Morro Bay Transit,Yes
1,Plumas Transit Systems,14,"Based on the catastrophic errors dashboard, Plumas has not produced data since September 2. Per Evan, ""Plumas has a feed_info.txt file that says the feed was only valid until September 1 while the calendar says the feed is valid all the way until December 1. A strict interpretation of the GTFS spec says that any service beyond the feed end date is merely advisory, but some trip planners may still chose to display the service up until the end date."" This is the second ticket opened for Plumas.",Plumas Schedule,Plumas Transit Systems,Yes
2,Sage Stage Intercity,2,,,,
3,TRACER,14,City of Tracy: GTFS Schedule Feed Expired,Tracy Schedule,TRACER,Yes


## Incompleteness

In [26]:
def load_tu(tu_excel_file: str, max_pct: float = 41) -> pd.DataFrame:
    """Load a trip-updates completeness export and keep the low-coverage rows.

    Args:
        tu_excel_file: Path to the Excel workbook to read.
        max_pct: Exclusive upper bound on ``%_of_trips_with_tu_messages``.
            Rows at or above it are dropped. The default of 41 matches the
            cutoff that was previously hard-coded here.

    Returns:
        Rows below the cutoff, sorted ascending by the percentage column,
        with a fresh 0..n-1 index.
    """
    pct_col = "%_of_trips_with_tu_messages"

    completeness = to_snakecase(pd.read_excel(tu_excel_file))
    # Rows with a missing percentage compare False and are dropped as well.
    below_cutoff = completeness[completeness[pct_col] < max_pct]
    return below_cutoff.sort_values([pct_col]).reset_index(drop=True)

In [23]:
def load_vp(vp_excel_file: str, max_pct: float = 41) -> pd.DataFrame:
    """Load a vehicle-positions completeness export and keep the low-coverage rows.

    Args:
        vp_excel_file: Path to the Excel workbook to read.
        max_pct: Exclusive upper bound on ``%_of_trips_with_vp_messages``.
            Rows at or above it are dropped. The default of 41 matches the
            cutoff that was previously hard-coded here.

    Returns:
        Rows below the cutoff, sorted ascending by the percentage column,
        with a fresh 0..n-1 index.
    """
    pct_col = "%_of_trips_with_vp_messages"

    completeness = to_snakecase(pd.read_excel(vp_excel_file))
    # Rows with a missing percentage compare False and are dropped as well.
    below_cutoff = completeness[completeness[pct_col] < max_pct]
    return below_cutoff.sort_values([pct_col]).reset_index(drop=True)

In [24]:
# Trip-updates feeds with low completeness over the last 14 days.
# Uses load_tu: no `load_incomplete_tu` is defined in this notebook, so the
# old name fails on a fresh kernel.
tu_df = load_tu("./gtfs_rt_trip_updates_completeness__last_14_days__2023-09-25T18_11_34.892466Z.xlsx")

  warn("Workbook contains no default style, apply openpyxl's default")


In [25]:
# Vehicle-positions feeds with low completeness over the last 14 days.
# Uses load_vp, which filters on the vehicle-positions percentage column;
# no `load_incomplete_tu` is defined in this notebook.
vp_df = load_vp("gtfs_rt_vehicle_positions_completeness__last_14_days__2023-09-25T18_11_37.967895Z.xlsx")

  warn("Workbook contains no default style, apply openpyxl's default")


In [44]:
def api_511(excel_file: str) -> pd.DataFrame:
    """List the entries that directly follow a ``Monitored`` value of "true".

    The first column of the workbook is treated as text. Only cells that
    mention "Name" or "Monitored" are kept, the tag characters and tag names
    are stripped, and each row whose preceding row cleaned down to exactly
    "true" is selected. The selection is printed with a heading, rendered
    with ``display`` and returned.

    NOTE(review): presumably each cell holds one line of the 511 operators
    XML response — confirm against the file.

    Args:
        excel_file: Path to the Excel workbook to read.

    Returns:
        A one-column frame (column ``new``) with the selected rows; the index
        is the row position within the filtered Name/Monitored rows.
    """
    # Read in 511
    df = to_snakecase(pd.read_excel(excel_file))
    df = df.rename(columns={df.columns[0]: "new"})

    # Only keep rows that have the string name or monitored.
    # na=False: blank cells would otherwise put NaN in the boolean mask.
    keep = df["new"].str.contains("Name|Monitored", na=False)
    df = df[keep].reset_index(drop=True)

    # Get rid of random characters. Order matters ("ShortName" before "Name"),
    # and regex=False keeps every token literal on any pandas version.
    cleaned = df["new"]
    for token in (">", "ShortName", "Name", "Monitored", "<", "/"):
        cleaned = cleaned.str.replace(token, "", regex=False)
    df["new"] = cleaned

    # Select the row right after each "true"; fill_value keeps the mask boolean.
    follows_true = (df["new"] == "true").shift(1, fill_value=False)
    monitored = df[follows_true]

    print('These are Bay Area feeds that keep track of RT')
    display(monitored)
    return monitored

In [45]:
# api_511 prints and displays its own result; the trailing semicolon keeps the
# cell from echoing anything further.
api_511(excel_file="API511.xlsx");

This is only Bay Area feeds that keep track of RT


Unnamed: 0,new
13,AC TRANSIT
22,Bay Area Rapid Transit
25,Caltrain
37,County Connection
40,Dumbarton Express Consortium
43,Emery Go-Round
46,FAST
52,Golden Gate Transit
55,Livermore Amador Valley Transit Authority
58,Marin Transit


In [63]:
def incomplete(tu_excel_file: str, vp_excel_file: str, airtable: pd.DataFrame):
    """Display feeds with low trip-update or vehicle-position coverage.

    The two low-coverage tables are outer-joined on ``name``. A feed that is
    present in only one of them gets "OK" for the other metric. The result is
    then left-joined to the Airtable tickets on the dataset name (matched
    as-is, with no suffix stripping); feeds without a ticket show "NA".

    Args:
        tu_excel_file: Path passed to ``load_tu``.
        vp_excel_file: Path passed to ``load_vp``.
        airtable: Ticket frame with a ``gtfs_datasets`` column.

    Returns:
        None. The combined table is rendered with ``display``.
    """
    low_tu = load_tu(tu_excel_file)
    low_vp = load_vp(vp_excel_file)

    flagged = pd.merge(low_tu, low_vp, on="name", how="outer")
    flagged = flagged.sort_values(["name"])
    flagged = flagged.reset_index(drop=True)
    # A gap here means the feed was not below the cutoff for that metric.
    flagged = flagged.fillna("OK")

    with_tickets = pd.merge(
        flagged,
        airtable,
        left_on="name",
        right_on="gtfs_datasets",
        how="left",
    )
    with_tickets = with_tickets.sort_values("name").fillna("NA")
    display(with_tickets)
    

In [64]:
# Combine both completeness exports with the open Airtable tickets and show the table.
incomplete(
    tu_excel_file="./gtfs_rt_trip_updates_completeness__last_14_days__2023-09-25T18_11_34.892466Z.xlsx",
    vp_excel_file="gtfs_rt_vehicle_positions_completeness__last_14_days__2023-09-25T18_11_37.967895Z.xlsx",
    airtable=airtable,
)

  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")


Unnamed: 0,name,%_of_trips_with_tu_messages,%_of_trips_with_vp_messages,description,gtfs_datasets,services,airtable_ticket
0,Anaheim Resort Schedule,21.11,17.73,Anaheim Transportation Network: GTFS-Schedule issue: thousands of duplicate trips on Toy Story Line,Anaheim Resort Schedule,Anaheim Resort Transportation,Yes
1,Bay Area 511 ACE Schedule,0.00,0.00,,,,
2,Bay Area 511 Angel Island-Tiburon Ferry Schedule,0.00,0.00,,,,
3,Bay Area 511 BART Schedule,39.31,0.00,,,,
4,Bay Area 511 Capitol Corridor Schedule,0.00,0.00,,,,
5,Bay Area 511 Commute.org Schedule,0.00,0.00,,,,
6,Bay Area 511 Golden Gate Ferry Schedule,0.00,0.00,,,,
7,Bay Area 511 MVGO Schedule,0.00,0.00,,,,
8,Bay Area 511 Mission Bay Schedule,0.00,0.00,,,,
9,Bay Area 511 Rio Vista Delta Breeze Schedule,0.00,0.00,,,,
