## Charts Exploratory

In [20]:
import altair as alt
import calitp_data_analysis.magics
import geopandas as gpd
import google.auth
import pandas as pd
from IPython.display import HTML, Image, Markdown, display, display_html
from omegaconf import OmegaConf
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS
from shared_utils import portfolio_utils, rt_dates, publish_utils
import merge_data

# Load the `readable2.yml` config that sits next to this notebook.
# NOTE(review): `readable_dict` is not referenced in the visible cells —
# presumably the imported report modules rely on the same file; confirm.
readable_dict = OmegaConf.load("readable2.yml")
# Resolve Google application-default credentials and the active project.
# NOTE(review): neither name is used in the visible cells; presumably needed
# for the GCS-backed parquet reads further down — confirm.
credentials, project = google.auth.default()

import _report_operator_visuals
import _report_route_dir_visuals
import _scheduled_service_hour_visuals

In [2]:
# Notebook-wide pandas display settings, all set through the same API:
# show up to 100 columns, render floats with two decimals, and never
# truncate rows or cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)


def formatted(number):
    """Return `number` rendered as a string with comma thousands separators."""
    return f"{number:,}"

In [3]:
# Comment out and leave this cell right below pandas
# NOTE(review): presumably this is the parameter cell whose value is supplied
# externally when the notebook is run once per portfolio — confirm. Every
# filtered read below depends on `portfolio_name`.
# portfolio_name = "Curry Public Transit"
portfolio_name =  "Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)"

In [4]:
# Catalog entry for the monthly route-direction report table; it is joined
# with RT_SCHED_GCS and a ".parquet" suffix when the table is read.
ROUTE_DIR_MONTH_FILE = GTFS_DATA_DICT.digest_tables.monthly_route_schedule_vp_report

In [5]:
# Read the monthly report table, keeping only rows for the selected portfolio.
# The filter is passed to the parquet reader, and this table uses the
# human-readable column name "Portfolio Organization Name".
route_dir_month_df = pd.read_parquet(
    f"{RT_SCHED_GCS}{ROUTE_DIR_MONTH_FILE}.parquet",
    filters=[[("Portfolio Organization Name", "==", portfolio_name)]],
)

In [6]:
# Same monthly data before the report-style renaming: this table is filtered
# on the snake_case column `portfolio_organization_name`.
OG_ROUTE_DIR_MONTH_FILE = GTFS_DATA_DICT.digest_tables.monthly_route_schedule_vp
og_month_route_dir_df = pd.read_parquet(
    f"{RT_SCHED_GCS}{OG_ROUTE_DIR_MONTH_FILE}.parquet",
    filters=[[("portfolio_organization_name", "==", portfolio_name)]],
)

In [7]:
# Quarterly route-direction report, filtered to the selected portfolio.
# NOTE(review): this path is hard-coded, whereas the monthly tables above are
# looked up through GTFS_DATA_DICT — confirm whether a catalog entry exists
# for the quarterly report and use it for consistency.
ROUTE_DIR_QTR_FILE = 'digest/quarterly_schedule_vp_metrics_report'
route_dir_qtr_df = pd.read_parquet(
    f"{RT_SCHED_GCS}{ROUTE_DIR_QTR_FILE}.parquet",
    filters=[[("Portfolio Organization Name", "==", portfolio_name)]],
)


### Some route names are repeated (e.g. "Route 1 Route 1", "Route 16 Route 16") — find out why
* When we combine `route_short_name` and `route_long_name`, the two columns sometimes hold the same value, so the combined name repeats it.

In [8]:
import yaml

# Parse the shared portfolio-organization YAML with the safe loader; the
# result is passed to the name-standardization helper further down.
with open(
    "../_shared_utils/shared_utils/portfolio_organization_name.yml", mode="r"
) as f:
    PORTFOLIO_ORGANIZATIONS_DICT = yaml.safe_load(f)

In [9]:
# Catalog entry for the cleaned route-identification table.
CLEANED_ROUTE_NAMING = GTFS_DATA_DICT.schedule_tables.route_identification

# Read the table and run it through the portfolio-name standardization helper
# (a direct call, equivalent to piping the frame into the helper).
clean_route_names = portfolio_utils.standardize_portfolio_organization_names(
    pd.read_parquet(f"{SCHED_GCS}{CLEANED_ROUTE_NAMING}.parquet"),
    PORTFOLIO_ORGANIZATIONS_DICT,
)

In [10]:
clean_route_names.columns

Index(['schedule_gtfs_dataset_key', 'name', 'route_id', 'route_long_name',
       'route_short_name', 'route_desc', 'service_date', 'combined_name',
       'route_id2', 'recent_combined_name', 'recent_route_id2',
       'portfolio_organization_name'],
      dtype='object')

In [11]:
# Narrow the route-name table to the portfolio under review.
clean_route_names = clean_route_names.loc[
    clean_route_names["portfolio_organization_name"] == portfolio_name
]

In [12]:
# Compare the combined name with its two source columns. When the short and
# long names are identical, the combined name repeats the same text on both
# sides of the "__" separator (e.g. "Route 11__Route 11").
clean_route_names[["recent_combined_name", "route_long_name","route_short_name"]].drop_duplicates()

Unnamed: 0,recent_combined_name,route_long_name,route_short_name
0,Route 11__Route 11,Route 11,Route 11
1,Route 18__Route 18,Route 18,Route 18
2,Route 21__Route 21,Route 21,Route 21
3,Route 7__Route 7,Route 7,Route 7
4,TA__Trolley A,Trolley A,TA
5,Fixed Route__Fixed Route Service,Fixed Route Service,Fixed Route
6,Fillmore__Fillmore Loop,Fillmore Loop,Fillmore
7,Piru__Piru,Piru,Piru
8,Route 16__Route 16,Route 16,Route 16
9,90__90-C Street/CSUCI,90-C Street/CSUCI,90


### Many duplicate rows are showing up — find the cause and fix it

In [13]:
# Isolate one route on one service date from the monthly report so the
# duplicated rows can be inspected directly.
one_test_route = route_dir_month_df.loc[
    (route_dir_month_df["Date"] == "2025-05-14T00:00:00.000000000")
    & (route_dir_month_df["Route"] == "89 80-89 Coastal Express")
]

In [14]:
# Row count for the single route/date slice, duplicates included.
len(one_test_route)

14

In [15]:
# Row count after dropping exact duplicates; the gap between this and the
# raw count above is the number of fully repeated rows.
len(one_test_route.drop_duplicates())

2

In [16]:
one_test_route.drop_duplicates()

Unnamed: 0,Direction (0/1),Period,Average Scheduled Service (trip minutes),# Scheduled Trips,# Realtime Trips,Route,Direction,# Minutes with 1+ VP per Minute,# Minutes with 2+ VP per Minute,# Early Arrival Trips,# On-Time Trips,# Late Trips,Average VP per Minute,% VP within Scheduled Shape,% Scheduled Trip w/ 1+ VP/Minute,% Scheduled Trip w/ 2+ VP/Minute,Realtime versus Scheduled Service Ratio,Speed (MPH),Portfolio Organization Name,Headway (Minutes),GTFS Availability,Average Stop Distance (Miles),Date
31948,1,All Day,150.0,3,3,89 80-89 Coastal Express,Eastbound,622,621,0,0,3,2.0,0.0,100.0,100.0,2.0,,"Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)",500.0,schedule_and_vp,6.0,2025-05-14
32119,1,Peak,150.0,3,3,89 80-89 Coastal Express,Eastbound,622,621,0,0,3,2.0,0.0,100.0,100.0,2.0,,"Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)",158.0,schedule_and_vp,6.0,2025-05-14


## Route filter chart (currently disabled)

In [17]:
# display(_report_route_dir_visuals.route_filter(route_dir_qtr_df,route_dir_month_df))

## Tracing where the duplicates come from

In [18]:
# Service dates passed to every `merge_data.concatenate_*` call below.
date_list = rt_dates.y2025_dates

In [22]:
# Schedule-only metrics by route-direction across `date_list`: the first
# upstream input checked for duplicated rows.
schedule_route_dir_df = merge_data.concatenate_schedule_by_route_direction(date_list)

In [23]:
# List the available columns so the preview helper can filter on
# `service_date` / `route_id`.
schedule_route_dir_df.columns

Index(['schedule_gtfs_dataset_key', 'route_id', 'direction_id', 'time_period',
       'avg_scheduled_service_minutes', 'avg_stop_miles', 'n_scheduled_trips',
       'frequency', 'service_date', 'is_express', 'is_ferry', 'is_rail',
       'is_coverage', 'is_local', 'is_downtown_local', 'is_rapid', 'typology',
       'name', 'combined_name', 'recent_combined_name', 'recent_route_id',
       'route_primary_direction'],
      dtype='object')

In [33]:
def preview_one_route_one_date(
    df: pd.DataFrame,
    service_date: str = "2025-05-14T00:00:00.000000000",
    route_name: str = "87__80-89 Coastal Express",
    route_id: str = "4148",
) -> pd.DataFrame:
    """Return the rows of `df` for a single route on a single service date.

    Two column layouts are supported and chosen by inspecting `df.columns`
    rather than by catching whatever exception a failed lookup raises:

    * report layout: filters on `Date` and `Route` (uses `route_name`);
    * upstream layout: filters on `service_date` and `route_id`.

    Parameters
    ----------
    df : pd.DataFrame
        Frame in one of the two layouts above.
    service_date : str
        Date value compared against the date column. Defaults to the date
        that was previously hard-coded.
    route_name : str
        Value matched against `Route` in the report layout.
    route_id : str
        Value matched against `route_id` in the upstream layout.

    Returns
    -------
    pd.DataFrame
        The matching rows (possibly empty).

    Raises
    ------
    AttributeError
        If `df` has neither pair of columns. The original surfaced the same
        exception type through attribute access; here the message says which
        columns are expected.
    """
    # NOTE(review): the default `route_name` starts with "87__", while the
    # default `route_id` "4148" appears in the recorded output as
    # "89__80-89 Coastal Express" — the two branches may target different
    # routes. Defaults are kept as they were; confirm which was intended.
    available = set(df.columns)

    if {"Date", "Route"} <= available:
        return df.loc[(df["Date"] == service_date) & (df["Route"] == route_name)]

    if {"service_date", "route_id"} <= available:
        return df.loc[
            (df["service_date"] == service_date) & (df["route_id"] == route_id)
        ]

    raise AttributeError(
        "Expected columns ('Date', 'Route') or ('service_date', 'route_id'); "
        f"got {sorted(map(str, available))}"
    )

In [34]:
preview_one_route_one_date(schedule_route_dir_df)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,frequency,service_date,is_express,is_ferry,is_rail,is_coverage,is_local,is_downtown_local,is_rapid,typology,name,combined_name,recent_combined_name,recent_route_id,route_primary_direction
8026,1770249a5a2e770ca90628434d4934b1,4148,1.0,all_day,149.67,5.57,3,0.12,2025-05-14,1.0,0.0,0.0,0.0,0.0,0.0,0.0,express,VCTC GMV Schedule,89__80-89 Coastal Express,89 80-89 Coastal Express,4148,Eastbound
8031,1770249a5a2e770ca90628434d4934b1,4148,1.0,peak,149.67,5.57,3,0.38,2025-05-14,1.0,0.0,0.0,0.0,0.0,0.0,0.0,express,VCTC GMV Schedule,89__80-89 Coastal Express,89 80-89 Coastal Express,4148,Eastbound


### Lots of repeated values in speed

In [28]:
# NOTE(review): the variable is named `speeds_df`, but the loader called here
# is the RT-vs-schedule one and the column listing below contains no speed
# field — confirm whether a speed-specific loader was intended. The same
# call is repeated later for `df_rt_sched`.
speeds_df = merge_data.concatenate_rt_vs_schedule_by_route_direction(date_list)

In [30]:
speeds_df.columns

Index(['schedule_gtfs_dataset_key', 'route_id', 'direction_id', 'time_period',
       'minutes_atleast1_vp', 'minutes_atleast2_vp',
       'total_rt_service_minutes', 'total_scheduled_service_minutes',
       'total_vp', 'vp_in_shape', 'is_early', 'is_ontime', 'is_late',
       'n_vp_trips', 'vp_per_minute', 'pct_in_shape',
       'pct_rt_journey_atleast1_vp', 'pct_rt_journey_atleast2_vp',
       'pct_sched_journey_atleast1_vp', 'pct_sched_journey_atleast2_vp',
       'rt_sched_journey_ratio', 'avg_rt_service_minutes', 'service_date'],
      dtype='object')

In [36]:
# Shape of the single route/date slice, duplicates included.
preview_one_route_one_date(speeds_df).shape

(14, 23)

In [37]:
# Unique rows in the same slice; a smaller number than the row count above
# means this input already carries exact duplicate rows.
len(preview_one_route_one_date(speeds_df).drop_duplicates())

2

In [38]:
# Build the organization crosswalk. This assignment must stay live: the next
# cell reads `crosswalk_df.columns`, which raises NameError on a fresh kernel
# (Restart & Run All) if this line is commented out.
crosswalk_df = merge_data.concatenate_crosswalk_organization(date_list)

In [40]:
crosswalk_df.columns

Index(['schedule_gtfs_dataset_key', 'name', 'schedule_source_record_id',
       'base64_url', 'caltrans_district', 'service_date',
       'portfolio_organization_name'],
      dtype='object')

In [41]:
# crosswalk has no route info
# preview_one_route_one_date(crosswalk_df)

In [42]:
# Applied to df
# standardized_name = merge_data.merge_in_standardized_route_names()
# but can grab the df w/in the func

### More duplicates in the RT vs. schedule data

In [44]:
# RT-vs-schedule metrics by route-direction for every date in `date_list`.
# This repeats the call already used to build `speeds_df` earlier on.
df_rt_sched = merge_data.concatenate_rt_vs_schedule_by_route_direction(date_list)

In [46]:
# Show the single route/date slice; identical rows repeat per time period.
preview_one_route_one_date(df_rt_sched)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_scheduled_service_minutes,total_vp,vp_in_shape,is_early,is_ontime,is_late,n_vp_trips,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_ratio,avg_rt_service_minutes,service_date
12631,1770249a5a2e770ca90628434d4934b1,4148,1.0,all_day,622,621,974.28,449.0,1858,0,0,0,3,3,1.91,0.0,0.64,0.64,1.0,1.0,2.17,324.76,2025-05-14
12632,1770249a5a2e770ca90628434d4934b1,4148,1.0,all_day,622,621,974.28,449.0,1858,0,0,0,3,3,1.91,0.0,0.64,0.64,1.0,1.0,2.17,324.76,2025-05-14
12633,1770249a5a2e770ca90628434d4934b1,4148,1.0,all_day,622,621,974.28,449.0,1858,0,0,0,3,3,1.91,0.0,0.64,0.64,1.0,1.0,2.17,324.76,2025-05-14
12634,1770249a5a2e770ca90628434d4934b1,4148,1.0,all_day,622,621,974.28,449.0,1858,0,0,0,3,3,1.91,0.0,0.64,0.64,1.0,1.0,2.17,324.76,2025-05-14
12635,1770249a5a2e770ca90628434d4934b1,4148,1.0,all_day,622,621,974.28,449.0,1858,0,0,0,3,3,1.91,0.0,0.64,0.64,1.0,1.0,2.17,324.76,2025-05-14
12636,1770249a5a2e770ca90628434d4934b1,4148,1.0,all_day,622,621,974.28,449.0,1858,0,0,0,3,3,1.91,0.0,0.64,0.64,1.0,1.0,2.17,324.76,2025-05-14
12637,1770249a5a2e770ca90628434d4934b1,4148,1.0,all_day,622,621,974.28,449.0,1858,0,0,0,3,3,1.91,0.0,0.64,0.64,1.0,1.0,2.17,324.76,2025-05-14
12666,1770249a5a2e770ca90628434d4934b1,4148,1.0,peak,622,621,974.28,449.0,1858,0,0,0,3,3,1.91,0.0,0.64,0.64,1.0,1.0,2.17,324.76,2025-05-14
12667,1770249a5a2e770ca90628434d4934b1,4148,1.0,peak,622,621,974.28,449.0,1858,0,0,0,3,3,1.91,0.0,0.64,0.64,1.0,1.0,2.17,324.76,2025-05-14
12668,1770249a5a2e770ca90628434d4934b1,4148,1.0,peak,622,621,974.28,449.0,1858,0,0,0,3,3,1.91,0.0,0.64,0.64,1.0,1.0,2.17,324.76,2025-05-14


In [47]:
# Number of distinct rows left in that slice once exact duplicates are dropped.
len(preview_one_route_one_date(df_rt_sched).drop_duplicates())

2