## Where is January 2025 data?

In [1]:
import _section1_utils
import _section2_utils
import geopandas as gpd
import merge_data
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling
from shared_utils import rt_dates
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
analysis_date_list = rt_dates.y2025_dates

In [4]:
analysis_date_list_all = (
    rt_dates.y2024_dates + rt_dates.y2023_dates + rt_dates.y2025_dates
)

In [5]:
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

In [6]:
og_df = pd.read_parquet(schd_vp_url)

In [7]:
og_df.columns

Index(['schedule_gtfs_dataset_key', 'direction_id', 'time_period',
       'avg_scheduled_service_minutes', 'avg_stop_miles', 'n_scheduled_trips',
       'frequency', 'is_express', 'is_rapid', 'is_rail', 'is_coverage',
       'is_downtown_local', 'is_local', 'service_date', 'typology',
       'minutes_atleast1_vp', 'minutes_atleast2_vp',
       'total_rt_service_minutes', 'total_scheduled_service_minutes',
       'total_vp', 'vp_in_shape', 'is_early', 'is_ontime', 'is_late',
       'n_vp_trips', 'vp_per_minute', 'pct_in_shape',
       'pct_rt_journey_atleast1_vp', 'pct_rt_journey_atleast2_vp',
       'pct_sched_journey_atleast1_vp', 'pct_sched_journey_atleast2_vp',
       'rt_sched_journey_ratio', 'avg_rt_service_minutes', 'sched_rt_category',
       'speed_mph', 'route_long_name', 'route_short_name',
       'route_combined_name', 'route_id', 'base64_url',
       'organization_source_record_id', 'organization_name',
       'caltrans_district', 'route_primary_direction',
       'schedule

In [8]:
all_ops_jan_only = og_df.loc[og_df.service_date == "2025-01-15T00:00:00.000000000"]

In [9]:
all_ops_jan_only.service_date.unique()

array(['2025-01-15T00:00:00.000000000'], dtype='datetime64[ns]')

In [10]:
all_ops_jan_only.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15312 entries, 11 to 328577
Data columns (total 46 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   schedule_gtfs_dataset_key        15312 non-null  object        
 1   direction_id                     15312 non-null  float64       
 2   time_period                      15312 non-null  object        
 3   avg_scheduled_service_minutes    14376 non-null  float64       
 4   avg_stop_miles                   14233 non-null  float64       
 5   n_scheduled_trips                15312 non-null  int64         
 6   frequency                        14376 non-null  float64       
 7   is_express                       13608 non-null  float64       
 8   is_rapid                         13608 non-null  float64       
 9   is_rail                          13608 non-null  float64       
 10  is_coverage                      13608 non-null  float64

In [11]:
all_ops_jan_only.time_period.value_counts()

all_day    5430
peak       5096
offpeak    4786
Name: time_period, dtype: int64

In [12]:
# Spot-check the timeliness counts for a handful of routes.
# random_state pins the draw so the same rows are shown on every Restart & Run All.
all_ops_jan_only[
    [
        "organization_name",
        "route_id",
        "time_period",
        "is_early",
        "is_ontime",
        "is_late",
    ]
].sample(10, random_state=0)

Unnamed: 0,organization_name,route_id,time_period,is_early,is_ontime,is_late
283948,Long Beach Transit,94,all_day,0,0,2
148863,City and County of San Francisco,J,offpeak,1,17,28
187583,"University of California, Berkeley",cent,peak,0,0,0
156776,Antelope Valley Transit Authority,9,peak,0,2,2
296927,Riverside Transit Agency,1,offpeak,1,21,9
7291,Los Angeles County Metropolitan Transportation Authority,268,peak,0,0,13
242153,City of Los Angeles,1524,peak,0,0,1
107987,San Mateo County Transit District,14,offpeak,0,1,0
288248,City of San Luis Obispo,11789,peak,0,8,0
280873,City of Santa Monica,3806,offpeak,0,0,0


### In the original dataframe (read directly from the parquet, not loaded through `_section2_utils.load_schedule_vp_metrics()`), there are peak/offpeak values in `time_period`
* There should only be 6 rows for each route (ideally)
* Dir 1: all day, peak, offpeak
* Dir 0: all day, peak, offpeak.

In [13]:
ac_transit_og_one_route = all_ops_jan_only.loc[
    (all_ops_jan_only.organization_name == "Alameda-Contra Costa Transit District")
    & (all_ops_jan_only.route_id == "200")
]

In [14]:
ac_transit_og_one_route.shape

(6, 46)

In [15]:
ac_transit_og_one_route.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,direction_id,time_period,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,frequency,is_express,is_rapid,is_rail,is_coverage,is_downtown_local,is_local,service_date,typology,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_scheduled_service_minutes,total_vp,vp_in_shape,is_early,is_ontime,is_late,n_vp_trips,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_ratio,avg_rt_service_minutes,sched_rt_category,speed_mph,route_long_name,route_short_name,route_combined_name,route_id,base64_url,organization_source_record_id,organization_name,caltrans_district,route_primary_direction,schedule_source_record_id,name
225323,c499f905e33929a641f083dad55c521e,0.0,all_day,51.62,3.02,45,1.88,0.0,0.0,0.0,1.0,1.0,0.0,2025-01-15,downtown_local,3122,3080,5886.62,2323.0,8863,8821,0,1,44,45,1.51,0.99,0.53,0.52,1.0,1.0,2.53,130.81,schedule_and_vp,17.17,Decoto - Newark Blvd. - Mowry,200,200 Decoto - Newark Blvd. - Mowry,200,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1BQw==,recOZgevYf7Jimm9L,Alameda-Contra Costa Transit District,04 - Oakland,Southbound,recJjD8JT53sK302o,Bay Area 511 AC Transit Schedule
225345,c499f905e33929a641f083dad55c521e,0.0,offpeak,51.62,3.02,24,1.0,0.0,0.0,0.0,1.0,1.0,0.0,2025-01-15,downtown_local,1608,1587,4395.64,1206.0,4567,4551,0,1,23,24,1.04,1.0,0.37,0.36,1.0,1.0,3.64,183.15,schedule_and_vp,17.98,Decoto - Newark Blvd. - Mowry,200,200 Decoto - Newark Blvd. - Mowry,200,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1BQw==,recOZgevYf7Jimm9L,Alameda-Contra Costa Transit District,04 - Oakland,Southbound,recJjD8JT53sK302o,Bay Area 511 AC Transit Schedule


In [16]:
ac_transit_og_one_route.time_period.value_counts()

all_day    2
offpeak    2
peak       2
Name: time_period, dtype: int64

In [17]:
ac_transit_og_one_route.sched_rt_category.unique()

['schedule_and_vp']
Categories (3, object): ['schedule_only', 'vp_only', 'schedule_and_vp']

In [18]:
ac_transit_og_one_route.columns

Index(['schedule_gtfs_dataset_key', 'direction_id', 'time_period',
       'avg_scheduled_service_minutes', 'avg_stop_miles', 'n_scheduled_trips',
       'frequency', 'is_express', 'is_rapid', 'is_rail', 'is_coverage',
       'is_downtown_local', 'is_local', 'service_date', 'typology',
       'minutes_atleast1_vp', 'minutes_atleast2_vp',
       'total_rt_service_minutes', 'total_scheduled_service_minutes',
       'total_vp', 'vp_in_shape', 'is_early', 'is_ontime', 'is_late',
       'n_vp_trips', 'vp_per_minute', 'pct_in_shape',
       'pct_rt_journey_atleast1_vp', 'pct_rt_journey_atleast2_vp',
       'pct_sched_journey_atleast1_vp', 'pct_sched_journey_atleast2_vp',
       'rt_sched_journey_ratio', 'avg_rt_service_minutes', 'sched_rt_category',
       'speed_mph', 'route_long_name', 'route_short_name',
       'route_combined_name', 'route_id', 'base64_url',
       'organization_source_record_id', 'organization_name',
       'caltrans_district', 'route_primary_direction',
       'schedule

In [19]:
all_ops_jan_only.loc[
    (all_ops_jan_only.organization_name == "City and County of San Francisco")].route_id.unique()

array(['1', '12', '14', '14R', '18', '19', '1X', '2', '21', '22', '23',
       '24', '27', '28', '28R', '29', '30', '31', '33', '35', '36', '37',
       '38', '38R', '39', '43', '44', '45', '48', '49', '5', '52', '54',
       '55', '56', '57', '58', '5R', '6', '66', '67', '7', '714', '8',
       '8AX', '8BX', '9', '90', '9R', 'CA', 'F', 'FBUS', 'J', 'K', 'KBUS',
       'KLM', 'L', 'LBUS', 'LOWL', 'M', 'N', 'NBUS', 'NOWL', 'PH', 'PM',
       '15', '25', '91', 'T', 'TBUS'], dtype=object)

In [20]:
sf_og_one_route = all_ops_jan_only.loc[
    (all_ops_jan_only.organization_name == "City and County of San Francisco")
    & (all_ops_jan_only.route_id == "22")
]

In [21]:
sf_og_one_route.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,direction_id,time_period,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,frequency,is_express,is_rapid,is_rail,is_coverage,is_downtown_local,is_local,service_date,typology,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_scheduled_service_minutes,total_vp,vp_in_shape,is_early,is_ontime,is_late,n_vp_trips,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_ratio,avg_rt_service_minutes,sched_rt_category,speed_mph,route_long_name,route_short_name,route_combined_name,route_id,base64_url,organization_source_record_id,organization_name,caltrans_district,route_primary_direction,schedule_source_record_id,name
143933,7cc0cb1871dfd558f11a2885c145d144,0.0,all_day,46.87,2.35,186,7.75,0.0,0.0,0.0,0.0,1.0,0.0,2025-01-15,downtown_local,9864,9719,13857.67,8658.0,29100,28584,3,80,99,182,2.1,0.98,0.71,0.7,1.0,1.0,1.6,76.14,schedule_and_vp,5.76,FILLMORE,22,22 FILLMORE,22,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1TRg==,rechaapWbeffO33OX,City and County of San Francisco,04 - Oakland,Southbound,recHD22phgJs34JHP,Bay Area 511 Muni Schedule


In [22]:
sf_og_one_route[["sched_rt_category","time_period"]]

Unnamed: 0,sched_rt_category,time_period
143933,schedule_and_vp,all_day
143953,schedule_and_vp,offpeak
143973,schedule_and_vp,peak
143993,schedule_and_vp,all_day
144013,schedule_and_vp,offpeak
144033,schedule_and_vp,peak


### Something has gone wrong with the merges that I fixed a while back.

In [23]:
ac_transit_og_one_route[["sched_rt_category", "time_period"]]

Unnamed: 0,sched_rt_category,time_period
225323,schedule_and_vp,all_day
225345,schedule_and_vp,offpeak
225367,schedule_and_vp,peak
225389,schedule_and_vp,all_day
225411,schedule_and_vp,offpeak
225433,schedule_and_vp,peak


In [24]:
ac_transitonly = _section2_utils.load_schedule_vp_metrics(
    "Alameda-Contra Costa Transit District"
)

In [25]:
ac_jan_only = ac_transitonly.loc[ac_transitonly.Date == "2025-01-15T00:00:00.000000000"]

In [26]:
ac_jan_only.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,dir_0_1,Period,Average Scheduled Service (trip minutes),Average Stop Distance (miles),# scheduled trips,Trips per Hour,is_express,is_rapid,is_rail,is_coverage,is_downtown_local,is_local,Date,Route typology,# Minutes with 1+ VP per Minute,# Minutes with 2+ VP per Minute,Aggregate Actual Service Minutes,Aggregate Scheduled Service Minutes (all trips),# VP,# VP within Scheduled Shape,# Early Arrival Trips,# On-Time Trips,# Late Trips,# Trips with VP,Average VP per Minute,% VP within Scheduled Shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,% Scheduled Trip w/ 1+ VP/Minute,% Scheduled Trip w/ 2+ VP/Minute,Realtime versus Scheduled Service Ratio,Average Actual Service (Trip Minutes),GTFS Availability,Speed (MPH),route_long_name,route_short_name,Route,Route ID,Base64 Encoded Feed URL,Organization ID,Organization,District,Direction,schedule_source_record_id,Transit Operator,ruler_100_pct,ruler_for_vp_per_min,headway_in_minutes
511,c499f905e33929a641f083dad55c521e,0.0,all_day,31.4,3.5,63,2.62,0.0,0.0,0.0,0.0,1.0,0.0,2025-01-15,downtown_local,3130,3084,5913.36,1978.0,8870,8870,0,1,62,63,1.5,100.0,53.0,52.0,100.0,100.0,2.99,93.86,schedule_and_vp,13.02,E. 14th St. - Mission,10,10 E. 14th St. - Mission,10,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1BQw==,recOZgevYf7Jimm9L,Alameda-Contra Costa Transit District,04 - Oakland,Eastbound,recJjD8JT53sK302o,Bay Area 511 AC Transit Schedule,100,2,22.9
533,c499f905e33929a641f083dad55c521e,0.0,offpeak,31.4,3.5,36,1.5,0.0,0.0,0.0,0.0,1.0,0.0,2025-01-15,downtown_local,1752,1727,4546.98,1091.0,4972,4972,0,1,35,36,1.09,100.0,38.0,38.0,100.0,100.0,4.17,126.3,schedule_and_vp,13.7,E. 14th St. - Mission,10,10 E. 14th St. - Mission,10,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1BQw==,recOZgevYf7Jimm9L,Alameda-Contra Costa Transit District,04 - Oakland,Eastbound,recJjD8JT53sK302o,Bay Area 511 AC Transit Schedule,100,2,40.0


In [27]:
ac_jan_only.Period.value_counts()

all_day    199
peak       167
offpeak    146
Name: Period, dtype: int64

### Timeliness only has `all_day` values.

In [34]:
timeliness_test = _section2_utils.timeliness_trips(ac_jan_only)

In [35]:
timeliness_test.head(2)

Unnamed: 0,Date,Organization,Route,Period,Direction,dir_0_1,variable,value
0,2025-01-15,Alameda-Contra Costa Transit District,10 E. 14th St. - Mission,offpeak,Eastbound,0.0,# Early Arrival Trips,0
1,2025-01-15,Alameda-Contra Costa Transit District,10 E. 14th St. - Mission,peak,Eastbound,0.0,# Early Arrival Trips,0


In [36]:
timeliness_test.Date.unique()

array(['2025-01-15T00:00:00.000000000'], dtype='datetime64[ns]')

In [37]:
timeliness_to_keep = [
    "Date",
    "Organization",
    "Direction",
    "Period",
    "Route",
    "# Early Arrival Trips",
    "# On-Time Trips",
    "# Late Trips",
    "dir_0_1",
]

In [39]:
ac_jan_only[timeliness_to_keep].head(10)

Unnamed: 0,Date,Organization,Direction,Period,Route,# Early Arrival Trips,# On-Time Trips,# Late Trips,dir_0_1
511,2025-01-15,Alameda-Contra Costa Transit District,Eastbound,all_day,10 E. 14th St. - Mission,0,1,62,0.0
533,2025-01-15,Alameda-Contra Costa Transit District,Eastbound,offpeak,10 E. 14th St. - Mission,0,1,35,0.0
555,2025-01-15,Alameda-Contra Costa Transit District,Eastbound,peak,10 E. 14th St. - Mission,0,0,27,0.0
577,2025-01-15,Alameda-Contra Costa Transit District,Westbound,all_day,10 E. 14th St. - Mission,0,5,61,1.0
599,2025-01-15,Alameda-Contra Costa Transit District,Westbound,offpeak,10 E. 14th St. - Mission,0,4,35,1.0
621,2025-01-15,Alameda-Contra Costa Transit District,Westbound,peak,10 E. 14th St. - Mission,0,1,26,1.0
643,2025-01-15,Alameda-Contra Costa Transit District,Southbound,all_day,12 MLK Jr. - Temescal - Grand,3,3,39,0.0
665,2025-01-15,Alameda-Contra Costa Transit District,Southbound,offpeak,12 MLK Jr. - Temescal - Grand,1,2,22,0.0
687,2025-01-15,Alameda-Contra Costa Transit District,Southbound,peak,12 MLK Jr. - Temescal - Grand,2,1,17,0.0
709,2025-01-15,Alameda-Contra Costa Transit District,Northbound,all_day,12 MLK Jr. - Temescal - Grand,4,17,24,1.0


In [41]:
ac_jan_only.Period.value_counts()

all_day    199
peak       167
offpeak    146
Name: Period, dtype: int64

### Timeliness metrics are located in `rt_segment_speeds/segment_speed_utils/metrics.py`

### None of the "route identifiers" are showing up for January 2025

In [42]:
# NOTE(review): `df` is not assigned in any earlier cell, so this raises NameError on a fresh kernel.
# Confirm which frame was intended (possibly `og_df`, which has a `service_date` column) and define it first.
df.loc[df.service_date == "2025-01-15T00:00:00.000000000"].head()

NameError: name 'df' is not defined

In [None]:
df.columns

In [None]:
# NOTE(review): `df_test` is not assigned in any cell visible in this notebook; it presumably came from
# a since-deleted cell. Define it explicitly before this point so the notebook runs top to bottom.
df_test.info()

In [None]:
df_test.route_id = df_test.route_id.fillna("No Route Name")

In [None]:
df_test.columns

In [None]:
df_test.service_date.unique()

In [None]:
df_test.loc[
    (df_test.organization_name == "Monterey-Salinas Transit")
    & (df_test.service_date == "2025-01-15T00:00:00.000000000")
][["route_id"]].nunique()

In [None]:
def highlight_no_route_name(val):
    """Return the CSS for a single cell.

    Cells equal to the placeholder "No Route Name" get white text on a red
    background; every other value gets an empty string (no styling).
    """
    if val == "No Route Name":
        return "color: white; background-color: red"
    return ""

In [None]:
january_only = df_test.loc[(df_test.service_date == "2025-01-15T00:00:00.000000000")]

In [None]:
# Count rows per feed / organization / route_id combination for the January slice.
route_group_cols = ["schedule_gtfs_dataset_key", "organization_name", "route_id"]
no_route_names = (
    january_only.groupby(route_group_cols)[["direction_id"]].count().reset_index()
)

In [None]:
# Apply the highlighting function to the `route_id` column so that any
# "No Route Name" placeholder values stand out in red.
no_route_names.style.applymap(highlight_no_route_name, subset=["route_id"])

### Test: removing `name` from the `merge` with `df_crosswalk` in line 259 in `gtfs_digest/merge_data`

### Find out where it's not merging

In [None]:
df_sched = merge_data.concatenate_schedule_by_route_direction(analysis_date_list)

In [None]:
df_sched.service_date.unique()

In [None]:
df_sched.loc[df_sched.service_date == "2025-01-15T00:00:00.000000000"].head(2)

In [None]:
df_avg_speeds = merge_data.concatenate_speeds_by_route_direction(analysis_date_list)

In [None]:
df_avg_speeds.loc[df_avg_speeds.service_date == "2025-01-15T00:00:00.000000000"].head(2)

In [None]:
df_rt_sched = merge_data.concatenate_rt_vs_schedule_by_route_direction(
    analysis_date_list
)

In [None]:
df_rt_sched.loc[df_rt_sched.service_date == "2025-01-15T00:00:00.000000000"].head(2)

In [None]:
df_crosswalk = merge_data.concatenate_crosswalk_organization(analysis_date_list)

In [None]:
df_crosswalk.loc[df_crosswalk.service_date == "2025-01-15T00:00:00.000000000"].head(2)

In [None]:
primary_typology = merge_data.set_primary_typology(df_sched)

In [None]:
primary_typology.head(2)

In [None]:
df_schedule2 = pd.merge(
    df_sched, primary_typology, on=merge_data.route_time_cols, how="left"
)

In [None]:
df_schedule2.loc[df_schedule2.service_date == "2025-01-15T00:00:00.000000000"].head(2)

In [None]:
# Step 1: outer-join the schedule frame with the RT-vs-schedule frame so rows that exist on
# only one side are kept. `indicator="sched_rt_category"` adds a column of that name recording
# where each row came from (left_only / right_only / both).
# Step 2: outer-join the average speeds on the same keys (no indicator on this second merge).
df1 = pd.merge(
    df_schedule2,
    df_rt_sched,
    on=merge_data.route_time_cols + ["service_date"],
    how="outer",
    indicator="sched_rt_category",
).merge(
    df_avg_speeds,
    on=merge_data.route_time_cols + ["service_date"],
    how="outer",
)

In [None]:
# Build the boolean mask from df1 itself so it aligns with df1's index.
df1.loc[df1.service_date == "2025-01-15T00:00:00.000000000"].head(2)

In [None]:
df1.info()

In [None]:
df1.sched_rt_category.unique()

In [None]:
# Recode sched_rt_category by mapping it through gtfs_schedule_wrangling.sched_rt_category_dict;
# every other column of df1 is carried over unchanged.
df_wo_standardized_route = df1.assign(
    sched_rt_category=lambda merged: merged["sched_rt_category"].map(
        gtfs_schedule_wrangling.sched_rt_category_dict
    )
)

In [None]:
# Run the standardized-route-name merge step on the recoded frame.
df_w_standardized_route = merge_data.merge_in_standardized_route_names(
    df_wo_standardized_route
)

In [None]:
df_w_standardized_route.info()

#### `merge_in_standardized_route_names` isn't working
* `route_names_df` doesn't have any values for January 2025.
* Rerun `gtfs_funnel/clean_route_naming.py`

In [None]:
# NOTE(review): keep_cols is not referenced by any later cell visible here, and the read below
# loads every column. Confirm whether it should be applied to route_names_df or removed.
keep_cols = [
    "schedule_gtfs_dataset_key",
    "name",
    "route_id",
    "service_date",
]

# Table name of the route identification file, taken from the GTFS data dictionary.
CLEAN_ROUTES = GTFS_DATA_DICT.schedule_tables.route_identification

# Read the route identification parquet from the schedule bucket.
route_names_df = pd.read_parquet(f"{SCHED_GCS}{CLEAN_ROUTES}.parquet")

In [None]:
route_names_df.info()

In [None]:
route_names_df.service_date.unique()

In [None]:
from segment_speed_utils import gtfs_schedule_wrangling, time_series_utils

In [None]:
route_names_df = time_series_utils.clean_standardized_route_names(
    route_names_df
).drop_duplicates()

In [None]:
route_names_df.head(2)

In [None]:
route_names_df.service_date.unique()

In [None]:
# Drop the `name` column, when present, from the same frame whose columns were just checked.
if "name" in df_wo_standardized_route.columns:
    df_wo_standardized_route = df_wo_standardized_route.drop(columns="name")