# Monthly Trends 

Now that we're beginning to add monthly aggregations to schedule data, let's see how it all plays together. Use this to work out the kinks of combining all our current data products.

* Start with a single day across multiple months.
* `scheduled_service_hours`
    * service hours, scheduled trips in peak vs offpeak
    * average stop spacing
* `speeds` - single-day aggregation to route-direction. Don't use the weekly average; we only produced it for 2 months.
    * segment speeds
    * route-direction avg speed with common shape?
* `rt_vs_schedule_metrics`

In [1]:
import altair as alt
import geopandas as gpd
import pandas as pd

from shared_utils import rt_dates
from segment_speed_utils import helpers
from segment_speed_utils.project_vars import (SEGMENT_GCS, SCHED_GCS, 
                                              RT_SCHED_GCS)

## Datasets
* Start with single day for multiple months
* scheduled service hours, avg stop spacing, parallel / intersecting
* vehicle positions speeds
* rt vs sched metrics

In [2]:
# Month abbreviations (Mar through Dec) that form the rt_dates.DATES keys.
months = "mar apr may jun jul aug sep oct nov dec".split()

# One analysis date per month, looked up with keys such as "mar2023".
analysis_date_list = [rt_dates.DATES[f"{month}2023"] for month in months]

## Crosswalk for organization

In [3]:
# Crosswalk from schedule_gtfs_dataset_key to organization columns.
# Each date's export is de-duplicated, tagged with its service_date,
# and then all dates are stacked into a single frame.
gtfs_key_to_org = pd.concat(
    [
        pd.read_parquet(
            f"{SEGMENT_GCS}export/"
            f"avg_speeds_stop_segments_{analysis_date}_tabular.parquet",
            columns = [
                "schedule_gtfs_dataset_key",
                "org_id", "agency", "base64_url",
            ],
        )
        .drop_duplicates()
        .assign(service_date = pd.to_datetime(analysis_date))
        for analysis_date in analysis_date_list
    ],
    axis=0,
    ignore_index=True,
)

In [4]:
# Count crosswalk rows per service date (rows are distinct combinations
# of the crosswalk columns within each date).
gtfs_key_to_org.service_date.value_counts()

2023-11-15    81
2023-09-13    80
2023-12-13    80
2023-08-15    78
2023-10-11    78
2023-04-12    75
2023-07-12    74
2023-03-15    73
2023-05-17    71
2023-06-14    70
Name: service_date, dtype: int64

## Schedule

In [5]:
def concatenate_schedule_by_route_direction(
    date_list: list
) -> pd.DataFrame:
    """
    Stack the schedule route-direction metrics for several dates.

    For every date in `date_list`, read
    `schedule_route_direction_metrics_{date}.parquet` from RT_SCHED_GCS
    (a fixed subset of columns), add a `service_date` column holding that
    date as a timestamp, and cast `direction_id` to nullable Int64.

    Returns one DataFrame with all dates concatenated row-wise and a
    fresh 0..n-1 index.
    """
    keep_cols = [
        "schedule_gtfs_dataset_key",
        "route_id", "direction_id",
        "avg_sched_service_min",
        "avg_stop_meters",
        "offpeak_n_trips", "peak_n_trips",
        "offpeak_frequency", "peak_frequency"
    ]

    daily_frames = []

    for one_date in date_list:
        daily = pd.read_parquet(
            f"{RT_SCHED_GCS}schedule_route_direction_metrics_{one_date}.parquet",
            columns = keep_cols
        )
        daily = daily.assign(service_date = pd.to_datetime(one_date))
        daily_frames.append(daily.astype({"direction_id": "Int64"}))

    return pd.concat(daily_frames, axis=0, ignore_index=True)

In [6]:
# Stack the schedule route-direction metrics for every analysis date.
df_schedule = concatenate_schedule_by_route_direction(analysis_date_list)

In [7]:
# Preview the first rows of the stacked schedule metrics.
df_schedule.head(3)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,avg_sched_service_min,avg_stop_meters,offpeak_n_trips,peak_n_trips,offpeak_frequency,peak_frequency,service_date
0,a065f1788f6694f048a3908e0adb1b57,ACE Violet,0,32.0,338.29,1.0,3.0,0.06,0.38,2023-03-15
1,e0c18f6b3355da598da912c8461f534d,1968,1,72.0,3843.435,6.0,4.0,0.38,0.5,2023-03-15
2,fd7076620aa807675e79433b840df90f,01,0,50.307692,402.646923,5.0,8.0,0.31,1.0,2023-03-15


In [8]:
# Left-join the organization crosswalk onto the schedule metrics by feed
# key and service date. indicator=True adds a `_merge` column, which
# stays on df_schedule2.
df_schedule2 = df_schedule.merge(
    gtfs_key_to_org,
    how = "left",
    on = ["schedule_gtfs_dataset_key", "service_date"],
    indicator = True,
)

# How many schedule rows found an organization match?
print(df_schedule2["_merge"].value_counts())

both          27795
left_only      7471
right_only        0
Name: _merge, dtype: int64


## Speeds
### Segment Speeds

In [9]:
# Stack the per-date segment speed files (tagged with service_date,
# direction_id as nullable Int64), then rename the organization columns
# to org_id / agency so they line up with the other tables.
df_speeds_segment = pd.concat(
    [
        pd.read_parquet(
            f"{SEGMENT_GCS}avg_route_speeds_stop_segments_{analysis_date}.parquet"
        )
        .assign(service_date = pd.to_datetime(analysis_date))
        .astype({"direction_id": "Int64"})
        for analysis_date in analysis_date_list
    ],
    axis=0,
    ignore_index=True,
).rename(
    columns = {
        "organization_source_record_id": "org_id",
        "organization_name": "agency",
    }
)

In [10]:
# Preview the first rows of the stacked segment speeds.
df_speeds_segment.head(3)

Unnamed: 0,schedule_gtfs_dataset_key,base64_url,org_id,agency,route_id,direction_id,stop_id,stop_pair,peak_offpeak,weekday_weekend,p50_mph,n_trips,p20_mph,p80_mph,mean_stop_sequence,service_date
0,07d3b79f14cec8099119e1eb649f065b,aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3...,rec3u4aMplqObcoTR,Tahoe Transportation District,12133,0,2501593,2501594-2501593,peak,weekday,24.86,1,24.86,24.86,23.0,2023-03-15
1,07d3b79f14cec8099119e1eb649f065b,aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3...,rec3u4aMplqObcoTR,Tahoe Transportation District,12133,0,2501594,2501595-2501594,peak,weekday,10.01,2,10.01,10.01,22.0,2023-03-15
2,07d3b79f14cec8099119e1eb649f065b,aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3...,rec3u4aMplqObcoTR,Tahoe Transportation District,12133,0,2501595,2501632-2501595,peak,weekday,42.16,1,42.16,42.16,21.0,2023-03-15


In [11]:
# Check column dtypes (direction_id should be nullable Int64 after the cast).
df_speeds_segment.dtypes

schedule_gtfs_dataset_key            object
base64_url                           object
org_id                               object
agency                               object
route_id                             object
direction_id                          Int64
stop_id                              object
stop_pair                            object
peak_offpeak                         object
weekday_weekend                      object
p50_mph                             float64
n_trips                               int64
p20_mph                             float64
p80_mph                             float64
mean_stop_sequence                  float64
service_date                 datetime64[ns]
dtype: object

### Route Average Speeds

In [12]:
# Stack the per-date route average speeds. The shape id and geometry
# columns are dropped, each date is tagged with service_date, and
# direction_id is cast to nullable Int64.
df_speeds_avg = pd.concat(
    [
        pd.read_parquet(
            f"{SEGMENT_GCS}trip_summary/route_speeds_{analysis_date}.parquet"
        )
        .drop(columns = ["common_shape_id", "geometry"])
        .assign(service_date = pd.to_datetime(analysis_date))
        .astype({"direction_id": "Int64"})
        for analysis_date in analysis_date_list
    ],
    axis=0,
    ignore_index=True,
)

In [13]:
# Preview the first rows of the stacked route average speeds.
df_speeds_avg.head(3)

Unnamed: 0,org_id,agency,route_id,route_name,direction_id,time_of_day,speed_mph,n_trips,avg_sched_trip_min,avg_rt_trip_min,base64_url,district_name,service_date
0,recZgWVXkpix390of,San Joaquin Regional Transit District,1,Metro Hopper - Northwest Stockton,0,AM Peak,8.3,4,46.5,123.0,aHR0cDovL3NhbmpvYXF1aW5ydGQuY29tL1JURC1HVEZTL1...,10 - Stockton,2023-03-15
1,recZgWVXkpix390of,San Joaquin Regional Transit District,1,Metro Hopper - Northwest Stockton,0,Midday,9.7,5,52.0,112.3,aHR0cDovL3NhbmpvYXF1aW5ydGQuY29tL1JURC1HVEZTL1...,10 - Stockton,2023-03-15
2,recZgWVXkpix390of,San Joaquin Regional Transit District,1,Metro Hopper - Northwest Stockton,0,PM Peak,8.0,4,52.0,137.0,aHR0cDovL3NhbmpvYXF1aW5ydGQuY29tL1JURC1HVEZTL1...,10 - Stockton,2023-03-15


In [14]:
# Outer-join schedule metrics with route average speeds on organization,
# date, route, and direction, recording the match status in
# `sched_speeds_avg_merge`.
# NOTE(review): the df_speeds_avg preview shows several time_of_day rows
# per route-direction, so matched schedule rows presumably repeat once
# per time_of_day -- confirm this grain is intended.
df_sched_speeds = df_schedule2.merge(
    df_speeds_avg,
    how = "outer",
    on = [
        "org_id", "agency", "base64_url", "service_date",
        "route_id", "direction_id",
    ],
    indicator = "sched_speeds_avg_merge",
)

# Share of rows by merge outcome.
df_sched_speeds["sched_speeds_avg_merge"].value_counts(normalize=True)

both          0.875607
left_only     0.078402
right_only    0.045991
Name: sched_speeds_avg_merge, dtype: float64

In [15]:
# Diagnostic only (the merged frame is not stored): how well do route
# average speeds line up with segment speeds on the shared keys?
df_speeds_avg.merge(
    df_speeds_segment,
    how = "outer",
    on = [
        "org_id", "agency", "base64_url", "service_date",
        "route_id", "direction_id",
    ],
    indicator = "speeds_avg_seg",
)["speeds_avg_seg"].value_counts()

both          7911347
right_only      10891
left_only         600
Name: speeds_avg_seg, dtype: int64

## RT vs Schedule

In [16]:
# Trip-to-route lookup: scheduled trips for each analysis date, tagged
# with service_date and with direction_id cast to nullable Int64.
trips_to_route = pd.concat(
    [
        helpers.import_scheduled_trips(
            analysis_date,
            columns = [
                "gtfs_dataset_key",
                "trip_instance_key", "route_id", "direction_id",
            ],
            get_pandas = True,
        )
        .assign(service_date = pd.to_datetime(analysis_date))
        .astype({"direction_id": "Int64"})
        for analysis_date in analysis_date_list
    ],
    axis=0,
    ignore_index=True,
)

In [17]:
# Stack the trip-level RT vs schedule metrics for every analysis date,
# tagging each date's rows with service_date.
rt_sched_metrics = pd.concat(
    [
        pd.read_parquet(
            f"{RT_SCHED_GCS}trip_level_metrics/{analysis_date}_metrics.parquet"
        ).assign(service_date = pd.to_datetime(analysis_date))
        for analysis_date in analysis_date_list
    ],
    axis=0,
    ignore_index=True,
)

In [18]:
# Attach the trips_to_route columns (feed key, route_id, direction_id)
# to each trip-level metric row.
# A left merge is used on purpose: an outer merge here has a lot of
# right onlys, which would be expected for all the trips that do not
# have RT. Add indicator=True to inspect the match counts.
rt_sched_metrics2 = rt_sched_metrics.merge(
    trips_to_route,
    how = "left",
    on = ["trip_instance_key", "service_date"],
)

In [19]:
# Check that the route columns came through the merge with the expected dtypes.
rt_sched_metrics2.dtypes

trip_instance_key                       object
rt_service_min                         float64
min_w_atleast2_trip_updates              int64
total_pings_for_trip                     int64
total_min_w_gtfs                         int64
total_vp                               float64
vp_in_shape                            float64
speed_mph                              float64
service_minutes                        float64
pings_per_min                          float64
spatial_accuracy_pct                   float64
rt_triptime_w_gtfs_pct                 float64
rt_v_scheduled_trip_time_pct           float64
service_date                    datetime64[ns]
schedule_gtfs_dataset_key               object
route_id                                object
direction_id                             Int64
dtype: object

In [20]:
# Roll trip-level metrics up to feed / route / direction / date by taking
# the plain (unweighted) mean across trips, then round to 2 decimals.
rt_sched_metrics_route = (
    rt_sched_metrics2
    .groupby(
        [
            "schedule_gtfs_dataset_key",
            "route_id", "direction_id",
            "service_date",
        ],
        observed=True,
        group_keys=False,
    )[[
        "rt_service_min",
        "pings_per_min",
        "spatial_accuracy_pct",
        "rt_triptime_w_gtfs_pct",
    ]]
    .mean()
    .reset_index()
    .round(2)
)

In [21]:
# Left-join the organization crosswalk onto the route-level RT metrics
# by feed key and service date.
rt_sched_metrics_route2 = rt_sched_metrics_route.merge(
    gtfs_key_to_org,
    how = "left",
    on = ["schedule_gtfs_dataset_key", "service_date"],
)

In [22]:
# List the columns available for the next merge.
rt_sched_metrics_route2.columns

Index(['schedule_gtfs_dataset_key', 'route_id', 'direction_id', 'service_date',
       'rt_service_min', 'pings_per_min', 'spatial_accuracy_pct',
       'rt_triptime_w_gtfs_pct', 'org_id', 'agency', 'base64_url'],
      dtype='object')

In [23]:
# Diagnostic only (the merged frame is not stored): match rate between
# the schedule + speeds table and the route-level RT metrics.
# NOTE(review): these keys leave out schedule_gtfs_dataset_key, whereas
# the stored merge into df_sched_speeds_metrics includes it, so these
# counts may not describe that merge exactly -- confirm which key set
# is intended.
df_sched_speeds.merge(
    rt_sched_metrics_route2,
    how = "outer",
    on = [
        "org_id", "agency", "base64_url",
        "route_id", "direction_id", "service_date",
    ],
    indicator = "sched_speeds_to_rt",
)["sched_speeds_to_rt"].value_counts()

both          105088
left_only       9042
right_only       255
Name: sched_speeds_to_rt, dtype: int64

In [31]:
# Combine schedule + speeds with the route-level RT metrics. The outer
# join keeps rows found on either side; the earlier indicator columns
# (`_merge`, `sched_speeds_avg_merge`) are carried along unchanged.
df_sched_speeds_metrics = df_sched_speeds.merge(
    rt_sched_metrics_route2,
    how = "outer",
    on = [
        "schedule_gtfs_dataset_key", "org_id", "agency", "base64_url",
        "route_id", "direction_id", "service_date",
    ],
)

In [32]:
def merge_in_standardized_route_names(df: pd.DataFrame) -> pd.DataFrame:
    """
    Attach standardized route identifiers and names, then tidy for charts.

    Reads `standardized_route_ids.parquet` from SCHED_GCS and left-merges
    it onto `df` by `schedule_gtfs_dataset_key`, `route_id` and
    `service_date`.

    After the merge:
    * `recent_combined_name` is split on "__" into `route_short_name`
      (first part) and `route_long_name` (second part); a missing part
      becomes NaN.
    * `direction_id` is cast to plain int. This raises if any
      `direction_id` is missing.
    * `avg_sched_service_min` and `avg_stop_meters` are rounded to 1
      decimal.
    * the incoming `route_id` is dropped and `recent_route_id2` takes
      its name. Rows with no match in the standardized table therefore
      end up with a missing `route_id`.

    Returns a new DataFrame; the input frame is not modified.
    """
    standardized_route_names = pd.read_parquet(
        f"{SCHED_GCS}standardized_route_ids.parquet",
        columns = ["schedule_gtfs_dataset_key", "name", 
                   "route_id", "service_date",
                   "recent_route_id2", "recent_combined_name"
                  ]
    )
    
    df = pd.merge(
        df,
        standardized_route_names,
        on = ["schedule_gtfs_dataset_key", "route_id", "service_date"],
        how = "left",
    )
    
    # Split once and pick parts by position. Using expand=True and then
    # indexing column 1 raises KeyError whenever no name contains "__"
    # (e.g. nothing matched), because the expanded frame then has only
    # column 0. `.str.get` returns NaN for a missing part instead.
    name_parts = df.recent_combined_name.str.split("__")
    
    # Clean up, round columns, get it as close to ready for charts
    df = df.assign(
        route_short_name = name_parts.str.get(0),
        route_long_name = name_parts.str.get(1),
        direction_id = df.direction_id.astype("int"),
        avg_sched_service_min = df.avg_sched_service_min.round(1),
        avg_stop_meters = df.avg_stop_meters.round(1),
    ).drop(
        columns = ["route_id", "recent_combined_name"]
    ).rename(
        columns = {"recent_route_id2": "route_id"}
    )
    
    return df

In [33]:
# List the combined table's columns before adding standardized route names.
df_sched_speeds_metrics.columns

Index(['schedule_gtfs_dataset_key', 'route_id', 'direction_id',
       'avg_sched_service_min', 'avg_stop_meters', 'offpeak_n_trips',
       'peak_n_trips', 'offpeak_frequency', 'peak_frequency', 'service_date',
       'org_id', 'agency', 'base64_url', '_merge', 'route_name', 'time_of_day',
       'speed_mph', 'n_trips', 'avg_sched_trip_min', 'avg_rt_trip_min',
       'district_name', 'sched_speeds_avg_merge', 'rt_service_min',
       'pings_per_min', 'spatial_accuracy_pct', 'rt_triptime_w_gtfs_pct'],
      dtype='object')

In [35]:
# Preview the combined table with standardized route names (result is not stored).
merge_in_standardized_route_names(df_sched_speeds_metrics).head(10)

Unnamed: 0,schedule_gtfs_dataset_key,direction_id,avg_sched_service_min,avg_stop_meters,offpeak_n_trips,peak_n_trips,offpeak_frequency,peak_frequency,service_date,org_id,...,district_name,sched_speeds_avg_merge,rt_service_min,pings_per_min,spatial_accuracy_pct,rt_triptime_w_gtfs_pct,name,route_id,route_short_name,route_long_name
0,a065f1788f6694f048a3908e0adb1b57,0,32.0,338.3,1.0,3.0,0.06,0.38,2023-03-15,recC5CT95EufmQCXr,...,,left_only,,,,,Bay Area 511 Santa Clara Transit Schedule,ACE Violet,ACE Violet,West Milpitas
1,e0c18f6b3355da598da912c8461f534d,1,72.0,3843.4,6.0,4.0,0.38,0.5,2023-03-15,,...,,left_only,,,,,El Dorado Schedule,1968,50x,50 Express
2,fd7076620aa807675e79433b840df90f,0,50.3,402.6,5.0,8.0,0.31,1.0,2023-03-15,recZgWVXkpix390of,...,10 - Stockton,both,58.31,2.83,100.0,96.78,San Joaquin Schedule,01,1,Metro Hopper - Northwest Stockton
3,fd7076620aa807675e79433b840df90f,0,50.3,402.6,5.0,8.0,0.31,1.0,2023-03-15,recZgWVXkpix390of,...,10 - Stockton,both,58.31,2.83,100.0,96.78,San Joaquin Schedule,01,1,Metro Hopper - Northwest Stockton
4,fd7076620aa807675e79433b840df90f,0,50.3,402.6,5.0,8.0,0.31,1.0,2023-03-15,recZgWVXkpix390of,...,10 - Stockton,both,58.31,2.83,100.0,96.78,San Joaquin Schedule,01,1,Metro Hopper - Northwest Stockton
5,7e015887964432c82ce7e735c2753f86,1,41.1,426.6,12.0,16.0,0.75,2.0,2023-03-15,reckQmUdXUzHFmlVf,...,07 - Los Angeles,both,63.54,2.99,95.91,99.06,VCTC GMV Schedule,3408,Route 21,Route 21
6,7e015887964432c82ce7e735c2753f86,1,41.1,426.6,12.0,16.0,0.75,2.0,2023-03-15,reckQmUdXUzHFmlVf,...,07 - Los Angeles,both,63.54,2.99,95.91,99.06,VCTC GMV Schedule,3408,Route 21,Route 21
7,7e015887964432c82ce7e735c2753f86,1,41.1,426.6,12.0,16.0,0.75,2.0,2023-03-15,reckQmUdXUzHFmlVf,...,07 - Los Angeles,both,63.54,2.99,95.91,99.06,VCTC GMV Schedule,3408,Route 21,Route 21
8,7e015887964432c82ce7e735c2753f86,1,41.1,426.6,12.0,16.0,0.75,2.0,2023-03-15,reckQmUdXUzHFmlVf,...,07 - Los Angeles,both,63.54,2.99,95.91,99.06,VCTC GMV Schedule,3408,Route 21,Route 21
9,587e730fac4db21d54037e0f12b0dd5d,1,24.0,429.4,,1.0,,0.12,2023-03-15,recjnaKVDEgulsko3,...,04 - Oakland,both,25.82,1.94,76.0,100.0,Bay Area 511 County Connection Schedule,635,635,ROUTE 635
