## Route ID 30, Operator 282 (Muni) Test

In [1]:
import math

import geopandas as gpd
import numpy as np
import pandas as pd
from calitp.sql import to_snakecase



In [2]:
import altair as alt

# from shared_utils import calitp_color_palette as cp
# from shared_utils import geography_utils, styleguide, utils

In [3]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [4]:
GCS_DASK_PATH = "gs://calitp-analytics-data/data-analyses/dask_test/"
GCS_RT_PATH = "gs://calitp-analytics-data/data-analyses/rt_delay/"

In [5]:
import intake

catalog = intake.open_catalog("./catalog_threshold.yml")

In [6]:
analysis_date = "2022-10-12"

In [7]:
agency = 282

In [8]:
route_id = "30"

In [9]:
# Shapes for Route 30
shape_ids_for_route_30 = [
    "204815",
    "204818",
    "204821",
    "204814",
    "204822",
    "204812",
    "204817",
    "204811",
    "204823",
    "204824",
    "204826",
    "204816",
]

### Route Map
* 12 different versions of this route.
* However all of them are combined into one blob when plotting route.

In [10]:
# Tells me actual route length for each shape id.
routelines = catalog.route_lines.read()

In [11]:
# Filter for specific route & agency
routelines = (
    routelines[
        (
            (routelines.calitp_itp_id == agency)
            & (routelines.shape_id.isin(shape_ids_for_route_30))
        )
    ]
    .drop(columns=["calitp_url_number"])
    .reset_index(drop=True)
    .drop_duplicates()
)

In [12]:
# Calculate length of geometry
routelines = routelines.assign(actual_route_length=(routelines.geometry.length))

In [13]:
len(routelines)

12

In [14]:
routelines.shape_id.nunique()

12

In [93]:
# routelines.explore()

### Trip Map
* https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html

In [18]:
trips = catalog.trips.read().drop(columns=["calitp_url_number", "trip_key"])

In [19]:
# Find only rows for route 30 for Muni
trips_282 = trips[
    ((trips.calitp_itp_id == agency) & (trips.route_id == route_id))
].reset_index(drop=True)

In [20]:
trips_282.shape

(282, 12)

In [22]:
trips_282.sort_values("trip_id").head(5)

Unnamed: 0,calitp_itp_id,service_date,trip_id,route_id,direction_id,shape_id,calitp_extracted_at,calitp_deleted_at,route_short_name,route_long_name,route_desc,route_type
165,282,2022-10-12,11136191,30,0,204811,2022-09-26,2099-01-01,30,STOCKTON,Weedays 5am-12 midnight Weekends 6am-12 midnight,3
29,282,2022-10-12,11136192,30,0,204814,2022-09-25,2099-01-01,30,STOCKTON,Weedays 5am-12 midnight Weekends 6am-12 midnight,3
50,282,2022-10-12,11136193,30,0,204812,2022-09-26,2099-01-01,30,STOCKTON,Weedays 5am-12 midnight Weekends 6am-12 midnight,3
261,282,2022-10-12,11136194,30,0,204812,2022-09-25,2099-01-01,30,STOCKTON,Weedays 5am-12 midnight Weekends 6am-12 midnight,3
12,282,2022-10-12,11136195,30,0,204812,2022-09-25,2099-01-01,30,STOCKTON,Weedays 5am-12 midnight Weekends 6am-12 midnight,3


In [25]:
datetime_cols = [
    "service_date",
    "calitp_extracted_at",
    "calitp_deleted_at",
]

In [26]:
trips_282 = (
    trips_282.drop(columns=datetime_cols).drop_duplicates().reset_index(drop=True)
)

In [27]:
trips_282.shape, trips_282.shape_id.nunique(), trips_282.trip_id.nunique()

((282, 9), 12, 282)

In [28]:
# Read in crosswalk for route_dir_identifier.
crosswalk = catalog.crosswalk.read()

In [29]:
crosswalk.head()

Unnamed: 0,calitp_itp_id,route_id,direction_id,route_dir_identifier
0,4,10,0,2184919314
1,4,10,1,4114352516
2,4,12,0,2953665424
3,4,12,1,3339348742
4,4,14,0,3864525846


In [30]:
# Join to crosswalk before joining routlines to grab geometry
trips_m_crosswalk = trips_282.merge(
    crosswalk, how="inner", on=["calitp_itp_id", "route_id", "direction_id"]
)

In [31]:
trips_m_crosswalk.head(2)

Unnamed: 0,calitp_itp_id,trip_id,route_id,direction_id,shape_id,route_short_name,route_long_name,route_desc,route_type,route_dir_identifier
0,282,11136447,30,1,204818,30,STOCKTON,Weedays 5am-12 midnight Weekends 6am-12 midnight,3,2612439400
1,282,11136391,30,1,204821,30,STOCKTON,Weedays 5am-12 midnight Weekends 6am-12 midnight,3,2612439400


In [32]:
# Merge routes for geometry, actual route length,
m1 = routelines.merge(
    trips_m_crosswalk,
    how="inner",
    on=["calitp_itp_id", "shape_id"],
)

In [33]:
m1.sort_values("trip_id").drop(columns=["geometry"]).head()

Unnamed: 0,calitp_itp_id,shape_id,actual_route_length,trip_id,route_id,direction_id,route_short_name,route_long_name,route_desc,route_type,route_dir_identifier
144,282,204811,3415.07,11136191,30,0,30,STOCKTON,Weedays 5am-12 midnight Weekends 6am-12 midnight,3,3971062270
21,282,204814,4915.15,11136192,30,0,30,STOCKTON,Weedays 5am-12 midnight Weekends 6am-12 midnight,3,3971062270
3,282,204812,8141.77,11136193,30,0,30,STOCKTON,Weedays 5am-12 midnight Weekends 6am-12 midnight,3,3971062270
15,282,204812,8141.77,11136194,30,0,30,STOCKTON,Weedays 5am-12 midnight Weekends 6am-12 midnight,3,3971062270
0,282,204812,8141.77,11136195,30,0,30,STOCKTON,Weedays 5am-12 midnight Weekends 6am-12 midnight,3,3971062270


#### how many rows in the trips table for that route? when you plot (choosetrip_id column to plot) it on a map, how many do you see?
* 282 trip ids. 
* 282 rows.
* One row per id.
* Always plotting shape_id underneath, just the coloring is different.

In [34]:
len(m1)

282

In [35]:
m1.trip_id.nunique()

282

In [36]:
# m1.loc[m1.trip_id == "11136191"]

In [37]:
m1.explore(
    "trip_id",
    cmap="tab10",
    style_kwds={"weight": 6},
    height=400,
    width=600,
    legend=False,
)

#### how many unique route_ids? when you plot route_id on a map, how many do you see?
* Just one for both questions.

In [38]:
m1.route_id.nunique()

1

In [39]:
"""m1.explore(
    "route_id",
    cmap="tab10",
    style_kwds={"weight": 6},
    height=400,
    width=600,
    legend=True,
)"""

'm1.explore(\n    "route_id",\n    cmap="tab10",\n    style_kwds={"weight": 6},\n    height=400,\n    width=600,\n    legend=True,\n)'

### Shapes 
* Direction 1: going towards Financial District.
* Direction 0: going towards Presidio.
* The `route_length`  in `route_lines` is the length of the longest geometry: not for that particular segment.

In [40]:
longest_shape = catalog.longest_shape.read()

In [41]:
routelines.crs == longest_shape.crs

True

In [42]:
m2 = (
    m1.drop(columns=["geometry"])
    .merge(
        longest_shape,
        how="inner",
        on=["calitp_itp_id", "direction_id", "route_id", "route_dir_identifier"],
    )
    .drop_duplicates()
    .reset_index(drop=True)
)

In [43]:
len(m2), m2.longest_shape_id.nunique(), m2.shape_id.nunique()

(2538, 2, 12)

In [44]:
m2["longest_vs_actual_route"] = m2.actual_route_length / m2.route_length * 100

In [45]:
m3 = m2.set_geometry("geometry_arrowized")

In [46]:
# Count number of segments in `longest_shape`.
m3_test = (
    m3.groupby(
        [
            "calitp_itp_id",
            "trip_id",
            "route_dir_identifier",
            "shape_id",
            "longest_shape_id",
            "actual_route_length",
            "route_length",
            "longest_vs_actual_route",
        ]
    )
    .agg({"segment_sequence": "nunique"})
    .rename(columns={"segment_sequence": "total_segments"})
    .reset_index()
)

In [47]:
# longest_shape.loc[longest_shape.longest_shape_id == '204815'].drop(columns = ["geometry","geometry_arrowized"])

In [48]:
# m3.loc[m3.longest_shape_id == '204815'].drop(columns = ["geometry","geometry_arrowized"]).head()

In [49]:
# m3_test.loc[m3_test.longest_shape_id == '204815'].head()

#### How many unique shape_ids? when you plot shape_id on a map, how many do you see?
* 12 shape ids.
* I can see 2 shape ids on the map (barely).
* However, all 12 are plotted.

In [50]:
m3.shape_id.nunique()

12

In [51]:
"""
m3.explore(
    "shape_id",
    cmap="tab10",
    style_kwds={"weight": 6},
    legend=True,
    width=800,
    height=400,
)"""

'\nm3.explore(\n    "shape_id",\n    cmap="tab10",\n    style_kwds={"weight": 6},\n    legend=True,\n    width=800,\n    height=400,\n)'

#### how many unique direction_ids? when you plot direction_id, how many do you see?
* 2 direction ids. 
* Looks a lot like what happens when plotting on `shape_id` column.

In [52]:
m3.direction_id.nunique()

2

In [53]:
"""m3.explore(
    "direction_id",
    cmap="tab10",
    style_kwds={"weight": 6},
    legend=True,
    width=800,
    height=400,
)"""

'm3.explore(\n    "direction_id",\n    cmap="tab10",\n    style_kwds={"weight": 6},\n    legend=True,\n    width=800,\n    height=400,\n)'

In [54]:
"""
m3.explore(
    "segment_sequence",
    cmap="tab10",
    style_kwds={"weight": 6},
    legend=False,
    width=800,
    height=400,
)"""

'\nm3.explore(\n    "segment_sequence",\n    cmap="tab10",\n    style_kwds={"weight": 6},\n    legend=False,\n    width=800,\n    height=400,\n)'

In [55]:
# m3.loc[m3.direction_id == "0"].explore("segment_sequence", cmap = 'tab10', style_kwds = {'weight':6}, legend = False, width = 800, height = 400)

In [56]:
# m3.loc[m3.direction_id == "1"].explore("segment_sequence", cmap = 'tab20', style_kwds = {'weight':6}, legend = False, width = 800, height = 400)

In [70]:
operator = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_segment_speeds/trip_diagnostics_2022-10-12.parquet"
)

In [71]:
operator.head(2)

Unnamed: 0,calitp_itp_id,trip_id,route_dir_identifier,trip_start,trip_end,num_segments_with_vp
0,110,t_5284733_b_77443_tn_1,389670377,2022-10-12 07:35:11,2022-10-12 08:02:12,8
1,110,t_5284733_b_77443_tn_2,389670377,2022-10-12 08:36:09,2022-10-12 09:01:29,8


In [72]:
operator = operator.loc[operator.calitp_itp_id == 282].reset_index(drop=True)

In [73]:
# Merge
m4 = operator[
    [
        "calitp_itp_id",
        "trip_id",
        "route_dir_identifier",
        "num_segments_with_vp",
    ]
].merge(
    m3_test,
    how="inner",
    on=["calitp_itp_id", "trip_id", "route_dir_identifier"],
)

In [74]:
m4["segment_percentage"] = (m4.num_segments_with_vp / m4.total_segments * 100).astype(
    int
)

In [75]:
m4.sort_values("trip_id").sample(5)

Unnamed: 0,calitp_itp_id,trip_id,route_dir_identifier,num_segments_with_vp,shape_id,longest_shape_id,actual_route_length,route_length,longest_vs_actual_route,total_segments,segment_percentage
101,282,11136292,3971062270,5,204814,204815,4915.15,8964.34,54.83,9,55
158,282,11136350,2612439400,9,204822,204822,8694.37,8694.37,100.0,9,100
223,282,11136415,2612439400,5,204821,204822,4683.26,8694.37,53.87,9,55
177,282,11136369,2612439400,9,204822,204822,8694.37,8694.37,100.0,9,100
227,282,11136420,2612439400,5,204821,204822,4683.26,8694.37,53.87,9,55


In [76]:
m4 = m4.assign(
    total_trips=m4.groupby("calitp_itp_id").trip_id.transform("nunique"),
)

In [77]:
final = pd.DataFrame()

In [78]:
def preset_chart_config(chart: alt.Chart) -> alt.Chart:

    chart = chart.properties(width=400, height=200)
    return chart

In [84]:
m4.trip_id.nunique(), m3_test.trip_id.nunique(), m3.trip_id.nunique()

(280, 282, 282)

In [85]:
m4.shape_id.nunique(), m4.longest_shape_id.nunique()

(12, 2)

In [86]:
m4.longest_shape_id.nunique(), m4.shape_id.nunique()

(2, 12)

In [87]:
# check to see if 2 columns are the same
original = set(m4.trip_id.unique().tolist())
final = set(m3_test.trip_id.unique().tolist())
final - original

{'11136339', '11136417'}

In [88]:
operator[operator["trip_id"].isin(["11136339", "11136417"])]

Unnamed: 0,calitp_itp_id,trip_id,route_dir_identifier,trip_start,trip_end,num_segments_with_vp


In [89]:
m3_test[m3_test["trip_id"].isin(["11136339", "11136417"])]

Unnamed: 0,calitp_itp_id,trip_id,route_dir_identifier,shape_id,longest_shape_id,actual_route_length,route_length,longest_vs_actual_route,total_segments
148,282,11136339,2612439400,204822,204822,8694.37,8694.37,100.0,9
226,282,11136417,2612439400,204821,204822,4683.26,8694.37,53.87,9
