## Route ID 30, Operator 282 (Muni) Test

In [1]:
import geopandas as gpd
import pandas as pd
from calitp.sql import to_snakecase
from shared_utils import geography_utils



In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
GCS_DASK_PATH = "gs://calitp-analytics-data/data-analyses/dask_test/"
GCS_RT_PATH = "gs://calitp-analytics-data/data-analyses/rt_delay/"

In [4]:
analysis_date = "2022-10-12"

In [5]:
agency = 282

In [6]:
route_id = "30"

In [7]:
# Shapes for Route 30
shape_ids_for_route_30 = [
    "204815",
    "204818",
    "204821",
    "204814",
    "204822",
    "204812",
    "204817",
    "204811",
    "204823",
    "204824",
    "204826",
    "204816",
]

### Route Map

In [8]:
# Tells me actual route length for each shape id.
routelines = gpd.read_parquet(
    f"{GCS_RT_PATH}compiled_cached_views/routelines_{analysis_date}.parquet"
)

In [9]:
# Filter for specific route & agency
routelines = (
    routelines[
        (
            (routelines.calitp_itp_id == agency)
            & (routelines.shape_id.isin(shape_ids_for_route_30))
        )
    ]
    .drop(columns=["calitp_url_number"])
    .reset_index(drop=True)
    .drop_duplicates()
)

In [10]:
# Calculate length of geometry
#routelines = routelines.assign(
#    actual_route_length=(
#        routelines.geometry.to_crs(geography_utils.CA_NAD83Albers).length
#    )
#)

In [11]:
routelines.crs

<Derived Projected CRS: EPSG:3310>
Name: NAD83 / California Albers
Axis Info [cartesian]:
- X[east]: Easting (metre)
- Y[north]: Northing (metre)
Area of Use:
- name: United States (USA) - California.
- bounds: (-124.45, 32.53, -114.12, 42.01)
Coordinate Operation:
- name: California Albers
- method: Albers Equal Area
Datum: North American Datum 1983
- Ellipsoid: GRS 1980
- Prime Meridian: Greenwich

In [12]:
# Calculate length of geometry
routelines = routelines.assign(
    actual_route_length=(
        routelines.geometry.length
    )
)

In [13]:
len(routelines)

12

In [14]:
routelines.shape_id.nunique()

12

In [15]:
routelines.head(1)

Unnamed: 0,calitp_itp_id,shape_id,geometry,actual_route_length
0,282,204812,"LINESTRING (-210538.413 -23842.199, -210509.478 -23814.795, -210487.030 -23793.455, -210438.535 -23747.530, -210571.438 -23606.506, -210599.798 -23576.321, -210636.899 -23536.907, -210675.713 -23495.671, -210703.650 -23466.164, -210749.573 -23417.297, -210789.158 -23375.263, -210835.769 -23325.822, -210883.492 -23275.130, -210968.148 -23185.471, -211013.456 -23136.730, -211099.870 -23043.690, -211187.389 -22952.622, -211233.335 -22904.864, -211280.480 -22855.852, -211319.332 -22816.391, -211346.235 -22787.909, -211352.937 -22746.927, -211357.393 -22711.007, -211363.822 -22659.246, -211369.628 -22607.167, -211375.867 -22554.855, -211382.906 -22502.856, -211415.303 -22507.261, -211520.349 -22521.388, -211657.970 -22539.912, -211672.129 -22435.801, -211686.384 -22332.020, -211700.925 -22225.675, -211715.094 -22122.007, -211728.076 -22023.597, -211741.064 -21925.409, -211753.867 -21826.891, -211766.940 -21728.590, -211776.345 -21659.071, -211780.860 -21625.596, -211794.896 -21520.264, -211802.728 -21468.467, -211810.412 -21417.785, -211912.860 -21323.327, -211977.060 -21264.092, -212013.418 -21230.696, -212113.547 -21138.630, -212145.092 -21109.692, -212214.707 -21045.646, -212314.234 -20954.260, -212363.133 -20909.200, -212415.211 -20861.168, -212481.124 -20800.439, -212516.188 -20768.185, -212615.331 -20675.804, -212645.170 -20680.158, -212790.887 -20699.903, -212935.411 -20721.121, -213080.947 -20740.642, -213225.957 -20760.174, -213211.997 -20864.284, -213197.881 -20969.176, -213184.070 -21072.281, -213329.353 -21091.916, -213474.543 -21111.329, -213619.648 -21130.852, -213764.842 -21150.371, -213910.127 -21169.995, -214055.323 -21189.509, -214202.813 -21209.294, -214293.375 -21221.534, -214345.452 -21228.535, -214488.002 -21247.664, -214559.055 -21257.178, -214635.673 -21267.549, -214782.990 -21287.329, -214930.399 -21307.215, -214944.259 -21203.438, -214958.117 -21099.549, -214972.137 -20995.212, -214985.994 -20891.324, -214999.932 -20787.211, -214852.798 -20767.431, -214838.768 -20871.434, -214824.822 -20975.325, -214810.802 -21079.773, -214796.852 -21183.552, -214782.990 -21287.329)",8141.77


In [16]:
routelines.explore()

### Trip Map
* https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html

In [17]:
trips = pd.read_parquet(
    f"{GCS_RT_PATH}compiled_cached_views/trips_{analysis_date}_v1.parquet"
).drop(columns = ["calitp_url_number", "trip_key"])

In [18]:
trips = trips.assign(
    departure_time=pd.to_datetime(trips.trip_first_departure_ts, unit="s"),
    end_time=pd.to_datetime(trips.trip_last_arrival_ts, unit="s"),
)

In [19]:
# Find only rows for route 30 for Muni
trips_282 = trips[
    ((trips.calitp_itp_id == agency) & (trips.route_id == route_id))
].reset_index(drop=True)

In [20]:
trips_282.shape

(564, 13)

* `Calitp_deleted_at` is different between the same trip ids.
* `Departure_time` not displaying time? 

In [21]:
trips_282.loc[trips_282.trip_id == '11136193']

Unnamed: 0,calitp_itp_id,service_date,trip_id,route_id,direction_id,shape_id,calitp_extracted_at,calitp_deleted_at,trip_first_departure_ts,trip_last_arrival_ts,service_hours,departure_time,end_time
78,282,2022-10-12,11136193,30,0,204812,2022-09-25,2099-01-01,86400,88560,0.6,1970-01-02,1970-01-02 00:36:00
483,282,2022-10-12,11136193,30,0,204812,2022-09-26,2022-11-17,86400,88560,0.6,1970-01-02,1970-01-02 00:36:00


In [22]:
trips_282.sort_values('trip_id').head(5)

Unnamed: 0,calitp_itp_id,service_date,trip_id,route_id,direction_id,shape_id,calitp_extracted_at,calitp_deleted_at,trip_first_departure_ts,trip_last_arrival_ts,service_hours,departure_time,end_time
555,282,2022-10-12,11136191,30,0,204811,2022-09-26,2022-11-17,32520,33840,0.37,1970-01-01 09:02:00,1970-01-01 09:24:00
51,282,2022-10-12,11136191,30,0,204811,2022-09-25,2099-01-01,32520,33840,0.37,1970-01-01 09:02:00,1970-01-01 09:24:00
194,282,2022-10-12,11136192,30,0,204814,2022-09-25,2099-01-01,19080,20460,0.38,1970-01-01 05:18:00,1970-01-01 05:41:00
392,282,2022-10-12,11136192,30,0,204814,2022-09-26,2022-11-17,19080,20460,0.38,1970-01-01 05:18:00,1970-01-01 05:41:00
483,282,2022-10-12,11136193,30,0,204812,2022-09-26,2022-11-17,86400,88560,0.6,1970-01-02 00:00:00,1970-01-02 00:36:00


In [23]:
datetime_cols = [
    "service_date",
    "calitp_extracted_at",
    "calitp_deleted_at",
    "trip_first_departure_ts",
    "trip_last_arrival_ts",
    "service_hours",
    "departure_time",
    "end_time",
]

In [27]:
trips_282 = trips_282.drop(columns=datetime_cols).drop_duplicates().reset_index(drop=True)

In [28]:
# Read in crosswalk for route_dir_identifier.
crosswalk = pd.read_parquet(
    f"{GCS_DASK_PATH}segments_route_direction_crosswalk.parquet"
)

In [29]:
# Join to crosswalk before joining routlines to grab geometry
trips_m_crosswalk = trips_282.merge(
    crosswalk, how="inner", on=["calitp_itp_id", "route_id", "direction_id"]
)

In [30]:
trips_m_crosswalk.head(2)

Unnamed: 0,calitp_itp_id,trip_id,route_id,direction_id,shape_id,route_dir_identifier
0,282,11136210,30,0,204815,3971062270
1,282,11136289,30,0,204814,3971062270


In [31]:
# Merge routes for geometry, actual route length,
m1 = routelines.merge(
    trips_m_crosswalk,
    how="inner",
    on=["calitp_itp_id", "shape_id"],
)

In [32]:
m1.sort_values('trip_id').drop(columns = ["geometry"]).head()

Unnamed: 0,calitp_itp_id,shape_id,actual_route_length,trip_id,route_id,direction_id,route_dir_identifier
142,282,204811,3415.07,11136191,30,0,3971062270
48,282,204814,4915.15,11136192,30,0,3971062270
4,282,204812,8141.77,11136193,30,0,3971062270
0,282,204812,8141.77,11136194,30,0,3971062270
13,282,204812,8141.77,11136195,30,0,3971062270


#### how many rows in the trips table for that route? when you plot (choosetrip_id column to plot) it on a map, how many do you see?
* 282 trip ids. 
* 282 rows.
* Always plotting shape_id underneath, just the coloring is different.

In [33]:
len(m1)

282

In [34]:
m1.trip_id.nunique()

282

In [35]:
# m1.loc[m1.trip_id == "11136191"]

In [36]:
m1.explore(
    "trip_id",
    cmap="tab10",
    style_kwds={"weight": 6},
    height=400,
    width=600,
    legend=False,
)

#### how many unique route_ids? when you plot route_id on a map, how many do you see?
* Just one for both questions.

In [37]:
m1.route_id.nunique()

1

In [38]:
"""m1.explore(
    "route_id",
    cmap="tab10",
    style_kwds={"weight": 6},
    height=400,
    width=600,
    legend=True,
)"""

'm1.explore(\n    "route_id",\n    cmap="tab10",\n    style_kwds={"weight": 6},\n    height=400,\n    width=600,\n    legend=True,\n)'

### Shapes 
* Direction 1: going towards Financial District.
* Direction 0: going towards Presidio.
* The `route_length`  in `route_lines` is the length of the longest geometry: not for that particular segment.

In [39]:
longest_shape = gpd.read_parquet(f"{GCS_DASK_PATH}longest_shape_segments.parquet")

In [40]:
routelines.crs == longest_shape.crs

True

In [41]:
m2 = (m1.drop(columns=["geometry"]).merge(
        longest_shape,
        how="inner",
        on=["calitp_itp_id", "direction_id", "route_id", "route_dir_identifier"],
    )
    .drop_duplicates()
    .reset_index(drop=True)
)

In [49]:
len(m2), m2.longest_shape_id.nunique(), m2.shape_id.nunique()

(1410, 2, 12)

In [50]:
m2["longest_vs_actual_route"] = m2.actual_route_length / m2.route_length * 100

In [51]:
m3 = m2.set_geometry("geometry_arrowized")

In [52]:
len(m3.drop_duplicates(subset = ['trip_id', 'route_id', 'route_dir_identifier',]))

282

In [53]:
m3 = (m3.drop_duplicates(subset = ['trip_id', 'route_id', 'route_dir_identifier',]))

In [54]:
# Count number of segments in `longest_shape`.
m3_test = (
    m3.groupby(
        [
            "calitp_itp_id",
            "trip_id",
            "route_dir_identifier",
            "shape_id",
            "longest_shape_id",
            "actual_route_length",
            "route_length",
            "longest_vs_actual_route",
        ]
    )
    .agg({"segment_sequence": "nunique"})
    .rename(columns = {'segment_sequence':'total_segments'})
    .reset_index()
)

In [55]:
m3.drop(columns=["geometry", "geometry_arrowized"]).head(4)

Unnamed: 0,calitp_itp_id,shape_id,actual_route_length,trip_id,route_id,direction_id,route_dir_identifier,calitp_url_number,longest_shape_id,route_length,segment_sequence,longest_vs_actual_route
0,282,204812,8141.77,11136194,30,0,3971062270,0,204811,3415.07,0,238.41
4,282,204812,8141.77,11136207,30,0,3971062270,0,204811,3415.07,0,238.41
8,282,204812,8141.77,11136209,30,0,3971062270,0,204811,3415.07,0,238.41
12,282,204812,8141.77,11136205,30,0,3971062270,0,204811,3415.07,0,238.41


In [56]:
m3_test.head(2)

Unnamed: 0,calitp_itp_id,trip_id,route_dir_identifier,shape_id,longest_shape_id,actual_route_length,route_length,longest_vs_actual_route,total_segments
0,282,11136191,3971062270,204811,204811,3415.07,3415.07,100.0,1
1,282,11136192,3971062270,204814,204811,4915.15,3415.07,143.93,1


#### How many unique shape_ids? when you plot shape_id on a map, how many do you see?
* 12 shape ids.
* I can see 2 shape ids on the map (barely).
* However, all 12 are plotted.

In [57]:
m3.shape_id.nunique()

12

In [58]:
"""
m3.explore(
    "shape_id",
    cmap="tab10",
    style_kwds={"weight": 6},
    legend=True,
    width=800,
    height=400,
)"""

'\nm3.explore(\n    "shape_id",\n    cmap="tab10",\n    style_kwds={"weight": 6},\n    legend=True,\n    width=800,\n    height=400,\n)'

#### how many unique direction_ids? when you plot direction_id, how many do you see?
* 2 direction ids. 
* Looks a lot like what happens when plotting on `shape_id` column.

In [59]:
m3.direction_id.nunique()

2

In [60]:
"""m3.explore(
    "direction_id",
    cmap="tab10",
    style_kwds={"weight": 6},
    legend=True,
    width=800,
    height=400,
)"""

'm3.explore(\n    "direction_id",\n    cmap="tab10",\n    style_kwds={"weight": 6},\n    legend=True,\n    width=800,\n    height=400,\n)'

In [61]:
"""
m3.explore(
    "segment_sequence",
    cmap="tab10",
    style_kwds={"weight": 6},
    legend=False,
    width=800,
    height=400,
)"""

'\nm3.explore(\n    "segment_sequence",\n    cmap="tab10",\n    style_kwds={"weight": 6},\n    legend=False,\n    width=800,\n    height=400,\n)'

In [62]:
# m3.loc[m3.direction_id == "0"].explore("segment_sequence", cmap = 'tab10', style_kwds = {'weight':6}, legend = False, width = 800, height = 400)

In [63]:
# m3.loc[m3.direction_id == "1"].explore("segment_sequence", cmap = 'tab20', style_kwds = {'weight':6}, legend = False, width = 800, height = 400)

In [64]:
def find_operator_info(df):
    df = df.sort_values(["calitp_itp_id", "trip_id", "segment_sequence"])
    df = df.drop_duplicates(subset = ['trip_id', 'route_dir_identifier', 'segment_sequence',]).reset_index(drop = True)
    merge_cols = [
        "calitp_itp_id",
        "trip_id",
        "route_dir_identifier",
    ]

    # Get start time.
    start_time_trip = (
        df.groupby(merge_cols)
        .agg({"vehicle_timestamp": "min"})
        .rename(columns={"vehicle_timestamp": "start"})
        .reset_index()
    )

    # Get end time.
    end_time_trip = (
        df.groupby(merge_cols)
        .agg({"vehicle_timestamp": "max"})
        .rename(columns={"vehicle_timestamp": "end"})
        .reset_index()
    )

    # Count number of segments.
    segment_counts = (
        df.groupby(merge_cols)
        .agg({"segment_sequence": "nunique"})
        .reset_index()
        .rename(columns={"segment_sequence": "number_of_segments"})
    )

    # Merge
    m1 = start_time_trip.merge(end_time_trip, how="inner", on=merge_cols).merge(
        segment_counts, how="left", on=merge_cols
    )

    # Calculate time elapsed
    # https://stackoverflow.com/questions/51491724/calculate-difference-of-2-dates-in-minutes-in-pandas
    m1["minutes_elapsed"] = (m1.end - m1.start).dt.total_seconds() / 60

    return m1

In [65]:
operator = pd.read_parquet(
    f"{GCS_DASK_PATH}vp_sjoin/vp_segment_{agency}_{analysis_date}.parquet"
)

In [66]:
operator_info = find_operator_info(operator)

In [67]:
len(operator_info)

8518

In [68]:
operator_info.head()

Unnamed: 0,calitp_itp_id,trip_id,route_dir_identifier,start,end,number_of_segments,minutes_elapsed
0,282,11108746,4013410901,2022-10-12 15:05:36,2022-10-12 15:44:44,9,39.13
1,282,11108747,4013410901,2022-10-12 14:57:24,2022-10-12 15:37:58,9,40.57
2,282,11108748,4013410901,2022-10-12 14:49:23,2022-10-12 15:30:56,10,41.55
3,282,11108749,4013410901,2022-10-12 14:45:02,2022-10-12 15:27:01,10,41.98
4,282,11108750,4013410901,2022-10-12 14:36:35,2022-10-12 15:20:39,9,44.07


In [69]:
# Merge 
m4 = operator_info[
    [
        "calitp_itp_id",
        "trip_id",
        "route_dir_identifier",
        "number_of_segments",
        "minutes_elapsed",
    ]
].merge(
    m3_test,
    how="inner",
    on=["calitp_itp_id", "trip_id", "route_dir_identifier"],
)

In [70]:
# Find the total number of segments in the specific operator file
# vs. what was recorded in `longest_shape`
m4["segment_proportion"] = ((m4.number_of_segments / m4.total_segments) * 100).astype(
    "int64"
)

In [71]:
m4.sort_values('trip_id').sample(5)

Unnamed: 0,calitp_itp_id,trip_id,route_dir_identifier,number_of_segments,minutes_elapsed,shape_id,longest_shape_id,actual_route_length,route_length,longest_vs_actual_route,total_segments,segment_proportion
102,282,11136294,3971062270,4,22.08,204814,204811,4915.15,3415.07,143.93,1,400
170,282,11136363,2612439400,6,29.35,204822,204817,8694.37,5261.14,165.26,1,600
214,282,11136407,2612439400,4,16.3,204821,204817,4683.26,5261.14,89.02,1,400
195,282,11136388,2612439400,4,21.85,204823,204817,3332.92,5261.14,63.35,1,400
163,282,11136356,2612439400,6,32.37,204822,204817,8694.37,5261.14,165.26,1,600


#### Normal for trip_ids to be dropped? 

In [72]:
m4.shape

(279, 12)

In [73]:
m4.trip_id.nunique(), m3_test.trip_id.nunique(), m3.trip_id.nunique()

(279, 282, 282)

In [74]:
m4.shape_id.nunique(), m4.longest_shape_id.nunique()

(12, 2)

In [75]:
m4.longest_shape_id.nunique(), m4.shape_id.nunique()

(2, 12)