## Route ID 172, Operator 127 Test
* Santa Rosa - San Francisco

In [1]:
import geopandas as gpd
import pandas as pd
from calitp.sql import to_snakecase
from shared_utils import geography_utils



In [2]:
# Notebook-wide pandas display settings: up to 100 columns, two-decimal floats,
# and no limit on the number of rows or the width of a cell's contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
# GCS folders that hold the parquet inputs read in this notebook.
GCS_DASK_PATH = "gs://calitp-analytics-data/data-analyses/dask_test/"
GCS_RT_PATH = "gs://calitp-analytics-data/data-analyses/rt_delay/"

In [4]:
# Date string embedded in the names of the parquet files read below.
analysis_date = "2022-10-12"

In [5]:
# calitp_itp_id of the operator being analysed.
agency = 127

In [6]:
# route_id being analysed; kept as a string because it is compared against trips.route_id.
route_id = "172"

In [7]:
# shape_ids used to pick Route 172's rows out of the routelines table.
shape_ids_for_route_172 = ['1720021', '1720025']

### Route Map

In [8]:
# Load the cached routelines table for the analysis date. Its geometry is
# used further down to compute the actual length of each shape_id.
routelines = gpd.read_parquet(
    path=f"{GCS_RT_PATH}compiled_cached_views/routelines_{analysis_date}.parquet"
)

In [9]:
# Keep only this operator's shapes for Route 172.
# `calitp_url_number` is dropped first so that rows differing only in that
# column are treated as duplicates. De-duplication happens *before* the index
# reset so the resulting index is contiguous (0..n-1) with no gaps.
routelines_172 = (
    routelines.loc[
        (routelines.calitp_itp_id == agency)
        & (routelines.shape_id.isin(shape_ids_for_route_172))
    ]
    .drop(columns=["calitp_url_number"])
    .drop_duplicates()
    .reset_index(drop=True)
)

In [10]:
# Add each shape's length, measured after reprojecting the geometry to
# geography_utils.CA_NAD83Albers.
routelines_172 = routelines_172.assign(
    actual_route_length=lambda gdf: gdf.geometry.to_crs(
        geography_utils.CA_NAD83Albers
    ).length
)

In [11]:
# Number of route shapes left after filtering and de-duplication.
len(routelines_172)

2

In [12]:
# Interactive map of the Route 172 shapes.
routelines_172.explore(
    cmap="tab10",
    style_kwds={"weight": 6},
    height=400,
    width=600,
    legend=True,
)

### Trip Map

In [13]:
#trips = pd.read_parquet(
#   f"{GCS_RT_PATH}compiled_cached_views/trips_{analysis_date}.parquet"
#) 

In [14]:
# Read only the columns used below instead of loading the whole trips table
# and subsetting it afterwards.
trips = pd.read_parquet(
    f"{GCS_RT_PATH}compiled_cached_views/trips_{analysis_date}.parquet",
    columns=[
        "calitp_itp_id",
        "direction_id",
        "trip_key",
        "trip_id",
        "route_id",
        "shape_id",
        "route_long_name",
    ],
)

In [15]:
# Trips that belong to this operator and to Route 172.
trips_172 = trips.loc[
    lambda t: (t.calitp_itp_id == agency) & (t.route_id == route_id)
].reset_index(drop=True)

In [16]:
# trips_172 = trips[['calitp_itp_id','direction_id','trip_key','trip_id','route_id','shape_id', 'route_long_name']]

In [17]:
# (rows, columns) of the Route 172 trips table.
trips_172.shape

(8, 7)

In [18]:
# Display every Route 172 trip row.
trips_172

Unnamed: 0,calitp_itp_id,direction_id,trip_key,trip_id,route_id,shape_id,route_long_name
0,127,0,4521974190285428778,7992973,172,1720021,Santa Rosa - San Francisco
1,127,0,5714658137335035394,7992977,172,1720021,Santa Rosa - San Francisco
2,127,0,-640579009830274701,7992975,172,1720021,Santa Rosa - San Francisco
3,127,1,-7829901120040421166,7992981,172,1720025,Santa Rosa - San Francisco
4,127,1,-6625595916336578425,7992978,172,1720025,Santa Rosa - San Francisco
5,127,0,-2485208027597198257,7992972,172,1720021,Santa Rosa - San Francisco
6,127,1,-4659794943976976419,7992983,172,1720025,Santa Rosa - San Francisco
7,127,1,-177721503255462243,7992979,172,1720025,Santa Rosa - San Francisco


In [19]:
# Crosswalk that is joined to trips on operator, route and direction below.
crosswalk = pd.read_parquet(
    path=f"{GCS_DASK_PATH}segments_route_direction_crosswalk.parquet"
)

In [20]:
# Attach the crosswalk to the trips first; the routelines (which carry the
# geometry) are joined in the next step.
trips_m_crosswalk = pd.merge(
    trips_172,
    crosswalk,
    on=["calitp_itp_id", "route_id", "direction_id"],
    how="inner",
)

In [21]:
# Routelines on the left so the merged result keeps the shape geometry;
# one row per trip, matched on operator and shape_id.
m1 = routelines_172.merge(
    right=trips_m_crosswalk,
    on=["calitp_itp_id", "shape_id"],
    how="inner",
)

In [22]:
# Tabular view of the merge (geometry omitted), ordered by trip.
m1.drop(columns="geometry").sort_values(by="trip_id")

Unnamed: 0,calitp_itp_id,shape_id,actual_route_length,direction_id,trip_key,trip_id,route_id,route_long_name,route_dir_identifier
7,127,1720021,100298.8,0,-2485208027597198257,7992972,172,Santa Rosa - San Francisco,1808907172
4,127,1720021,100298.8,0,4521974190285428778,7992973,172,Santa Rosa - San Francisco,1808907172
6,127,1720021,100298.8,0,-640579009830274701,7992975,172,Santa Rosa - San Francisco,1808907172
5,127,1720021,100298.8,0,5714658137335035394,7992977,172,Santa Rosa - San Francisco,1808907172
1,127,1720025,101102.72,1,-6625595916336578425,7992978,172,Santa Rosa - San Francisco,483822386
3,127,1720025,101102.72,1,-177721503255462243,7992979,172,Santa Rosa - San Francisco,483822386
0,127,1720025,101102.72,1,-7829901120040421166,7992981,172,Santa Rosa - San Francisco,483822386
2,127,1720025,101102.72,1,-4659794943976976419,7992983,172,Santa Rosa - San Francisco,483822386


In [23]:
# Interactive map of the route shapes, coloured by trip_id.
m1.explore(
    column="trip_id",
    cmap="tab10",
    style_kwds={"weight": 6},
    height=400,
    width=600,
    legend=True,
)

In [24]:
# m1.loc[m1.route_dir_identifier == 483822386].explore("trip_id",cmap = 'tab10', style_kwds = {'weight':6}, height=400, width = 600, legend  = True)

In [25]:
# m1.loc[m1.route_dir_identifier == 1808907172].explore("trip_id",cmap = 'tab10', style_kwds = {'weight':6}, height=400, width = 600, legend  = True)

### Shapes 
* Direction 1: going to San Francisco
* Direction 0: going towards Santa Rosa

In [26]:
# Segment table that the trips are merged onto in the cells below.
longest_shape = gpd.read_parquet(
    path=f"{GCS_DASK_PATH}longest_shape_segments.parquet"
)

In [27]:
# Row counts of the two frames that are merged in the next cell.
len(longest_shape), len(m1)

(126896, 8)

In [28]:
# Pair every trip with the segments of its operator / route / direction.
# m1's own geometry is dropped so only longest_shape's geometry columns remain.
m2 = (
    longest_shape.merge(
        m1.drop(columns="geometry"),
        on=["calitp_itp_id", "direction_id", "route_id", "route_dir_identifier"],
        how="inner",
    )
    .drop_duplicates()
    .reset_index(drop=True)
)

In [29]:
# Actual shape length as a percentage of the route_length stored in longest_shape.
m2["longest_vs_actual_route"] = m2["actual_route_length"].div(m2["route_length"]).mul(100)

In [30]:
# Row count after merging the trips onto the longest-shape segments.
len(m2)

812

In [31]:
# Same rows as m2, with geometry_arrowized as the active geometry column.
m3 = m2.set_geometry(col="geometry_arrowized")

In [32]:
# Number of distinct shape_ids present in m2.
f"There are {m2.shape_id.nunique()} shape_ids"

'There are 2 shape_ids'

In [33]:
# Number of distinct trip_ids present in m2.
f"There are {m2.trip_id.nunique()} trips"

'There are 8 trips'

In [34]:
# Preview the attribute columns only (both geometry columns omitted).
m3.drop(["geometry", "geometry_arrowized"], axis=1).head()

Unnamed: 0,calitp_itp_id,calitp_url_number,route_id,direction_id,longest_shape_id,route_dir_identifier,route_length,segment_sequence,shape_id,actual_route_length,trip_key,trip_id,route_long_name,longest_vs_actual_route
0,127,1,172,0,1720021,1808907172,100298.8,0,1720021,100298.8,4521974190285428778,7992973,Santa Rosa - San Francisco,100.0
1,127,1,172,0,1720021,1808907172,100298.8,0,1720021,100298.8,5714658137335035394,7992977,Santa Rosa - San Francisco,100.0
2,127,1,172,0,1720021,1808907172,100298.8,0,1720021,100298.8,-640579009830274701,7992975,Santa Rosa - San Francisco,100.0
3,127,1,172,0,1720021,1808907172,100298.8,0,1720021,100298.8,-2485208027597198257,7992972,Santa Rosa - San Francisco,100.0
4,127,1,172,0,1720021,1808907172,100298.8,1,1720021,100298.8,4521974190285428778,7992973,Santa Rosa - San Francisco,100.0


In [35]:
# Give m3 the same CRS as m2 (m3's active geometry was switched to
# geometry_arrowized above).
# NOTE(review): newer geopandas versions prefer set_crs(..., allow_override=True)
# over assigning .crs directly — confirm against the installed version.
m3.crs = m2.crs

In [36]:
# Interactive map of the arrowized segments, coloured by segment_sequence.
m3.explore(
    column="segment_sequence",
    cmap="tab10",
    style_kwds={"weight": 6},
    legend=False,
    width=800,
    height=400,
)

In [37]:
# m3.loc[m3.direction_id == "0"].explore("segment_sequence", cmap = 'tab10', style_kwds = {'weight':6}, legend = False, width = 800, height = 400)

In [38]:
# m3.loc[m3.direction_id == "1"].explore("segment_sequence", cmap = 'tab20', style_kwds = {'weight':6}, legend = False, width = 800, height = 400)

In [39]:
def find_operator_info(df):
    """
    Summarise timestamped rows into one row per trip.

    Rows are grouped by `calitp_itp_id`, `trip_id` and
    `route_dir_identifier`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the three grouping columns plus `vehicle_timestamp`
        (datetime-like, so that `end - start` is a timedelta) and
        `segment_sequence`.

    Returns
    -------
    pandas.DataFrame
        One row per group, ordered by the grouping keys, with columns:
        the three grouping columns, `start` (earliest `vehicle_timestamp`),
        `end` (latest `vehicle_timestamp`), `number_of_segments` (distinct
        `segment_sequence` values) and `minutes_elapsed` (`end - start`
        expressed in minutes).
    """
    group_cols = [
        "calitp_itp_id",
        "trip_id",
        "route_dir_identifier",
    ]

    # One grouped pass produces all three statistics. min / max / nunique do
    # not depend on row order, so no up-front sort is needed.
    trip_summary = (
        df.groupby(group_cols)
        .agg(
            start=("vehicle_timestamp", "min"),
            end=("vehicle_timestamp", "max"),
            number_of_segments=("segment_sequence", "nunique"),
        )
        .reset_index()
    )

    # Calculate time elapsed
    # https://stackoverflow.com/questions/51491724/calculate-difference-of-2-dates-in-minutes-in-pandas
    trip_summary["minutes_elapsed"] = (
        trip_summary["end"] - trip_summary["start"]
    ).dt.total_seconds() / 60

    return trip_summary

In [40]:
# Load this operator's vp_segment parquet for the analysis date; it is the
# input to find_operator_info.
operator = pd.read_parquet(
    path=f"{GCS_DASK_PATH}vp_sjoin/vp_segment_{agency}_{analysis_date}.parquet"
)

In [41]:
# One summary row per trip: first/last timestamp, distinct segment count and minutes elapsed.
operator_info = find_operator_info(operator)

In [42]:
# Preview the per-trip summary.
operator_info.head()

Unnamed: 0,calitp_itp_id,trip_id,route_dir_identifier,start,end,number_of_segments,minutes_elapsed
0,127,7992972,1808907172,2022-10-12 14:32:26,2022-10-12 17:31:57,98,179.52
1,127,7992973,1808907172,2022-10-12 15:24:56,2022-10-12 18:10:07,101,165.18
2,127,7992975,1808907172,2022-10-12 15:55:36,2022-10-12 19:01:48,100,186.2
3,127,7992977,1808907172,2022-10-12 15:18:56,2022-10-12 19:41:38,97,262.7
4,127,7992978,483822386,2022-10-12 04:09:42,2022-10-12 06:00:03,100,110.35


In [43]:
# Count the segment rows attached to each trip in m3.
# NOTE(review): this uses "count" whereas find_operator_info uses "nunique";
# confirm the two figures are meant to be compared directly.
m3_test = (
    m3.groupby(
        [
            "calitp_itp_id",
            "trip_id",
            "route_dir_identifier",
            "route_length",
            "shape_id",
            "longest_shape_id",
            "actual_route_length",
        ]
    )
    .agg({"segment_sequence": "count"})
    .reset_index()
)

In [44]:
# Preview the per-trip segment counts.
m3_test.head()

Unnamed: 0,calitp_itp_id,trip_id,route_dir_identifier,route_length,shape_id,longest_shape_id,actual_route_length,segment_sequence
0,127,7992972,1808907172,100298.8,1720021,1720021,100298.8,101
1,127,7992973,1808907172,100298.8,1720021,1720021,100298.8,101
2,127,7992975,1808907172,100298.8,1720021,1720021,100298.8,101
3,127,7992977,1808907172,100298.8,1720021,1720021,100298.8,101
4,127,7992978,483822386,101102.72,1720025,1720025,101102.72,102


In [45]:
# Put the per-trip summary from the operator file next to the per-trip
# segment counts from m3, matched on operator, trip and route direction.
m4 = pd.merge(
    operator_info[
        [
            "calitp_itp_id",
            "trip_id",
            "route_dir_identifier",
            "number_of_segments",
            "minutes_elapsed",
        ]
    ],
    m3_test,
    on=["calitp_itp_id", "trip_id", "route_dir_identifier"],
    how="inner",
)

In [46]:
# (rows, columns) after joining the two per-trip tables.
m4.shape

(8, 10)

In [47]:
# Segments seen in the operator file as a percentage of the segments recorded
# in `longest_shape`; the cast to int64 truncates the fractional part.
m4["segment_proportion"] = (
    m4["number_of_segments"].div(m4["segment_sequence"]).mul(100).astype("int64")
)

In [48]:
# route_length (from longest_shape) as a percentage of the actual shape length.
m4["actual_vs_longest_route_length"] = (
    m4["route_length"].div(m4["actual_route_length"]).mul(100)
)

In [49]:
# Display the final per-trip comparison table.
m4

Unnamed: 0,calitp_itp_id,trip_id,route_dir_identifier,number_of_segments,minutes_elapsed,route_length,shape_id,longest_shape_id,actual_route_length,segment_sequence,segment_proportion,actual_vs_longest_route_length
0,127,7992972,1808907172,98,179.52,100298.8,1720021,1720021,100298.8,101,97,100.0
1,127,7992973,1808907172,101,165.18,100298.8,1720021,1720021,100298.8,101,100,100.0
2,127,7992975,1808907172,100,186.2,100298.8,1720021,1720021,100298.8,101,99,100.0
3,127,7992977,1808907172,97,262.7,100298.8,1720021,1720021,100298.8,101,96,100.0
4,127,7992978,483822386,100,110.35,101102.72,1720025,1720025,101102.72,102,98,100.0
5,127,7992979,483822386,100,151.02,101102.72,1720025,1720025,101102.72,102,98,100.0
6,127,7992981,483822386,99,98.17,101102.72,1720025,1720025,101102.72,102,97,100.0
7,127,7992983,483822386,100,153.85,101102.72,1720025,1720025,101102.72,102,98,100.0
