## Switch Direction 0/1 to Cardinal
* direction_id -> cardinal direction. i want to use stop_times_with_direction and just count, by route-id/direction-id how many stops in each stop_primary_direction group. then whatever is pluraltiy, assign that.
* you can take a look at the gtfs_analytics_catalog and find that file and just open it. the script to create it is in gtfs_funnel already, and you can see what that column means.
* https://github.com/cal-itp/data-analyses/blob/main/_shared_utils/shared_utils/gtfs_analytics_data.yml
* https://github.com/cal-itp/data-analyses/blob/main/gtfs_funnel/stop_times_with_direction.py


In [1]:
from datetime import datetime

import _section2_utils as section2_utils
import geopandas as gpd
import numpy as np
import pandas as pd
from IPython.display import HTML, Image, Markdown, display, display_html
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)
from shared_utils import catalog_utils, rt_dates, rt_utils

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
org_name = "Los Angeles County Metropolitan Transportation Authority"

### Why are there so many unknowns in the Cardinal Direction?
* Because I need to clean up route names using [this script](https://github.com/cal-itp/data-analyses/blob/b1e5d4f870400251240eeba4a6515a0848e5d6f8/gtfs_funnel/clean_route_naming.py#L4).

In [4]:
sched_vp_df = section2_utils.load_schedule_vp_metrics(org_name)

In [5]:
sched_vp_df["Cardinal Direction"].value_counts()

Unknown       9372
Westbound      132
Eastbound      131
Northbound      66
Southbound      66
Name: Cardinal Direction, dtype: int64

In [6]:
gtfs_keys = list(sched_vp_df.schedule_gtfs_dataset_key.unique())

In [7]:
date = section2_utils.load_most_current_date()

In [8]:
date

'2024-04-17'

In [9]:
all_dates_list = list(sched_vp_df.Date.unique())

In [10]:
all_dates_list = [np.datetime_as_string(date, unit="D") for date in all_dates_list]

In [11]:
all_dates_list

['2023-03-15',
 '2023-07-12',
 '2023-08-15',
 '2023-09-13',
 '2023-10-11',
 '2023-11-15',
 '2023-12-13',
 '2024-01-17',
 '2024-02-14',
 '2024-03-13',
 '2024-04-17',
 '2023-04-12',
 '2023-05-17',
 '2023-06-14']

### Use more than one `schedule_gtfs_dataset_key` & date

In [12]:
def merge_scheduled_stop_times(date: str, gtfs_schedule_key: list) -> pd.DataFrame:
    stop_times_col = [
        "feed_key",
        "stop_id",
        "stop_sequence",
        "schedule_gtfs_dataset_key",
        "trip_instance_key",
        "shape_array_key",
        "stop_name",
        "prior_stop_sequence",
        "subseq_stop_sequence",
        "stop_pair",
        "stop_pair_name",
        "stop_primary_direction",
        "stop_meters",
    ]
    stop_times_df = helpers.import_scheduled_stop_times(
        date,
        filters=[[("schedule_gtfs_dataset_key", "in", gtfs_schedule_key)]],
        columns=stop_times_col,
        get_pandas=True,
        with_direction=True,
    )

    scheduled_trips_df = helpers.import_scheduled_trips(
        date,
        filters=[[("gtfs_dataset_key", "in", gtfs_schedule_key)]],
        columns=[
            "route_id",
            "trip_instance_key",
            "gtfs_dataset_key",
            "shape_array_key",
            "direction_id",
        ],
    )

    m1 = pd.merge(
        scheduled_trips_df,
        stop_times_df,
        on=["trip_instance_key", "schedule_gtfs_dataset_key", "shape_array_key"],
        how="inner",
    )

    return m1

In [13]:
def stack_scheduled_stop(date_list: list, gtfs_schedule_key: list) -> pd.DataFrame:
    full_df = pd.DataFrame()
    for i in date_list:
        df = merge_scheduled_stop_times(i, gtfs_schedule_key)
        df["date"] = i
        full_df = pd.concat([full_df, df], axis=0)
    return full_df

In [14]:
all_dates = stack_scheduled_stop(all_dates_list, gtfs_keys)

In [15]:
all_dates.shape

(12325584, 16)

In [16]:
all_dates.head(2)

Unnamed: 0,route_id,trip_instance_key,schedule_gtfs_dataset_key,shape_array_key,direction_id,feed_key,stop_id,stop_sequence,stop_name,prior_stop_sequence,subseq_stop_sequence,stop_pair,stop_pair_name,stop_primary_direction,stop_meters,date
0,2-13167,00e0690afc5d3fbcca6d6161dc3792e1,3f3f36b4c41cc6b5df3eb7f5d8ea6e3c,7eae459a245a6741e5c4c09c67598613,0.0,1dce186c157f55ed353f9bd8bf6f43b6,11426,1,Le Conte / Broxton,,2,11426__2939,Le Conte / Broxton__Le Conte / Westwood,Unknown,,2023-03-15
1,2-13167,00e0690afc5d3fbcca6d6161dc3792e1,3f3f36b4c41cc6b5df3eb7f5d8ea6e3c,7eae459a245a6741e5c4c09c67598613,0.0,1dce186c157f55ed353f9bd8bf6f43b6,2939,2,Le Conte / Westwood,1.0,3,2939__2938,Le Conte / Westwood__Le Conte / Tiverton,Eastbound,159.45,2023-03-15


### Aggregate
* DO we want to aggregate by date? 

In [17]:
# Count total stops
agg1 = (
    all_dates.groupby(
        [   
            "route_id",
            "schedule_gtfs_dataset_key",
            "direction_id",
            "stop_primary_direction",
        ]
    )
    .agg({"stop_sequence": "count"})
    .reset_index()
)

In [20]:
agg1.shape

(3043, 5)

In [22]:
# Sort and drop duplicates so that the
# largest # of stops by stop_primary_direction is kept
agg2 = agg1.sort_values(
    by=["route_id", "schedule_gtfs_dataset_key", "direction_id", "stop_sequence"],
    ascending=[True, True, True, False],
)

In [23]:
cardinal_dir_df = agg2.drop_duplicates(
    subset=["route_id", "schedule_gtfs_dataset_key", "direction_id",]
).reset_index(drop=True)

In [24]:
cardinal_dir_df.shape

(692, 5)

In [25]:
routes1 = set(sched_vp_df["Route ID"].unique().tolist())
routes2 = set(cardinal_dir_df.route_id.unique().tolist())

In [26]:
len(routes1)

120

In [27]:
len(routes1 - routes2)

113

In [28]:
len(routes2 - routes1)

339

### Understanding `/gtfs_funnel/clean_route_naming.py`
* https://github.com/cal-itp/data-analyses/blob/main/gtfs_funnel/clean_route_naming.py

In [29]:
import sys

sys.path.append("../gtfs_funnel")
import clean_route_naming

#### Breaking out `concatenate_routes_across_dates`
* This would replace the `helpers.import_scheduled_trips` in my `merge_scheduled_stop_times` function.

In [30]:
gtfs_funnel_scheduled_trips = pd.concat(
    [
        helpers.import_scheduled_trips(
            analysis_date,
            filters=[[("gtfs_dataset_key", "in", gtfs_keys)]],
            columns=[
                "route_id",
                "trip_instance_key",
                "gtfs_dataset_key",
                "shape_array_key",
                "direction_id",
                "route_long_name",
                "route_short_name",
                "route_desc",
                "name",
            ],
            get_pandas=True,
        ).assign(service_date=pd.to_datetime(analysis_date))
        for analysis_date in all_dates_list
    ],
    axis=0,
    ignore_index=True,
)

In [31]:
gtfs_funnel_scheduled_trips = gtfs_funnel_scheduled_trips.assign(
    route_id=gtfs_funnel_scheduled_trips.route_id.fillna(""),
    route_short_name=gtfs_funnel_scheduled_trips.route_short_name.fillna(""),
    route_long_name=gtfs_funnel_scheduled_trips.route_long_name.fillna(""),
)

In [32]:
gtfs_funnel_scheduled_trips = gtfs_funnel_scheduled_trips.assign(
    combined_name=gtfs_funnel_scheduled_trips.route_short_name
    + "__"
    + gtfs_funnel_scheduled_trips.route_long_name
)

In [33]:
gtfs_schedule_wrangling.standardize_route_id??

[0;31mSignature:[0m
[0mgtfs_schedule_wrangling[0m[0;34m.[0m[0mstandardize_route_id[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mrow[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mgtfs_name_col[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mroute_col[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0mstr[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
[0;32mdef[0m [0mstandardize_route_id[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mrow[0m[0;34m,[0m [0;34m[0m
[0;34m[0m    [0mgtfs_name_col[0m[0;34m:[0m [0mstr[0m[0;34m,[0m [0;34m[0m
[0;34m[0m    [0mroute_col[0m[0;34m:[0m [0mstr[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0mstr[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34m"""[0m
[0;34m    Standardize route_id across time. [0m
[0;34m    For certain operators, we can parse away the suffix after an[0m
[0;34m    hyphen or underscore.[0m
[0;34m    Must include a column t

In [34]:
gtfs_funnel_scheduled_trips = gtfs_funnel_scheduled_trips.assign(
    route_id2=gtfs_funnel_scheduled_trips.apply(
        lambda x: gtfs_schedule_wrangling.standardize_route_id(x, "name", "route_id"),
        axis=1,
    )
)

In [35]:
gtfs_funnel_scheduled_trips.head(2)

Unnamed: 0,route_id,trip_instance_key,schedule_gtfs_dataset_key,shape_array_key,direction_id,route_long_name,route_short_name,route_desc,name,service_date,combined_name,route_id2
0,2-13167,00e0690afc5d3fbcca6d6161dc3792e1,3f3f36b4c41cc6b5df3eb7f5d8ea6e3c,7eae459a245a6741e5c4c09c67598613,0.0,Metro Local Line,2,WESTWOOD - EXPOSITION PARK VIA SUNSET-ALVARADO,LA Metro Bus Schedule,2023-03-15,2__Metro Local Line,2
1,2-13167,6f138c4f993587f689182dfd7a17c0f0,3f3f36b4c41cc6b5df3eb7f5d8ea6e3c,7eae459a245a6741e5c4c09c67598613,0.0,Metro Local Line,2,WESTWOOD - EXPOSITION PARK VIA SUNSET-ALVARADO,LA Metro Bus Schedule,2023-03-15,2__Metro Local Line,2


#### `gtfs_schedule_wrangling.most_recent_route_info()`

In [36]:
gtfs_schedule_wrangling.most_recent_route_info??

[0;31mSignature:[0m
[0mgtfs_schedule_wrangling[0m[0;34m.[0m[0mmost_recent_route_info[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdf[0m[0;34m:[0m [0mpandas[0m[0;34m.[0m[0mcore[0m[0;34m.[0m[0mframe[0m[0;34m.[0m[0mDataFrame[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mgroup_cols[0m[0;34m:[0m [0mlist[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mroute_col[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0mpandas[0m[0;34m.[0m[0mcore[0m[0;34m.[0m[0mframe[0m[0;34m.[0m[0mDataFrame[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
[0;32mdef[0m [0mmost_recent_route_info[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdf[0m[0;34m:[0m [0mpd[0m[0;34m.[0m[0mDataFrame[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mgroup_cols[0m[0;34m:[0m [0mlist[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mroute_col[0m[0;34m:[0m [0mstr[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0mpd[0m[0;34m.[0m[0mDa

In [37]:
route_cols = ["schedule_gtfs_dataset_key", "name", "route_id2"]

In [38]:
sort_order = [True for c in  route_cols]

In [39]:
most_recent = (gtfs_funnel_scheduled_trips.sort_values(route_cols + ["service_date"], 
                                  ascending = sort_order + [False])
                   .drop_duplicates(subset = route_cols)  
                   .rename(columns = {"combined_name": "recent_combined_name"})
                  )

In [40]:
most_recent.head(2)

Unnamed: 0,route_id,trip_instance_key,schedule_gtfs_dataset_key,shape_array_key,direction_id,route_long_name,route_short_name,route_desc,name,service_date,recent_combined_name,route_id2
166027,801,06c25013b54dacd56e7f3b5f2d22612c,2a0571758141f412b6a546fd70a65bf3,30b0857c1b7dd7db8665329f159ecbca,0.0,Metro A Line,,,LA Metro Rail Schedule,2024-04-17,__Metro A Line,801
165229,802,96b585442d46d500c8a637b45009d878,2a0571758141f412b6a546fd70a65bf3,6d9d1687f4fac7c36bae295e58d66d8f,0.0,Metro B Line,,,LA Metro Rail Schedule,2024-04-17,__Metro B Line,802


In [41]:
most_recent.shape, most_recent.recent_combined_name.nunique()

((121, 12), 121)

In [42]:
most_recent2 = (most_recent.sort_values(["schedule_gtfs_dataset_key", "name", 
                      "recent_combined_name"] + ["service_date"], 
                                  ascending = sort_order + [False])
                   .drop_duplicates(subset = route_cols)  
                   .rename(columns = {"route_id2": "recent_route_id2"})
                  )

In [43]:
most_recent2.shape, most_recent2.recent_route_id2.nunique()

((121, 12), 121)

### Test Merge w/ `sched_vp_df` w/ `standardized_route_ids.parquet`
* DO I use `recent_route_id2` since it's the last col created?

In [44]:
std_route_names = (
    f"{SCHED_GCS}{GTFS_DATA_DICT.schedule_tables.route_identification}.parquet"
)

In [45]:
#std_route_names_df = pd.read_parquet(std_route_names)

In [46]:
#std_route_names_df = std_route_names_df.loc[
#    std_route_names_df.schedule_gtfs_dataset_key.isin(gtfs_keys)
#]

In [47]:
# std_route_names_df.shape, std_route_names_df.route_id2.nunique(), std_route_names_df.recent_combined_name.nunique()

In [48]:
len(cardinal_dir_df)

692

In [49]:
# Delete out cols
sched_vp_df = sched_vp_df.drop(columns=["Cardinal Direction"])

#### Why does this explode the dataframe to be 3x larger?

In [50]:
sched_vp_df['Route ID'].unique(), sched_vp_df['Route ID'].nunique()

(array(['801', '802', '803', '804', '805', '806', '807', '10', '102',
        '105', '106', '108', '110', '111', '115', '117', '120', '125',
        '127', '128', '14', '150', '152', '154', '155', '158', '16', '161',
        '162', '164', '165', '166', '167', '169', '177', '179', '18',
        '180', '182', '2', '20', '202', '204', '205', '206', '207', '209',
        '210', '211', '212', '217', '218', '222', '224', '230', '232',
        '233', '234', '236', '237', '240', '242', '244', '246', '251',
        '256', '258', '260', '265', '266', '267', '268', '28', '287',
        '294', '30', '33', '344', '35', '4', '40', '45', '460', '487',
        '501', '51', '53', '55', '550', '577', '60', '601', '602', '603',
        '605', '611', '617', '62', '66', '660', '662', '665', '686', '690',
        '70', '720', '754', '76', '761', '78', '81', '854', '857', '90',
        '901', '910', '92', '94', '96', 'DSE'], dtype=object),
 120)

In [51]:
most_recent2.recent_route_id2.unique(), most_recent2['recent_route_id2'].nunique()

(array(['801', '802', '803', '805', '804', '806', '807', '10', '102',
        '105', '106', '108', '110', '111', '115', '117', '120', '125',
        '127', '128', '134', '14', '150', '152', '154', '155', '158',
        '161', '162', '164', '165', '166', '167', '169', '16', '177',
        '179', '180', '182', '18', '202', '204', '205', '206', '207',
        '209', '20', '210', '211', '212', '217', '218', '222', '224',
        '230', '232', '233', '234', '236', '237', '240', '242', '244',
        '246', '251', '256', '258', '260', '265', '266', '267', '268',
        '287', '28', '294', '2', '30', '33', '344', '35', '40', '45',
        '460', '487', '4', '501', '51', '53', '550', '55', '577', '601',
        '602', '603', '605', '60', '611', '617', '62', '660', '662', '665',
        '66', '686', '690', '70', '720', '754', '761', '76', '78', '81',
        '857', '90', '92', '94', '96', 'DSE', '901', '910', '854'],
       dtype=object),
 121)

#### Seeing what are the best columns to match over for names.

In [52]:
routes3 = set(most_recent2.recent_route_id2.unique().tolist())

In [53]:
most_recent2.columns

Index(['route_id', 'trip_instance_key', 'schedule_gtfs_dataset_key',
       'shape_array_key', 'direction_id', 'route_long_name',
       'route_short_name', 'route_desc', 'name', 'service_date',
       'recent_combined_name', 'recent_route_id2'],
      dtype='object')

In [54]:
route_names1 = set(most_recent2.route_long_name.unique().tolist())

In [55]:
route_names2 = set(sched_vp_df.route_long_name.unique().tolist())

In [56]:
route_names2 - route_names1

{'C & K Line Link (857)',
 'Metro A Line (Blue)',
 'Metro A-Line',
 'Metro B Line (Red)',
 'Metro B-Line',
 'Metro C Line (Green)',
 'Metro C-Line',
 'Metro D Line (Purple)',
 'Metro D-Line',
 'Metro E-Line',
 'Metro G Line (Orange) 901',
 'Metro G-Line 901',
 'Metro J Line (Silver) 910/950',
 'Metro J-Line 910/950',
 'Metro K Line (Crenshaw)',
 'Metro K-Line',
 'Metro L Line (Gold)'}

In [57]:
route_names1 - route_names2

set()

In [58]:
len(routes3)

121

In [59]:
routes4 = set(most_recent2.route_id.unique().tolist())

In [60]:
len(routes1), len(routes3)

(120, 121)

In [61]:
routes3-routes1

{'134'}

In [62]:
routes1-routes3

set()

In [63]:
pd.merge(
    sched_vp_df,
    most_recent2,
    left_on=["schedule_gtfs_dataset_key", "Direction","Route ID"],
    right_on=["schedule_gtfs_dataset_key", "direction_id","recent_route_id2"],
    how="left",
    indicator=True,
)[["_merge"]].value_counts()

_merge    
both          4884
left_only     4883
right_only       0
dtype: int64

In [70]:
pd.merge(
most_recent2,
cardinal_dir_df,
left_on = "recent_route_id2",
right_on = "route_id",
how = "left",
indicator = True)[["_merge"]].value_counts()

_merge    
left_only     114
both           14
right_only      0
dtype: int64

In [75]:
most_recent2.head(1)

Unnamed: 0,route_id,trip_instance_key,schedule_gtfs_dataset_key,shape_array_key,direction_id,route_long_name,route_short_name,route_desc,name,service_date,recent_combined_name,recent_route_id2
166027,801,06c25013b54dacd56e7f3b5f2d22612c,2a0571758141f412b6a546fd70a65bf3,30b0857c1b7dd7db8665329f159ecbca,0.0,Metro A Line,,,LA Metro Rail Schedule,2024-04-17,__Metro A Line,801


In [79]:
most_recent2.shape, most_recent2.route_id.nunique()

((121, 12), 121)

In [80]:
pd.merge(
most_recent2,
cardinal_dir_df,
on = ["route_id"],
how = "outer",
indicator = True)[["_merge"]].value_counts()

_merge    
right_only    450
both          242
left_only       0
dtype: int64

In [84]:
cardinal_dir_df2 = pd.merge(
most_recent2.drop(columns = ['direction_id']),
cardinal_dir_df,
on = ["schedule_gtfs_dataset_key", "route_id"],
how = "inner")

In [86]:
cardinal_dir_df2.head(2)

Unnamed: 0,route_id,trip_instance_key,schedule_gtfs_dataset_key,shape_array_key,route_long_name,route_short_name,route_desc,name,service_date,recent_combined_name,recent_route_id2,direction_id,stop_primary_direction,stop_sequence
0,801,06c25013b54dacd56e7f3b5f2d22612c,2a0571758141f412b6a546fd70a65bf3,30b0857c1b7dd7db8665329f159ecbca,Metro A Line,,,LA Metro Rail Schedule,2024-04-17,__Metro A Line,801,0.0,Northbound,29597
1,801,06c25013b54dacd56e7f3b5f2d22612c,2a0571758141f412b6a546fd70a65bf3,30b0857c1b7dd7db8665329f159ecbca,Metro A Line,,,LA Metro Rail Schedule,2024-04-17,__Metro A Line,801,1.0,Southbound,30160


In [87]:
cardinal_dir_df2.shape, cardinal_dir_df2.route_id.nunique()

((242, 14), 121)

In [88]:
pd.merge(
    sched_vp_df,
    cardinal_dir_df2,
    left_on=["schedule_gtfs_dataset_key", "Direction","Route ID", "route_long_name"],
    right_on=["schedule_gtfs_dataset_key", "direction_id","recent_route_id2", "route_long_name"],
    how="left",
    indicator=True,
)[["_merge"]].value_counts()

_merge    
both          9384
left_only      383
right_only       0
dtype: int64

In [90]:
len(sched_vp_df)

9767

In [93]:
m1 = pd.merge(
    sched_vp_df,
    cardinal_dir_df2,
    left_on=["schedule_gtfs_dataset_key", "Direction","Route ID", "route_long_name"],
    right_on=["schedule_gtfs_dataset_key", "direction_id","recent_route_id2", "route_long_name"],
    how="left",
)

In [94]:
m1.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,Direction,Period,Average Scheduled Service (trip minutes),Average Stop Distance (miles),# scheduled trips,Trips per Hour,is_express,is_rapid,is_rail,is_coverage,is_downtown_local,is_local,Date,Route typology,# Minutes with 1+ VP per Minute,# Minutes with 2+ VP per Minute,Aggregate Actual Service Minutes,Aggregate Scheduled Service Minutes (all trips),# VP,# VP within Scheduled Shape,# Early Arrival Trips,# On-Time Trips,# Late Trips,# Trips with VP,Average VP per Minute,% VP within Scheduled Shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,% Scheduled Trip w/ 1+ VP/Minute,% Scheduled Trip w/ 2+ VP/Minute,Realtime versus Scheduled Service Ratio,Average Actual Service (Trip Minutes),GTFS Availability,Speed (MPH),Transit Operator,route_long_name,route_short_name_x,Route,Route ID,schedule_source_record_id,Base64 Encoded Feed URL,Organization ID,Organization,District,ruler_100_pct,ruler_for_vp_per_min,frequency_in_minutes,route_id,trip_instance_key,shape_array_key,route_short_name_y,route_desc,name,service_date,recent_combined_name,recent_route_id2,direction_id,stop_primary_direction,stop_sequence
0,2a0571758141f412b6a546fd70a65bf3,0.0,all_day,53.2,0.98,97,4.04,0.0,0.0,1.0,0.0,1.0,0.0,2023-03-15,rail,5824,4265,8434.44,5160.0,12876,11940,0,4,93,97,1.53,93.0,69.0,51.0,100.0,83.0,1.63,86.95,schedule_and_vp,22.39,LA Metro Rail Schedule,Metro A Line (Blue),,Metro A Line,801,recofCmylEKq2zuPr,aHR0cHM6Ly9naXRsYWIuY29tL0xBQ01UQS9ndGZzX3JhaWwvcmF3L21hc3Rlci9ndGZzX3JhaWwuemlw,recPnGkwdpnr8jmHB,Los Angeles County Metropolitan Transportation Authority,07 - Los Angeles,100,2,14.85,,,,,,,NaT,,,,,
1,2a0571758141f412b6a546fd70a65bf3,0.0,all_day,115.71,0.98,94,3.92,0.0,0.0,1.0,0.0,1.0,0.0,2023-07-12,rail,4388,3123,11879.46,5149.0,9413,9370,11,1,33,45,0.79,100.0,37.0,26.0,85.0,61.0,2.31,263.99,schedule_and_vp,23.16,LA Metro Rail Schedule,Metro A-Line,,Metro A Line,801,recofCmylEKq2zuPr,aHR0cHM6Ly9naXRsYWIuY29tL0xBQ01UQS9ndGZzX3JhaWwvcmF3L21hc3Rlci9ndGZzX3JhaWwuemlw,recPnGkwdpnr8jmHB,Los Angeles County Metropolitan Transportation Authority,07 - Los Angeles,100,2,15.31,,,,,,,NaT,,,,,


In [96]:
m1.stop_primary_direction = m1.stop_primary_direction.fillna("Unknown")

In [97]:
m1.stop_primary_direction.value_counts()

Northbound    2515
Southbound    2429
Eastbound     2220
Westbound     2220
Unknown        383
Name: stop_primary_direction, dtype: int64

In [102]:
m1.loc[m1.stop_primary_direction == "Unknown"][["route_long_name", "Route ID"]].drop_duplicates()

Unnamed: 0,route_long_name,Route ID
0,Metro A Line (Blue),801
1,Metro A-Line,801
66,Metro B Line (Red),802
67,Metro B-Line,802
132,Metro C Line (Green),803
133,Metro C-Line,803
198,Metro L Line (Gold),804
199,Metro E-Line,804
264,Metro D Line (Purple),805
265,Metro D-Line,805


In [103]:
m1.groupby(["route_long_name","Route ID", "stop_primary_direction"]).agg({'Period':'count'})

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Period
route_long_name,Route ID,stop_primary_direction,Unnamed: 3_level_1
,DSE,Northbound,10
,DSE,Southbound,8
C & K Line Link (857),857,Unknown,24
Metro A Line,801,Northbound,12
Metro A Line,801,Southbound,12
Metro A Line (Blue),801,Unknown,6
Metro A-Line,801,Unknown,36
Metro B Line,802,Eastbound,12
Metro B Line,802,Westbound,12
Metro B Line (Red),802,Unknown,6
