# LACMTA
* GitHub Issue https://github.com/cal-itp/data-analyses/issues/1727 
* The way I aggregate the number of unique routes LACMTA runs appears to be wrong.
* LACMTA runs far more than 5 routes.

In [1]:
import _ct_district_grain_data_prep as _ct_district_data_prep
import geopandas as gpd
import pandas as pd
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS

In [2]:
# Notebook display settings: show up to 100 columns, never truncate rows or
# cell contents, and render floats with two decimal places.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
for unlimited_option in ("display.max_rows", "display.max_colwidth"):
    pd.set_option(unlimited_option, None)
del unlimited_option

In [3]:
# District label passed to data_wrangling_operator_profile when loading the data.
district = "07 - Los Angeles / Ventura"

In [4]:
from shared_utils import catalog_utils, rt_dates

In [5]:
# Show the dates rt_dates.get_week returns for "oct2025" with exclude_wed=True.
rt_dates.get_week("oct2025", exclude_wed=True)

['2025-10-14', '2025-10-16']

In [None]:
# Load the operator profile for the selected district through the
# district-grain data-prep helper.
operator_df = _ct_district_data_prep.data_wrangling_operator_profile(district)

In [None]:
# List the distinct analysis_name values present in the district data.
operator_df["analysis_name"].unique()

In [None]:
# List the distinct service_date values present in the district data.
operator_df["service_date"].unique()

In [None]:
# Keep only the LACMTA rows of the district operator profile.
lacmta = operator_df.loc[
    operator_df["analysis_name"].eq(
        "Los Angeles County Metropolitan Transportation Authority"
    )
]

In [None]:
# Transpose so each column of the LACMTA rows is shown as a row.
lacmta.transpose()

## Read in original file

In [None]:
# Look up the operator_profiles entry under digest_tables in GTFS_DATA_DICT;
# it is used to build the parquet path read in the next cell.
OPERATOR_PROFILE = GTFS_DATA_DICT.digest_tables.operator_profiles

In [None]:
# Read the operator profile parquet directly, bypassing the district data-prep
# helper; the path is RT_SCHED_GCS followed by OPERATOR_PROFILE and ".parquet".
operator_og_df = pd.read_parquet(
    f"{RT_SCHED_GCS}{OPERATOR_PROFILE}.parquet",
)

### For `analysis_name` values associated with more than one `name`, check whether each row holds different information

In [None]:
# For each analysis_name, count the distinct names and feed keys, then show the
# 15 analysis_name values with the most distinct names.
(
    operator_og_df.groupby("analysis_name")[["name", "schedule_gtfs_dataset_key"]]
    .nunique()
    .sort_values("name", ascending=False)
    .head(15)
)

In [None]:
# Keep only the rows for the City and County of San Francisco.
sf = operator_og_df.loc[
    operator_og_df["analysis_name"].eq("City and County of San Francisco")
]

In [None]:
# Route counts per feed key for San Francisco, ordered by service_date.
sf.loc[
    :, ["service_date", "schedule_gtfs_dataset_key", "operator_n_routes"]
].sort_values("service_date")

In [None]:
# Rebind lacmta to the LACMTA rows of the original (unprocessed) operator profile.
lacmta = operator_og_df.loc[
    operator_og_df["analysis_name"].eq(
        "Los Angeles County Metropolitan Transportation Authority"
    )
]

In [None]:
# Peek at the first LACMTA row.
lacmta.iloc[:1]

### LACMTA is split across three different `schedule_gtfs_dataset_key` values, which is why its routes are split.

In [None]:
# Count how many distinct feed keys LACMTA appears under.
lacmta.groupby("analysis_name")[["schedule_gtfs_dataset_key"]].nunique()

In [None]:
# Last five rows of LACMTA route counts per feed key, ordered by service_date.
lacmta.loc[
    :, ["service_date", "schedule_gtfs_dataset_key", "operator_n_routes"]
].sort_values("service_date").tail(5)

## Edit `_ct_district_grain_data.py/data_wrangling_operator_profile`

In [None]:
# schedule_gtfs_dataset_key changes over time while name (hopefully) does not,
# so deduplicate on name: order rows newest-first and keep the first row seen
# for each (analysis_name, name) pair.
newest_first = lacmta.sort_values("service_date", ascending=False)
lacmta2 = newest_first.drop_duplicates(subset=["analysis_name", "name"], keep="first")
del newest_first

In [None]:
# Transpose so each column of the deduplicated rows is shown as a row.
lacmta2.transpose()

## Edit `create_gtfs_stats`

In [None]:
# Collect every column whose name contains "operator_".
gtfs_service_cols = list(filter(lambda col: "operator_" in col, lacmta2.columns))

In [None]:
# Keep the operator_ columns plus analysis_name, with a fresh 0..n-1 index.
gtfs_table_df = lacmta2.loc[:, [*gtfs_service_cols, "analysis_name"]].reset_index(
    drop=True
)

In [None]:
# Display the per-row values before aggregating by analysis_name.
gtfs_table_df

In [None]:
# Sum every non-key column within each analysis_name and return analysis_name
# as a regular column. The result is only displayed, not assigned.
gtfs_table_df.groupby("analysis_name").sum().reset_index()

In [None]:
# Display gtfs_table_df again; the groupby result above was not assigned, so
# this frame still holds the per-row values.
gtfs_table_df