# LACMTA
* GitHub Issue https://github.com/cal-itp/data-analyses/issues/1727 
* The way I aggregate the number of unique routes LACMTA runs is producing an undercount.
* LACMTA runs way more than 5 routes.

In [1]:
import _ct_district_grain_data_prep as _ct_district_data_prep
import geopandas as gpd
import pandas as pd
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS

In [2]:
# Notebook display settings: show up to 100 columns, format floats with two
# decimals, and never truncate rows or cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
# District label handed to `data_wrangling_operator_profile` in the next cell.
district = "07 - Los Angeles / Ventura"

In [4]:
# Load the operator profile produced by the project helper for `district`.
# NOTE(review): presumably the helper filters and aggregates operators to this
# district -- confirm against `_ct_district_grain_data_prep`.
operator_df = _ct_district_data_prep.data_wrangling_operator_profile(district)

In [5]:
# Distinct operator names present in the district-level profile.
operator_df["analysis_name"].unique()

array(['City of Bell Gardens',
       'Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)',
       'Los Angeles World Airports', 'Los Angeles County',
       'City of Lawndale',
       'Los Angeles County Metropolitan Transportation Authority',
       'City of Commerce', 'City of Torrance', 'Long Beach Transit',
       'City of Artesia', 'City of Baldwin Park',
       'Antelope Valley Transit Authority', 'City of Alhambra',
       'City of Maywood', 'City of Norwalk', 'City of Carson',
       'City of Santa Monica', 'City of Burbank', 'City of Montebello',
       'FlixBus and Greyhound', 'City of Glendale', 'City of Cerritos',
       'City of Pasadena', 'City of Huntington Park',
       'City of San Fernando', 'City of West Covina', 'City of Bell',
       'City of Glendora', 'City of Culver City', 'City of Gardena',
       'City of Inglewood', 'City of Monterey Park', 'City of Calabasas',
       'City of Lynwood', 'Palos Verdes Peninsula 

In [6]:
# Distinct service dates present in the district-level profile.
operator_df["service_date"].unique()

array(['2025-09-24T00:00:00.000000000', '2025-07-16T00:00:00.000000000',
       '2025-06-11T00:00:00.000000000', '2025-03-12T00:00:00.000000000',
       '2024-12-11T00:00:00.000000000', '2024-09-18T00:00:00.000000000',
       '2024-06-12T00:00:00.000000000', '2024-05-22T00:00:00.000000000'],
      dtype='datetime64[ns]')

In [7]:
# Keep only the LACMTA rows of the district-level operator profile.
lacmta = operator_df.loc[
    operator_df["analysis_name"].eq(
        "Los Angeles County Metropolitan Transportation Authority"
    )
]

In [8]:
# Transpose so each LACMTA row reads as a column of field/value pairs.
lacmta.transpose()

Unnamed: 0,6109,3944
schedule_gtfs_dataset_key,d4a07b520153bd5c27860835bc42610e,7ada6c55e4a29f4535e84c504a994b14
operator_n_routes,108,6
operator_n_trips,13316,1196
operator_n_shapes,607,12
operator_n_stops,11818,111
operator_n_arrivals,849676,25502
operator_route_length_miles,1927.89,130.64
operator_arrivals_per_stop,71.90,229.75
n_downtown_local_routes,95,0
n_local_routes,1,0


## Read in original file

In [9]:
# Operator-profile entry from the shared GTFS data dictionary; it is the
# file stem used to build the parquet path read in the next cell.
OPERATOR_PROFILE = GTFS_DATA_DICT.digest_tables.operator_profiles

In [None]:
# Read the original operator-profile parquet, before any district-level wrangling.
operator_og_df = pd.read_parquet(f"{RT_SCHED_GCS}{OPERATOR_PROFILE}.parquet")

### For `analysis_name` values that have more than one `name` associated with them, check whether each row holds different information

In [None]:
# Count distinct feed names and dataset keys per `analysis_name`, then show
# the 15 operators with the most distinct names.
(
    operator_og_df.groupby("analysis_name")[["name", "schedule_gtfs_dataset_key"]]
    .nunique()
    .sort_values(by="name", ascending=False)
    .head(15)
)

In [None]:
# San Francisco rows from the original operator-profile table.
sf = operator_og_df.loc[
    operator_og_df["analysis_name"].eq("City and County of San Francisco")
]

In [None]:
# Route counts per dataset key for San Francisco, oldest service date first.
sf.loc[
    :, ["service_date", "schedule_gtfs_dataset_key", "operator_n_routes"]
].sort_values(by="service_date")

In [None]:
# LACMTA rows from the original operator-profile table.
# Note: this rebinds `lacmta`, which earlier held the slice of `operator_df`.
lacmta = operator_og_df.loc[
    operator_og_df["analysis_name"].eq(
        "Los Angeles County Metropolitan Transportation Authority"
    )
]

In [None]:
# Peek at the first LACMTA row to see which columns are available.
lacmta.iloc[:1]

### LACMTA is split across three different `schedule_gtfs_dataset_key` values, which is why its routes are split.

In [None]:
# Number of distinct dataset keys recorded under LACMTA's `analysis_name`.
lacmta.groupby("analysis_name")[["schedule_gtfs_dataset_key"]].nunique()

In [None]:
# Route counts per dataset key for LACMTA; show the five latest service dates.
lacmta.loc[
    :, ["service_date", "schedule_gtfs_dataset_key", "operator_n_routes"]
].sort_values(by="service_date").tail(5)

## Edit `_ct_district_grain_data_prep.py/data_wrangling_operator_profile`

In [None]:
# Since schedule_gtfs_dataset_key changes but not name (hopefully), use name
lacmta2 = lacmta.sort_values(by=["service_date"], ascending=False).drop_duplicates(
    subset=["analysis_name", "name"]
)

In [None]:
# Transpose so each retained feed reads as a column of field/value pairs.
lacmta2.transpose()

## Edit `create_gtfs_stats`

In [None]:
# Columns whose label contains the literal text "operator_".
gtfs_service_cols = lacmta2.columns[
    lacmta2.columns.str.contains("operator_", regex=False)
].tolist()

In [None]:
# Per-feed operator metrics plus the operator name, with a fresh 0..n-1 index.
gtfs_table_df = lacmta2.loc[:, [*gtfs_service_cols, "analysis_name"]].reset_index(
    drop=True
)

In [None]:
# Show the per-feed metrics before they are aggregated.
gtfs_table_df

In [None]:
# Collapse the per-feed rows into a single row per `analysis_name` by summing
# the operator metrics across feeds.
operator_totals = gtfs_table_df.groupby("analysis_name").agg("sum").reset_index()

# `operator_arrivals_per_stop` is a ratio, so adding the per-feed values
# overstates it. Rebuild it from the summed arrivals and stops instead.
# The guard keeps the cell working if any of these columns is absent.
ratio_columns = {
    "operator_n_arrivals",
    "operator_n_stops",
    "operator_arrivals_per_stop",
}
if ratio_columns.issubset(operator_totals.columns):
    operator_totals["operator_arrivals_per_stop"] = (
        operator_totals["operator_n_arrivals"] / operator_totals["operator_n_stops"]
    )

operator_totals

In [None]:
# Display the per-feed table again.
# NOTE(review): presumably for comparison with the aggregated row above -- confirm.
gtfs_table_df