## Correct the grains
* Per [this comment](https://github.com/cal-itp/data-analyses/pull/1259#issuecomment-2423108561).

In [1]:
import geopandas as gpd
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)
from shared_utils import catalog_utils, rt_dates, rt_utils

In [2]:
import merge_operator_data

In [3]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

### In your follow-up PR, the operator grain should replicate the relevant portions of that. The operator files in gtfs_analytics_data.yml should only show unique grains.

    operator_profiles (operator grain)
    operator_metrics (operator grain, should be worked into profiles)
    operator_sched_rt_category (check the grain, it might be able to be worked into the above too)
    operator_routes_map (operator-route grain)


#### Open up `operator_sched_rt` to see what's the grain
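* Why are most of the `n_trips` values 0? (13,544 of 17,214 rows are 0; only 3,670 are > 0.)
* The grain is `schedule_gtfs_dataset_key`-`service_date`-`sched_rt_category`: each operator has 3 rows (one per `sched_rt_category`) for each `service_date`.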

In [4]:
SCHED_RT_CATEGORY = GTFS_DATA_DICT.digest_tables.operator_sched_rt

In [5]:
sched_rt_cat_df = pd.read_parquet(f"{RT_SCHED_GCS}{SCHED_RT_CATEGORY}.parquet")

In [6]:
sched_rt_cat_df.n_trips.describe()

count   17214.00
mean      148.75
std      1270.38
min         0.00
25%         0.00
50%         0.00
75%         0.00
max     34476.00
Name: n_trips, dtype: float64

In [7]:
sched_rt_cat_df.head(5)

Unnamed: 0,schedule_gtfs_dataset_key,service_date,sched_rt_category,n_trips
0,014d0998350083249a9eb310635548c2,2023-03-15,schedule_only,0
1,014d0998350083249a9eb310635548c2,2023-03-15,vp_only,0
2,014d0998350083249a9eb310635548c2,2023-03-15,schedule_and_vp,0
3,014d0998350083249a9eb310635548c2,2023-04-12,schedule_only,0
4,014d0998350083249a9eb310635548c2,2023-04-12,vp_only,0


In [8]:
len(sched_rt_cat_df.loc[sched_rt_cat_df.n_trips  > 0])

3670

In [9]:
len(sched_rt_cat_df.loc[sched_rt_cat_df.n_trips  ==  0])

13544

In [10]:
sched_rt_cat_df.shape

(17214, 4)

#### Check the function `merge_operator_data.operator_category_counts_by_date()`

In [11]:
INPUT = GTFS_DATA_DICT.digest_tables.route_schedule_vp
    
operator_category_cols = [
        "schedule_gtfs_dataset_key", "service_date",
        "sched_rt_category"
    ]
df = pd.read_parquet(
        f"{RT_SCHED_GCS}{INPUT}.parquet",
        filters = [[("time_period", "==", "all_day")]],
        columns = operator_category_cols + ["route_id", "direction_id", 
             "n_scheduled_trips", "n_vp_trips"]
    )

In [12]:
df.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,service_date,sched_rt_category,route_id,direction_id,n_scheduled_trips,n_vp_trips
0,014d0998350083249a9eb310635548c2,2023-10-11,schedule_only,10866826,1.0,8,0
1,014d0998350083249a9eb310635548c2,2023-11-15,schedule_only,10866826,1.0,8,0


In [13]:
df.n_vp_trips.describe()

count   107748.00
mean        21.14
std         26.78
min          0.00
25%          1.00
50%         13.00
75%         29.00
max        211.00
Name: n_vp_trips, dtype: float64

In [14]:
df.n_scheduled_trips.describe()

count   107748.00
mean        24.26
std         28.18
min          0.00
25%          4.00
50%         15.00
75%         31.00
max        360.00
Name: n_scheduled_trips, dtype: float64

In [15]:
df2 = (df.groupby(operator_category_cols)
           .agg({"n_scheduled_trips": "sum",
                "n_vp_trips": "sum"})
           .reset_index()
    )

In [16]:
df2.head(5)

Unnamed: 0,schedule_gtfs_dataset_key,service_date,sched_rt_category,n_scheduled_trips,n_vp_trips
0,014d0998350083249a9eb310635548c2,2023-03-15,schedule_only,0,0
1,014d0998350083249a9eb310635548c2,2023-03-15,vp_only,0,0
2,014d0998350083249a9eb310635548c2,2023-03-15,schedule_and_vp,0,0
3,014d0998350083249a9eb310635548c2,2023-04-12,schedule_only,0,0
4,014d0998350083249a9eb310635548c2,2023-04-12,vp_only,0,0


In [17]:
df2.n_vp_trips.describe()

count   17214.00
mean      132.34
std      1265.75
min         0.00
25%         0.00
50%         0.00
75%         0.00
max     34476.00
Name: n_vp_trips, dtype: float64

In [18]:
len(sched_rt_cat_df.loc[sched_rt_cat_df.n_trips  ==  0])

13544

In [19]:
len(sched_rt_cat_df.loc[sched_rt_cat_df.n_trips  >  0])

3670

### Add `operator_metrics` into the `operator_profiles`

In [20]:
analysis_date_list = rt_dates.y2024_dates + rt_dates.y2023_dates 

In [21]:
# Filter the date out by rebinding instead of calling .remove() in place: .remove()
# raises ValueError when this cell is re-run after the date is already gone.
analysis_date_list = [d for d in analysis_date_list if d != '2024-10-16']

In [22]:
operator_metrics = merge_operator_data.concatenate_operator_level_metrics(analysis_date_list)

In [23]:
# Concat operator profiles
operator_profiles = merge_operator_data.concatenate_operator_stats(analysis_date_list)

In [24]:
CROSSWALK = GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk

In [25]:
ntd_cols = [
        "schedule_gtfs_dataset_key",
        "caltrans_district",
        "counties_served",
        "service_area_sq_miles",
        "hq_city",
        "uza_name",
        "service_area_pop",
        "organization_type",
        "primary_uza",
        "reporter_type"
    ]
    
# Merge in NTD data. 
crosswalk_df = (
        time_series_utils.concatenate_datasets_across_dates(
            SCHED_GCS,
            CROSSWALK,
            analysis_date_list,
            data_type="df",
            columns=ntd_cols
        )
        .sort_values(["service_date"])
        .reset_index(drop=True)
    )

In [26]:
crosswalk_df.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,caltrans_district,counties_served,service_area_sq_miles,hq_city,uza_name,service_area_pop,organization_type,primary_uza,reporter_type,service_date
0,d59cac4ec1a8938fa56334975848fa4e,07 - Los Angeles,Los Angeles,10,El Monte,"Los Angeles--Long Beach--Anaheim, CA",201889,County or Local Government Unit or Department of Transportation,,Reduced Reporter,2023-03-15
1,304a02a98bbf17467014dd1dc3596cfd,04 - Oakland,Sonoma,390,Santa Rosa,"Santa Rosa, CA",484674,County or Local Government Unit or Department of Transportation,,Full Reporter,2023-03-15


#### Merge `operator_profiles` with `operator_metrics`

In [27]:
merge_cols = ["organization_name",
              "schedule_gtfs_dataset_key",
             "service_date",]

In [28]:
operator_metrics.head(2)

Unnamed: 0,organization_name,schedule_gtfs_dataset_key,vp_per_min_agency,spatial_accuracy_agency,service_date
0,Tulare County Regional Transit Agency,0139b1253130b33adcd4b3a4490530d2,2.67,81.1,2023-05-17
1,Tulare County Regional Transit Agency,0139b1253130b33adcd4b3a4490530d2,2.54,86.46,2023-06-14


In [29]:
operator_profiles.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,operator_n_routes,operator_n_trips,operator_n_shapes,operator_n_stops,operator_n_arrivals,operator_route_length_miles,operator_arrivals_per_stop,n_downtown_local_routes,n_local_routes,n_coverage_routes,n_rapid_routes,n_express_routes,n_rail_routes,name,organization_source_record_id,organization_name,service_date
0,014d0998350083249a9eb310635548c2,8,137,8,159,3771,79.5,23.72,2,0,6,8,0,0,SLO Schedule,reciakGBN1DP9dK9N,San Luis Obispo Regional Transit Authority,2023-10-11
1,014d0998350083249a9eb310635548c2,9,139,10,159,3803,85.88,23.92,2,0,8,10,0,0,SLO Schedule,reciakGBN1DP9dK9N,San Luis Obispo Regional Transit Authority,2023-11-15


#### What type of merge should it be?

In [30]:
pd.merge(operator_profiles, operator_metrics, on = merge_cols, how = "outer", indicator = True)[["_merge"]].value_counts()

_merge    
both          1586
left_only     1382
right_only     246
dtype: int64

In [31]:
m1 = pd.merge(operator_profiles, operator_metrics, on = merge_cols, how = "outer")

In [32]:
# Merge in the crosswalk (NTD) columns. Use a dedicated name for the join keys so the
# three-column `merge_cols` defined earlier is not overwritten, which would change the
# result of re-running the operator_profiles/operator_metrics merge cells.
crosswalk_merge_cols = ["schedule_gtfs_dataset_key", "service_date"]
op_profiles_df1 = pd.merge(
        m1, 
        crosswalk_df, 
        on = crosswalk_merge_cols, 
        how = "left"
    )

In [33]:
op_profiles_df1.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,operator_n_routes,operator_n_trips,operator_n_shapes,operator_n_stops,operator_n_arrivals,operator_route_length_miles,operator_arrivals_per_stop,n_downtown_local_routes,n_local_routes,n_coverage_routes,n_rapid_routes,n_express_routes,n_rail_routes,name,organization_source_record_id,organization_name,service_date,vp_per_min_agency,spatial_accuracy_agency,caltrans_district,counties_served,service_area_sq_miles,hq_city,uza_name,service_area_pop,organization_type,primary_uza,reporter_type
0,014d0998350083249a9eb310635548c2,8.0,137.0,8.0,159.0,3771.0,79.5,23.72,2.0,0.0,6.0,8.0,0.0,0.0,SLO Schedule,reciakGBN1DP9dK9N,San Luis Obispo Regional Transit Authority,2023-10-11,,,05 - San Luis Obispo,San Luis Obispo,130,San Luis Obispo,"San Luis Obispo, CA",206008,Public Agency or Authority of Transit Service,,Full Reporter
1,014d0998350083249a9eb310635548c2,9.0,139.0,10.0,159.0,3803.0,85.88,23.92,2.0,0.0,8.0,10.0,0.0,0.0,SLO Schedule,reciakGBN1DP9dK9N,San Luis Obispo Regional Transit Authority,2023-11-15,,,05 - San Luis Obispo,San Luis Obispo,130,San Luis Obispo,"San Luis Obispo, CA",206008,Public Agency or Authority of Transit Service,,Full Reporter


In [34]:
from shared_utils import gtfs_utils_v2, publish_utils

In [35]:
public_feeds = gtfs_utils_v2.filter_to_public_schedule_gtfs_dataset_keys()

In [36]:
type(public_feeds)

list

In [37]:
# Build the frame that gets written out as the operator profiles file:
#   1. pass the merged frame through publish_utils.exclude_private_datasets, keyed on
#      schedule_gtfs_dataset_key with the public_feeds list
#      (presumably keeps only rows whose key is in public_feeds — confirm in publish_utils),
#   2. drop rows that are duplicated across every column of op_profiles_df1,
#   3. renumber the index from 0.
op_profiles_df2 = (
        op_profiles_df1
        .pipe(
            publish_utils.exclude_private_datasets, 
            col = "schedule_gtfs_dataset_key", 
            public_gtfs_dataset_keys = public_feeds
        ).drop_duplicates(subset = list(op_profiles_df1.columns))
    .reset_index(drop = True))

In [38]:
OPERATOR_PROFILE = GTFS_DATA_DICT.digest_tables.operator_profiles

In [39]:
OPERATOR_PROFILE

'digest/operator_profiles'

In [40]:
RT_SCHED_GCS

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'

In [41]:
f"{RT_SCHED_GCS}{OPERATOR_PROFILE}_AH_TESTING_.parquet"

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/digest/operator_profiles_AH_TESTING_.parquet'

In [42]:
op_profiles_df2.to_parquet(
        f"{RT_SCHED_GCS}{OPERATOR_PROFILE}_AH_TESTING_.parquet"
    )

In [43]:
op_profiles_df2.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,operator_n_routes,operator_n_trips,operator_n_shapes,operator_n_stops,operator_n_arrivals,operator_route_length_miles,operator_arrivals_per_stop,n_downtown_local_routes,n_local_routes,n_coverage_routes,n_rapid_routes,n_express_routes,n_rail_routes,name,organization_source_record_id,organization_name,service_date,vp_per_min_agency,spatial_accuracy_agency,caltrans_district,counties_served,service_area_sq_miles,hq_city,uza_name,service_area_pop,organization_type,primary_uza,reporter_type
0,014d0998350083249a9eb310635548c2,8.0,137.0,8.0,159.0,3771.0,79.5,23.72,2.0,0.0,6.0,8.0,0.0,0.0,SLO Schedule,reciakGBN1DP9dK9N,San Luis Obispo Regional Transit Authority,2023-10-11,,,05 - San Luis Obispo,San Luis Obispo,130,San Luis Obispo,"San Luis Obispo, CA",206008,Public Agency or Authority of Transit Service,,Full Reporter
1,014d0998350083249a9eb310635548c2,9.0,139.0,10.0,159.0,3803.0,85.88,23.92,2.0,0.0,8.0,10.0,0.0,0.0,SLO Schedule,reciakGBN1DP9dK9N,San Luis Obispo Regional Transit Authority,2023-11-15,,,05 - San Luis Obispo,San Luis Obispo,130,San Luis Obispo,"San Luis Obispo, CA",206008,Public Agency or Authority of Transit Service,,Full Reporter


### Test out functions for creating the portfolio
* aka _section1 and _section2

In [44]:
organization_name = "Marin County Transit District"

In [45]:
# Read back the testing file written earlier in this notebook. The path is built from
# the same constants used for the write so the two locations cannot drift apart.
op_profiles_url = f"{RT_SCHED_GCS}{OPERATOR_PROFILE}_AH_TESTING_.parquet"
# The filter keeps only the rows for the selected organization.
op_profiles_df = pd.read_parquet(
    op_profiles_url,
    filters=[[("organization_name", "==", organization_name)]])

In [46]:
 # Keep only the most recent row
op_profiles_df1 = op_profiles_df.sort_values(by = ['service_date'], ascending = False).head(1)
    

In [47]:
import yaml
with open("readable.yml") as f:
    readable_dict = yaml.safe_load(f)

In [48]:
op_profiles_df1

Unnamed: 0,schedule_gtfs_dataset_key,operator_n_routes,operator_n_trips,operator_n_shapes,operator_n_stops,operator_n_arrivals,operator_route_length_miles,operator_arrivals_per_stop,n_downtown_local_routes,n_local_routes,n_coverage_routes,n_rapid_routes,n_express_routes,n_rail_routes,name,organization_source_record_id,organization_name,service_date,vp_per_min_agency,spatial_accuracy_agency,caltrans_district,counties_served,service_area_sq_miles,hq_city,uza_name,service_area_pop,organization_type,primary_uza,reporter_type
17,015d67d5b75b5cf2b710bbadadfb75f5,19.0,642.0,62.0,550.0,16330.0,250.09,29.69,24.0,5.0,10.0,16.0,0.0,0.0,Bay Area 511 Marin Schedule,recNOb7pqBRlQVG5e,Marin County Transit District,2024-09-18,2.65,65.17,04 - Oakland,Marin,520,San Rafael,"San Francisco--Oakland, CA",262321,Public Agency or Authority of Transit Service,,Full Reporter


In [49]:
type(op_profiles_df1)

pandas.core.frame.DataFrame

In [51]:
def replace_column_names(df: pd.DataFrame, readable_map: "dict | None" = None) -> pd.DataFrame:
    """
    Replace column names with more readable names.

    Each column name is looked up in a mapping. An entry may be either a
    plain string (the readable name) or a dict carrying the readable name
    under the key 'readable'. Columns with no usable entry keep their
    original name.

    Args:
    df (pd.DataFrame): Input DataFrame. It is not modified.
    readable_map (dict, optional): Mapping of column name -> readable name
        (or -> dict with a 'readable' key). Defaults to the notebook-level
        `readable_dict` loaded from readable.yml.

    Returns:
    pd.DataFrame: New DataFrame with replaced column names.

    Raises:
    ValueError: If `df` is not a pandas DataFrame.
    """
    # Input validation
    if not isinstance(df, pd.DataFrame):
        raise ValueError("Input 'df' must be a pandas DataFrame")

    # Fall back to the mapping loaded from readable.yml when none is passed in.
    mapping = readable_dict if readable_map is None else readable_map

    def replace_single_column(column_name: str) -> str:
        value = mapping.get(column_name)
        if isinstance(value, dict):
            value = value.get("readable")
        # Only a non-empty string is a usable label. This guards against
        # entries such as `readable: null` (which previously renamed the
        # column to None) and against non-string values.
        if isinstance(value, str) and value:
            return value
        return column_name

    # rename() returns a new frame; the caller's DataFrame is left untouched.
    return df.rename(columns={c: replace_single_column(c) for c in df.columns})

In [52]:
replace_column_names(op_profiles_df1)

Unnamed: 0,schedule_gtfs_dataset_key,# Routes,# Trips,# Shapes,# Stops,# Arrivals,Operator Service Miles,Avg Arrivals per Stop,# Downtown Local Route Types,# Local Route Types,# Coverage Route Types,# Rapid Route Types,# Express Route Types,# Rail Route Types,Transit Operator,Organization ID,Organization,Date,VP per Minute (All Routes),Spatial Accuracy (All Routes),District,counties_served,service_area_sq_miles,hq_city,uza_name,service_area_pop,organization_type,primary_uza,reporter_type
17,015d67d5b75b5cf2b710bbadadfb75f5,19.0,642.0,62.0,550.0,16330.0,250.09,29.69,24.0,5.0,10.0,16.0,0.0,0.0,Bay Area 511 Marin Schedule,recNOb7pqBRlQVG5e,Marin County Transit District,2024-09-18,2.65,65.17,04 - Oakland,Marin,520,San Rafael,"San Francisco--Oakland, CA",262321,Public Agency or Authority of Transit Service,,Full Reporter
