## Charts Exploratory

In [1]:
import altair as alt
import calitp_data_analysis.magics
import geopandas as gpd
import google.auth
import pandas as pd
from IPython.display import HTML, Image, Markdown, display, display_html
from omegaconf import OmegaConf
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS
from shared_utils import portfolio_utils, rt_dates, publish_utils
import merge_data

# Load the "readable2.yml" config file through OmegaConf.
readable_dict = OmegaConf.load("readable2.yml")
# Resolve the default Google credentials and project for this environment.
credentials, project = google.auth.default()

import _report_operator_visuals
import _report_route_dir_visuals
import _scheduled_service_hour_visuals

In [2]:
# Notebook-wide pandas display settings: show up to 100 columns, render floats
# with two decimals, and remove the limits on row count and column width.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None


def formatted(number):
    """Return `number` rendered as text with comma thousands separators."""
    return format(number, ",")

## Why are there so few portfolio names?

In [3]:
# Config entry for the operator profiles report; used to build the parquet path below.
OPERATOR_PROFILE_REPORT = GTFS_DATA_DICT.digest_tables.operator_profiles_report

In [4]:
# Read the operator profile report parquet located under RT_SCHED_GCS.
operator_df = pd.read_parquet(f"{RT_SCHED_GCS}{OPERATOR_PROFILE_REPORT}.parquet")

In [5]:
operator_df.columns

Index(['portfolio_organization_name', 'service_date', 'caltrans_district',
       'operator_n_routes', 'operator_n_trips', 'operator_n_shapes',
       'operator_n_stops', 'operator_n_arrivals',
       'operator_route_length_miles', 'n_downtown_local_routes',
       'n_local_routes', 'n_coverage_routes', 'n_rapid_routes',
       'n_express_routes', 'n_rail_routes', 'n_ferry_routes',
       'vp_per_min_agency', 'spatial_accuracy_agency', 'n_feeds',
       'operator_feeds', 'counties_served', 'service_area_pop',
       'service_area_sq_miles', 'hq_city', 'reporter_type',
       'primary_uza_name'],
      dtype='object')

In [6]:
# Count the distinct portfolio organization names within each Caltrans district.
(
    operator_df
    .groupby(["caltrans_district"])
    .agg({"portfolio_organization_name": "nunique"})
)

Unnamed: 0_level_0,portfolio_organization_name
caltrans_district,Unnamed: 1_level_1
01 - Eureka,2
02 - Redding,1
03 - Marysville / Sacramento,8
04 - Bay Area / Oakland,22
05 - San Luis Obispo / Santa Barbara,7
06 - Fresno / Bakersfield,7
07 - Los Angeles / Ventura,41
08 - San Bernardino / Riverside,6
10 - Stockton,8
11 - San Diego,3


In [None]:
# Comment out and leave this cell right below pandas
# NOTE(review): presumably this cell is the parameter placeholder for a
# parameterized run of the notebook — confirm before moving or deleting it.
# portfolio_name = "Curry Public Transit"
portfolio_name =  "Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)"

In [None]:
# Config entry for the monthly route schedule/vp report; used to build the parquet path below.
ROUTE_DIR_MONTH_FILE = GTFS_DATA_DICT.digest_tables.monthly_route_schedule_vp_report

In [None]:
# Read the monthly route-direction report, filtering at read time so only rows
# whose "Portfolio Organization Name" equals `portfolio_name` are loaded.
route_dir_month_df = pd.read_parquet(
    f"{RT_SCHED_GCS}{ROUTE_DIR_MONTH_FILE}.parquet",
    filters=[[("Portfolio Organization Name", "==", portfolio_name)]],
)

In [None]:
# Read the `monthly_route_schedule_vp` table for the selected portfolio; this
# table is filtered on the snake_case `portfolio_organization_name` column.
OG_ROUTE_DIR_MONTH_FILE = GTFS_DATA_DICT.digest_tables.monthly_route_schedule_vp
og_month_route_dir_df = pd.read_parquet(
    f"{RT_SCHED_GCS}{OG_ROUTE_DIR_MONTH_FILE}.parquet",
    filters=[[("portfolio_organization_name", "==", portfolio_name)]],
)

In [None]:
# NOTE(review): unlike the other tables in this notebook, this path is
# hard-coded instead of being read from GTFS_DATA_DICT — confirm whether a
# config entry exists for the quarterly report.
ROUTE_DIR_QTR_FILE = 'digest/quarterly_schedule_vp_metrics_report'
# Read the quarterly report, filtered at read time to the selected portfolio.
route_dir_qtr_df = pd.read_parquet(
    f"{RT_SCHED_GCS}{ROUTE_DIR_QTR_FILE}.parquet",
    filters=[[("Portfolio Organization Name", "==", portfolio_name)]],
)


### Some route names are repeated (e.g. "Route 1 Route 1", "Route 16 Route 16") — find out why
* When we combine `route_short_name` and `route_long_name`, the two columns sometimes hold the same value, so the combined name repeats it.

In [None]:
import yaml

# Load the portfolio organization name YAML from shared_utils.
# The encoding is pinned to UTF-8 so that parsing does not depend on the
# platform's default locale encoding.
with open(
    "../_shared_utils/shared_utils/portfolio_organization_name.yml",
    "r",
    encoding="utf-8",
) as f:
    PORTFOLIO_ORGANIZATIONS_DICT = yaml.safe_load(f)

In [None]:
CLEANED_ROUTE_NAMING = GTFS_DATA_DICT.schedule_tables.route_identification

# Read the route identification table and pass it, together with the portfolio
# organizations dict, through the shared name-standardization helper.
clean_route_names = portfolio_utils.standardize_portfolio_organization_names(
    pd.read_parquet(f"{SCHED_GCS}{CLEANED_ROUTE_NAMING}.parquet"),
    PORTFOLIO_ORGANIZATIONS_DICT,
)

In [None]:
clean_route_names.columns

In [None]:
# Keep only the rows that belong to the selected portfolio.
clean_route_names = clean_route_names.loc[
    clean_route_names["portfolio_organization_name"] == portfolio_name
]

In [None]:
# Show each distinct combination of the combined, long and short route names.
(
    clean_route_names[
        ["recent_combined_name", "route_long_name", "route_short_name"]
    ]
    .drop_duplicates()
)

### Lots of duplicate rows are popping up — fix that

In [None]:
# Isolate one route on one date so its duplicate rows can be inspected.
one_test_route = route_dir_month_df.loc[
    (route_dir_month_df["Date"] == '2025-05-14T00:00:00.000000000')
    & (route_dir_month_df["Route"] == '89 80-89 Coastal Express')
]

In [None]:
len(one_test_route)

In [None]:
len(one_test_route.drop_duplicates())

In [None]:
one_test_route.drop_duplicates()

## Route filter preview

In [None]:
# display(_report_route_dir_visuals.route_filter(route_dir_qtr_df,route_dir_month_df))

## Finding where the duplicates come from

In [None]:
# Dates (rt_dates.y2025_dates) passed to the merge_data concatenation helpers below.
date_list = rt_dates.y2025_dates

In [None]:
# Build the schedule-by-route-direction frame from merge_data, passing date_list.
schedule_route_dir_df = merge_data.concatenate_schedule_by_route_direction(date_list)

In [None]:
# Inspect which columns the schedule concatenation returned.
schedule_route_dir_df.columns

In [None]:
def preview_one_route_one_date(
    df: pd.DataFrame,
    service_date: str = "2025-05-14T00:00:00.000000000",
    route_name: str = "87__80-89 Coastal Express",
    route_id: str = "4148",
) -> pd.DataFrame:
    """Return the rows of `df` for a single route on a single date.

    Two column layouts are supported:

    * frames with `Date` / `Route` columns are filtered on `service_date`
      and `route_name`;
    * frames lacking either of those columns fall back to the
      `service_date` / `route_id` columns and are filtered on
      `service_date` and `route_id`.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to filter.
    service_date : str
        Date value compared against the `Date` or `service_date` column.
    route_name : str
        Value compared against the `Route` column.
    route_id : str
        Value compared against the `route_id` column in the fallback layout.

    Returns
    -------
    pd.DataFrame
        The matching rows of `df`.

    Raises
    ------
    AttributeError
        If `df` has neither column layout.
    """
    try:
        matches = df.loc[(df.Date == service_date) & (df.Route == route_name)]
    except AttributeError:
        # Only a missing `Date`/`Route` column triggers the fallback; any
        # other error (including KeyboardInterrupt) now propagates instead of
        # being silently swallowed by a bare `except`.
        matches = df.loc[(df.service_date == service_date) & (df.route_id == route_id)]
    return matches

In [None]:
preview_one_route_one_date(schedule_route_dir_df)

### Lots of repeated values in speed

In [None]:
# NOTE(review): this is the same merge_data loader (with the same argument)
# that is used for `df_rt_sched` further down — confirm whether a
# speeds-specific loader was intended here, or reuse one result to avoid
# loading the same data twice.
speeds_df = merge_data.concatenate_rt_vs_schedule_by_route_direction(date_list)

In [None]:
speeds_df.columns

In [None]:
preview_one_route_one_date(speeds_df).shape

In [None]:
len(preview_one_route_one_date(speeds_df).drop_duplicates())

In [None]:
# Load the crosswalk so the later cell that reads `crosswalk_df.columns` does
# not raise NameError on a fresh kernel (this assignment was commented out).
crosswalk_df = merge_data.concatenate_crosswalk_organization(date_list)

In [None]:
crosswalk_df.columns

In [None]:
# crosswalk has no route info
# preview_one_route_one_date(crosswalk_df)

In [None]:
# Applied to df
# standardized_name = merge_data.merge_in_standardized_route_names()
# but can grab the df w/in the func

### More duplicates for rt vs sched

In [None]:
# Build the RT-vs-schedule route-direction frame from merge_data, passing date_list.
df_rt_sched = merge_data.concatenate_rt_vs_schedule_by_route_direction(date_list)

In [None]:
# Display the single-route, single-date slice of the RT-vs-schedule frame.
preview_one_route_one_date(df_rt_sched)

In [None]:
# Number of rows left in the single-route slice once exact duplicates are dropped.
len(preview_one_route_one_date(df_rt_sched).drop_duplicates())