## Incorporating `NTD` stuff directly into `operator_profiles`
* How do I rerun everything and make sure the files are updated with all the dates?

In [1]:
from datetime import datetime

import _operators_prep as op_prep
import _report_utils
import _section1_utils as section1
import geopandas as gpd
import numpy as np
import pandas as pd
from IPython.display import HTML, Image, Markdown, display, display_html
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)
from shared_utils import catalog_utils, rt_dates, rt_utils

In [2]:
# Warehouse
import os
from calitp_data_analysis.sql import query_sql
from calitp_data_analysis.tables import tbls
from siuba import *

In [3]:
# Notebook display settings: show up to 100 columns, print floats with two
# decimals, and lift the limits on displayed rows and column width.
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

### Test Script

In [4]:
import sys

sys.path.append("../gtfs_funnel")
import ntd_annual_database_agency

In [5]:
ntd_test = ntd_annual_database_agency.merge_ntd_mobility(2022)

### This is the dataframe I use for my GTFS Digest portfolio work.

In [None]:
# Path to the operator_profiles parquet: the digest_tables directory followed
# by the operator_profiles file stem and a ".parquet" suffix.
op_profiles_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_profiles}.parquet"

In [None]:
GTFS_DATA_DICT.digest_tables.dir

In [None]:
GTFS_DATA_DICT.digest_tables.operator_profiles

In [None]:
op_profiles = pd.read_parquet(op_profiles_url)

#### Figuring out where `operator_profiles` is created
* [Where](https://github.com/cal-itp/data-analyses/blob/ce779ede5c48f84e974615a457be77cba2f8b08b/gtfs_digest/merge_operator_data.py#L109) `operator_profiles` is stacked for all the available dates.
* `GTFS_DATA_DICT.schedule_tables.operator_scheduled_stats` is the actual file being stacked that is being created here in [GTFS Funnel](https://github.com/cal-itp/data-analyses/blob/ce779ede5c48f84e974615a457be77cba2f8b08b/gtfs_funnel/operator_scheduled_stats.py#L147)

In [None]:
SCHED_GCS

In [None]:
GTFS_DATA_DICT.schedule_tables.operator_scheduled_stats

In [None]:
# Read a single dated operator_scheduled_stats file (2024-05-26) from a
# hard-coded GCS path so its contents can be inspected.
may_24 = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/gtfs_schedule/operator_profiles/operator_scheduled_stats_2024-05-26.parquet"
)

In [None]:
may_24.head(1)

#### `gtfs_funnel/operator_scheduled_stats`
* Add `ntd` stuff somewhere around 164-174
* How often is the NTD data updated in our warehouse?
* So far I am querying 2022 data.
* Where do I add this NTD stuff?

#### Checkout NTD

In [None]:
def load_ntd(year: int) -> pd.DataFrame:
    """
    Pull one year of California NTD agency information from the warehouse
    and collapse it to a single row per `agency_name`.

    Args:
        year: value matched against the table's `year` column.

    Returns:
        A DataFrame with `agency_name` as the first column, followed by the
        selected NTD attributes, with one row per agency.

    Note:
        `GroupBy.first()` takes the first non-null value of each column
        within a group. When an agency has more than one entry, the
        resulting row can therefore combine values from several of them
        rather than being one of the original rows verbatim.
    """
    agency_info = tbls.mart_ntd.dim_annual_ntd_agency_information()

    # Current California records for the requested year. `_` is siuba's
    # symbolic expression, so the comparison has to stay as `== True`.
    current_ca = agency_info >> filter(
        _.year == year, _.state == "CA", _._is_current == True
    )

    ntd_attributes = current_ca >> select(
        _.number_of_state_counties,
        _.uza_name,
        _.density,
        _.number_of_counties_with_service,
        _.state_admin_funds_expended,
        _.service_area_sq_miles,
        _.population,
        _.service_area_pop,
        _.subrecipient_type,
        _.primary_uza,
        _.reporter_type,
        _.organization_type,
        _.agency_name,
        _.voms_pt,
        _.voms_do,
    )

    raw = ntd_attributes >> collect()

    # Some operators have more than one entry. Sort on every column, with
    # missing values placed after populated ones within each sort key,
    # before collapsing to one row per agency.
    ordered = raw.sort_values(by=raw.columns.tolist(), na_position="last")

    # First non-null value of every column for each agency.
    one_per_agency = ordered.groupby("agency_name").first()

    return one_per_agency.reset_index()

In [None]:
# Inline version of the NTD query for 2022 so the intermediate results can be
# inspected cell by cell: current California rows, selected columns only.
ntd = (
    tbls.mart_ntd.dim_annual_ntd_agency_information()
    >> filter(_.year == 2022, _.state == "CA", _._is_current == True)
    >> select(
        _.number_of_state_counties,
        _.uza_name,
        _.density,
        _.number_of_counties_with_service,
        _.state_admin_funds_expended,
        _.service_area_sq_miles,
        _.population,
        _.service_area_pop,
        _.subrecipient_type,
        _.primary_uza,
        _.reporter_type,
        _.organization_type,
        _.agency_name,
        _.voms_pt,
        _.voms_do,
    )
    >> collect()
)

In [None]:
# Sort on every column in column order; within each sort key, NaNs are placed
# after populated values.
ntd2 = ntd.sort_values(by=list(ntd.columns), na_position="last")

In [None]:
ntd.shape

In [None]:
ntd.loc[ntd.agency_name == "Kern Regional Transit"]

In [None]:
ntd2.loc[ntd2.agency_name == "Kern Regional Transit"]

In [None]:
ntd.loc[ntd.agency_name == "San Luis Obispo Regional Transit Authority"]

In [None]:
ntd2.loc[ntd2.agency_name == "San Luis Obispo Regional Transit Authority"]

In [None]:
ntd.agency_name.value_counts().head(10)

In [None]:
# Collapse to one row per agency_name. GroupBy.first() takes the first
# non-null value of each column within the group, so a result row may combine
# values from more than one of the agency's entries.
ntd3 = ntd2.groupby("agency_name").first().reset_index()

In [None]:
ntd3.loc[ntd3.agency_name == "San Luis Obispo Regional Transit Authority"]

In [None]:
ntd3.head(1).T

In [None]:
# Current California rows for 2022 with every column kept (no select), to see
# which fields the table offers.
ntd4 = (
    tbls.mart_ntd.dim_annual_ntd_agency_information()
    >> filter(_.year == 2022, _.state == "CA", _._is_current == True)
    >> collect()
)

In [None]:
ntd4.head(1).T

#### Checkout `mobility` 
* Needed because there are additional columns here that aren't in NTD.

In [None]:
# Pull the mobility providers table as-is (all columns, no filter).
mob_og = (
    tbls.mart_transit_database.dim_mobility_mart_providers()
    >> collect()
)

In [None]:
mob_og.head(1).T

In [None]:
# Mobility providers table restricted to the columns of interest; no row
# filter is applied.
mob = (
    tbls.mart_transit_database.dim_mobility_mart_providers()
    >> select(
        _.agency_name,
        _.counties_served,
        _.hq_city,
        _.hq_county,
        _.is_public_entity,
        _.is_publicly_operating,
        _.funding_sources,
        _.on_demand_vehicles_at_max_service,
        _.vehicles_at_max_service,
    )
    >> collect()
)

In [None]:
mob.head(1).T

In [None]:
mob.agency_name.value_counts().head(10)

In [None]:
mob.loc[mob.agency_name == "Kern Regional Transit"]

In [None]:
# Sort descending on both vehicle-count columns so the largest values come
# first (NaNs sort last by default) ahead of the per-agency groupby.
mob2 = mob.sort_values(
    by=["on_demand_vehicles_at_max_service", "vehicles_at_max_service"],
    ascending=[False, False],
)

In [None]:
mob2.loc[mob2.agency_name == "Kern Regional Transit"]

In [None]:
# One row per agency_name. GroupBy.first() takes the first non-null value of
# each column within the group, not necessarily a single original row.
mob3 = mob2.groupby('agency_name').first().reset_index()

In [None]:
mob3.loc[mob3.agency_name == "Kern Regional Transit"]

### I made a minor change to loading `mobility` warehouse data (just deleting an unnecessary line). Otherwise, the functions are good to go.

In [None]:
def merge_ntd_mobility(year: int) -> pd.DataFrame:
    """
    Join the mobility provider data to NTD agency information for `year`
    and return one row per agency.

    Only agencies whose `agency_name` appears in both inputs are kept
    (inner join). After the merge, a few agency names are rewritten through
    a crosswalk, surrounding whitespace is stripped, and rows with a
    repeated `agency_name` are dropped, keeping the first occurrence.

    Args:
        year: passed through to `section1.load_ntd`.

    Returns:
        The merged DataFrame with a fresh 0..n-1 index.
    """
    # {name as it appears after the merge: replacement name}
    # NOTE(review): presumably the replacements match the spelling used by a
    # downstream dataset; confirm against the caller.
    name_crosswalk = {
        "City of Fairfield, California": "City of Fairfield",
        "Livermore / Amador Valley Transit Authority": "Livermore-Amador Valley Transit Authority",
        "Nevada County Transit Services": "Nevada County",
        "Omnitrans": "OmniTrans",
    }

    ntd_df = section1.load_ntd(year)
    mobility_df = section1.load_mobility()

    merged = pd.merge(mobility_df, ntd_df, how="inner", on="agency_name")

    # Rename first, then strip: the crosswalk keys are matched against the
    # names exactly as they come out of the merge.
    merged["agency_name"] = (
        merged["agency_name"].replace(name_crosswalk).str.strip()
    )

    deduped = merged.drop_duplicates(subset=["agency_name"])
    return deduped.reset_index(drop=True)

In [None]:
m1 = merge_ntd_mobility(2022)

In [None]:
m1.shape

In [None]:
m1.agency_name.nunique()

In [None]:
m1.agency_name.value_counts().head()

In [None]:
m1.loc[m1.agency_name == "Redding Area Bus Authority"]

### Checkout Route Typology
* All this work lives in another script `gtfs_funnel/route_typologies.py`

In [None]:
ROUTE_TYPOLOGY = GTFS_DATA_DICT.schedule_tables.route_typologies

In [None]:
SCHED_GCS

In [None]:
ROUTE_TYPOLOGY

In [None]:
apr_24_url = "gs://calitp-analytics-data/data-analyses/gtfs_schedule/nacto_typologies/route_typologies_2024-04-19.parquet"

In [None]:
apr_24_df = pd.read_parquet(apr_24_url)

In [None]:
apr_24_df.head(2)