# Download data from the warehouse -> save it to our private GCS bucket as a parquet file -> clean it -> publish it to the public GCS bucket

In [None]:
from datetime import datetime
from pathlib import Path
from typing import Literal

import _sql_query
import geopandas as gpd
from calitp_data_analysis.gcs_geopandas import GCSGeoPandas

import pandas as pd
from functools import cache

import publish_public_data
from calitp_data_analysis import geography_utils
from shared_utils import gtfs_utils_v2, portfolio_utils, publish_utils, geo_utils
from update_vars import GTFS_DATA_DICT

In [None]:
# Notebook display settings: show up to 100 columns, two-decimal floats,
# and do not truncate rows or cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [None]:
# Public bucket location; passed to publish_utils.write_to_public_gcs
# as the destination when the digest files are published below.
PUBLIC_GCS = GTFS_DATA_DICT.gcs_paths.PUBLIC_GCS

In [None]:
@cache
def gcs_geopandas():
    """Return a ``GCSGeoPandas`` client, created on the first call only.

    ``functools.cache`` memoizes this zero-argument function, so every
    subsequent call hands back the same instance.
    """
    client = GCSGeoPandas()
    return client

In [None]:
# Year_month stamp (e.g. "2025_01") appended to every file name written below.
date_str = f"{datetime.today():%Y_%m}"

In [None]:
# Display the file-name suffix so it can be checked before anything is written.
date_str

## 1) schedule_rt_route_direction_summary

In [None]:
def download_schedule_rt_route_direction_summary(download_date: str) -> pd.DataFrame:
    """Fetch the monthly schedule/RT route-direction summary and archive it.

    The table is pulled with ``_sql_query.download_with_pandas_gbq`` and an
    unmodified copy is written to the ``raw/`` folder of the digest bucket.

    Args:
        download_date: Suffix appended to the raw parquet file name.

    Returns:
        The downloaded table, exactly as it was written to GCS.
    """
    summary = _sql_query.download_with_pandas_gbq(
        filename="tiffany_mart_gtfs_rollup.fct_monthly_schedule_rt_route_direction_summary",
        project="cal-itp-data-infra-staging",
    )

    # Archive the untouched download in our private GCS bucket.
    raw_path = (
        f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}"
        f"raw/schedule_rt_route_direction_summary_{download_date}.parquet"
    )
    summary.to_parquet(raw_path)

    return summary

In [None]:
def prep_schedule_rt_route_direction_summary(download_date: str) -> pd.DataFrame:
    """Download, tidy, and save the schedule/RT route-direction summary.

    Steps: download the raw table, keep the columns used by the digest,
    run ``portfolio_utils.standardize_portfolio_organization_names``, drop
    duplicate rows, convert column names to Title Case, format the month as
    "Month YYYY", derive minute/headway columns, and write the result to the
    ``processed/`` folder of the digest bucket.

    Args:
        download_date: Suffix used for both the raw and processed parquet
            file names.

    Returns:
        The cleaned table that was written to GCS.
    """
    df = download_schedule_rt_route_direction_summary(download_date)

    keep_cols = [
        "month_first_day",
        "name",
        "route_name",
        "direction_id",
        "frequency_all_day",
        "frequency_offpeak",
        "frequency_peak",
        "daily_service_hours",
        "daily_trips_peak",
        "daily_trips_offpeak",
        "daily_trips_all_day",
        "day_type",
    ]
    df2 = df[keep_cols]

    # Temporarily add analysis_name
    # NOTE(review): relies on the helper to add that column -- confirm.
    df2 = df2.pipe(portfolio_utils.standardize_portfolio_organization_names)
    df2 = df2.drop_duplicates()

    # Clean columns: snake_case -> Title Case, month shown as e.g. "January 2025"
    df2.columns = df2.columns.str.replace("_", " ").str.title()
    df2["Month First Day"] = pd.to_datetime(df2["Month First Day"]).dt.strftime("%B %Y")
    df2 = df2.rename(
        columns={
            "Direction Id": "Direction",
            "Month First Day": "Date",
            "Route Name": "Route",
        }
    )

    # Add some new columns
    df2["Daily Service Minutes"] = df2["Daily Service Hours"] * 60
    df2["Average Scheduled Minutes"] = (
        df2["Daily Service Minutes"] / df2["Daily Trips All Day"]
    )
    # Headway in minutes = 60 / frequency.
    # NOTE(review): assumes the frequency columns are trips per hour -- confirm.
    df2["Headway All Day"] = 60 / df2["Frequency All Day"]
    df2["Headway Peak"] = 60 / df2["Frequency Peak"]
    df2["Headway Offpeak"] = 60 / df2["Frequency Offpeak"]

    # Save
    df2.to_parquet(
        f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}processed/schedule_rt_route_direction_summary_{download_date}.parquet"
    )
    return df2

In [None]:
# schedule_rt_route_direction_summary = prep_schedule_rt_route_direction_summary(date_str)

## 2) fct_monthly_operator_summary

In [None]:
def download_operator_summary(download_date: str) -> pd.DataFrame:
    """Fetch the monthly operator summary and archive the raw copy.

    Args:
        download_date: Suffix appended to the raw parquet file name.

    Returns:
        The downloaded table, exactly as it was written to GCS.
    """
    operator_summary = _sql_query.download_with_pandas_gbq(
        filename="tiffany_mart_gtfs_rollup.fct_monthly_operator_summary",
        project="cal-itp-data-infra-staging",
    )

    # Archive the untouched download in our private GCS bucket.
    raw_path = (
        f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}"
        f"raw/fct_monthly_operator_summary_{download_date}.parquet"
    )
    operator_summary.to_parquet(raw_path)
    return operator_summary

In [None]:
def clean_operator_summary(download_date: str) -> pd.DataFrame:
    """Download, tidy, and save the monthly operator summary.

    Keeps the columns used by the digest, converts column names to Title
    Case (with "Vp"/"Tu" upper-cased to "VP"/"TU"), adds two percentage
    columns, and writes the result to the ``processed/`` folder of the
    digest bucket.

    Args:
        download_date: Suffix used for both the raw and processed parquet
            file names.

    Returns:
        The cleaned table that was written to GCS.
    """
    df = download_operator_summary(download_date)

    keep_cols = [
        "month_first_day",
        "analysis_name",
        "vp_name",
        "tu_name",
        "n_trips",
        "day_type",
        "daily_trips",
        "ttl_service_hours",
        "n_routes",
        "n_days",
        "n_shapes",
        "n_stops",
        "vp_messages_per_minute",
        "n_vp_trips",
        "daily_vp_trips",
        "pct_vp_trips",
        "n_vp_routes",
        "pct_vp_service_hours",
        "tu_messages_per_minute",
        "n_tu_trips",
        "daily_tu_trips",
        "pct_tu_trips",
        "n_tu_routes",
        "pct_tu_service_hours",
    ]
    df2 = df[keep_cols]

    # snake_case -> Title Case, then the display-friendly renames.
    df2.columns = df2.columns.str.replace("_", " ").str.title()
    df2 = df2.rename(
        columns={
            "Month First Day": "Date",
        }
    )
    # Title-casing produced "Vp"/"Tu"; restore the acronyms.
    df2.columns = df2.columns.str.replace("Vp", "VP").str.replace("Tu", "TU")

    # Create a couple of new columns
    df2["Percent of Trips with Trip Updates"] = (
        df2["N TU Trips"] / df2["N Trips"]
    ) * 100

    df2["Percent of Trips with Vehicle Positions"] = (
        df2["N VP Trips"] / df2["N Trips"]
    ) * 100

    # NOTE: capping both percentage columns at 100 with .clip(upper=100.0)
    # is intentionally disabled for now; values above 100 are kept as-is.

    # Save
    df2.to_parquet(
        f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}processed/fct_monthly_operator_summary_{download_date}.parquet"
    )
    return df2

In [None]:
# Runs the download + clean for the operator summary; this writes both the
# raw and processed parquets for the current date_str to the digest bucket.
monthly_operator_summary_clean = clean_operator_summary(date_str)

## 3) fct_monthly_routes 

In [None]:
def download_fct_monthly_routes(download_date: str) -> pd.DataFrame:
    """Fetch the monthly routes table and archive the raw copy.

    Args:
        download_date: Suffix appended to the raw parquet file name.

    Returns:
        The downloaded table, exactly as it was written to GCS.
    """
    monthly_routes = _sql_query.download_with_pandas_gbq(
        filename="tiffany_mart_gtfs_rollup.fct_monthly_routes",
        project="cal-itp-data-infra-staging",
    )

    # Archive the untouched download in our private GCS bucket.
    raw_path = (
        f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}"
        f"raw/fct_monthly_routes_{download_date}.parquet"
    )
    monthly_routes.to_parquet(raw_path)
    return monthly_routes

In [None]:
# fct_monthly_routes = download_fct_monthly_routes(date_str)

In [None]:
def clean_fct_monthly_routes(download_date: str) -> pd.DataFrame:
    """Download the monthly routes and return one geometry row per route.

    Keeps the most recent month's row for each (analysis_name, route_name),
    attaches route_type / route_color / route_typology from the raw
    schedule_rt_route_direction_summary parquet, and converts ``pt_array``
    to a line geometry with ``geo_utils.convert_to_gdf``.

    The raw schedule_rt_route_direction_summary parquet for the same
    ``download_date`` must already exist in the digest bucket (it is written
    by ``download_schedule_rt_route_direction_summary``).

    Args:
        download_date: Suffix of the raw parquet files to write and read.

    Returns:
        The result of ``geo_utils.convert_to_gdf``. Nothing is written to the
        ``processed/`` folder at the moment (see the TODO at the end).
    """
    df = download_fct_monthly_routes(download_date)

    # Add analysis name
    df = df.pipe(portfolio_utils.standardize_portfolio_organization_names)

    # Keep only the most recent route geography: newest month sorts first,
    # so drop_duplicates keeps the latest row for each operator/route.
    df2 = df.sort_values(
        by=["month_first_day", "analysis_name", "route_name"],
        ascending=[False, True, True],
    ).drop_duplicates(subset=["analysis_name", "route_name"])

    # Drop from the de-duplicated frame. This used to drop from `df`, which
    # silently threw away the de-duplication above.
    df2 = df2.drop(
        columns=[
            "shape_id",
            "shape_array_key",
            "n_trips",
            "direction_id",
        ]
    )

    # Add route_type, route_color, and route_typology
    # The source table has several rows per route (it also carries
    # direction_id and day_type), so collapse identical lookup rows first;
    # otherwise the left merge repeats each route once per lookup row.
    route_info_df = pd.read_parquet(
        f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}raw/schedule_rt_route_direction_summary_{download_date}.parquet"
    )[
        [
            "name",
            "month_first_day",
            "route_name",
            "route_type",
            "route_color",
            "route_typology",
        ]
    ].drop_duplicates()

    m1 = pd.merge(
        df2,
        route_info_df,
        on=[
            "name",
            "month_first_day",
            "route_name",
        ],
        how="left",
    ).drop(columns=["month_first_day"])

    # Convert the geometry to line
    m1 = geo_utils.convert_to_gdf(m1, "pt_array", "line")

    # TODO: saving the processed file is currently disabled. When re-enabled,
    # do not assign the result of the write back to m1:
    # gcs_geopandas().geo_data_frame_to_parquet(
    #     m1,
    #     f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}processed/fct_monthly_routes_{download_date}.parquet"
    # )
    return m1

In [None]:
# NOTE(review): this variable holds the cleaned *monthly routes*, not the
# operator hourly summary -- the name looks like a copy/paste leftover.
# Consider renaming once no other cell depends on it.
fct_operator_hourly_summary = clean_fct_monthly_routes(date_str)

## 4) fct_operator_hourly_summary

In [None]:
def download_fct_operator_hourly_summary(download_date: str) -> pd.DataFrame:
    """Fetch the operator hourly summary and archive the raw copy.

    Args:
        download_date: Suffix appended to the raw parquet file name.

    Returns:
        The downloaded table, exactly as it was written to GCS.
    """
    hourly_summary = _sql_query.download_with_pandas_gbq(
        filename="tiffany_mart_gtfs_rollup.fct_operator_hourly_summary",
        project="cal-itp-data-infra-staging",
    )

    # Archive the untouched download in our private GCS bucket.
    raw_path = (
        f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}"
        f"raw/fct_operator_hourly_summary_{download_date}.parquet"
    )
    hourly_summary.to_parquet(raw_path)
    return hourly_summary

In [None]:
# fct_operator_hourly_summary = download_fct_operator_hourly_summary(date_str)

In [None]:
def clean_fct_operator_hourly_summary(download_date: str) -> pd.DataFrame:
    """Download, aggregate, and save the operator hourly summary.

    Sums ``n_trips`` per (analysis_name, month_first_day, day_type,
    departure_hour), converts column names to Title Case, formats the month
    as "MM-YYYY", and writes the result to the ``processed/`` folder of the
    digest bucket.

    Args:
        download_date: Suffix used for both the raw and processed parquet
            file names.

    Returns:
        The aggregated table that was written to GCS.
    """
    df = download_fct_operator_hourly_summary(download_date)

    df2 = (
        df.groupby(["analysis_name", "month_first_day", "day_type", "departure_hour"])
        .agg({"n_trips": "sum"})
        .reset_index()
    )

    df2.columns = df2.columns.str.replace("_", " ").str.title()

    df2 = df2.rename(columns={"Month First Day": "Date"})

    # Coerce to datetime before using .dt, as the route-direction prep does;
    # .dt raises if the column arrives as plain date objects or strings.
    df2["Date"] = pd.to_datetime(df2["Date"]).dt.strftime("%m-%Y")

    df2.to_parquet(
        f"{GTFS_DATA_DICT.gcs_paths.DIGEST_GCS}processed/fct_operator_hourly_summary_{download_date}.parquet"
    )
    return df2

In [None]:
# clean_fct_operator_hourly_summary_df = clean_fct_operator_hourly_summary(date_str)

## Publish everything to GCS

In [None]:
# Directory prefix for the digest tables; prepended to each file name below.
GCS = GTFS_DATA_DICT["digest_tables_ah"].dir

In [None]:
# Display one catalog entry to check what the file-name stem looks like.
GTFS_DATA_DICT["digest_tables_ah"].schedule_rt_route_direction_summary

In [None]:
# Look up the catalog entry for each table that gets published.
file_names = [
    GTFS_DATA_DICT["digest_tables_ah"][table_key]
    for table_key in (
        "schedule_rt_route_direction_summary",
        "fct_monthly_operator_summary",
        "fct_operator_hourly_summary",
    )
]

In [None]:
# Append the year_month suffix and the parquet extension to every stem.
file_names = [f"{stem}_{date_str}.parquet" for stem in file_names]

In [None]:
# Prefix every file name with the digest tables directory to get full paths.
file_names = [f"{GCS}{parquet_name}" for parquet_name in file_names]

In [None]:
# Publish each parquet to PUBLIC_GCS under gtfs_digest/, keeping its file name.
for f in file_names:
    publish_utils.write_to_public_gcs(f, f"gtfs_digest/{Path(f).name}", PUBLIC_GCS)

In [None]:
# Export each parquet through publish_public_data; filetype="df" marks these
# as tabular files.
# NOTE(review): presumably "df" yields CSV rather than GeoJSON -- confirm.
for f in file_names:
    publish_public_data.export_parquet_as_csv_or_geojson(f, filetype="df")