In [None]:
from pathlib import Path

import pandas as pd
from siuba import *

In [None]:
import sys
sys.path.append('../../ntd/')
from update_vars import GCS_FILE_PATH, PUBLIC_FILENAME, YEAR, MONTH

## Read NTD using Tiffany's functions from `ntd/`

In [None]:
# Read the monthly ridership parquet for the configured YEAR / MONTH.
df = pd.read_parquet(f"{GCS_FILE_PATH}ca_monthly_ridership_{YEAR}_{MONTH}.parquet")

# Discard the original Mode / TOS columns and let the *_full columns
# take over their names.
df = df.drop(columns=["Mode", "TOS"])
df = df.rename(columns={"Mode_full": "Mode", "TOS_full": "TOS"})

In [None]:
# Pick out the columns that are recent enough to plot.
MIN_YEAR = 2018

# Columns with a "/" in their name; the text after the first "/" is parsed
# as a year below (the variable name suggests the rest are identifier columns).
not_id_cols = [col for col in df.columns if "/" in col]

# Keep columns from MIN_YEAR onward, skipping any whose name contains "pct".
recent_years = [
    col
    for col in not_id_cols
    if int(col.split("/")[1]) >= MIN_YEAR and "pct" not in col
]

# Split the remainder by whether the name contains "change".
upt_cols = [col for col in recent_years if "change" not in col]
change_cols = [col for col in recent_years if "change" in col]

In [None]:
def sum_by_group(
    df: pd.DataFrame, group_cols: list, value_cols: list = None
) -> pd.DataFrame:
    """
    Sum wide-format value columns within each group.

    Since df is wide, pivot_table() is used to sum every column in
    `value_cols` for each unique combination of `group_cols`.

    Parameters
    ----------
    df : pd.DataFrame
        Wide table containing both the grouping and the value columns.
    group_cols : list
        Column names that define a group.
    value_cols : list, optional
        Column names to sum. When omitted, falls back to the notebook-level
        `recent_years` list, which is what this function always used before
        the parameter existed.

    Returns
    -------
    pd.DataFrame
        One row per group, with columns ordered as group_cols + value_cols.
    """
    if value_cols is None:
        # Backward-compatible default: read the notebook global.
        value_cols = recent_years

    group_cols = list(group_cols)
    value_cols = list(value_cols)

    grouped_df = (
        df.pivot_table(index=group_cols, values=value_cols, aggfunc="sum")
        .reset_index()
        # pivot_table sorts the value columns by name; restore caller order.
        .reindex(columns=group_cols + value_cols)
    )

    return grouped_df

def make_long(df: pd.DataFrame, group_cols: list, value_cols: list):
    """
    Melt `value_cols` into long form, keeping `group_cols` as identifiers.

    The result has the group columns plus `variable` (the original column
    name with any "change_1yr_" text removed) and `value`.
    """
    keep_cols = group_cols + value_cols
    melted = df[keep_cols].melt(id_vars=group_cols, value_vars=value_cols)

    # Drop the "change_1yr_" text so change columns and plain columns
    # end up with the same variable label.
    melted["variable"] = melted["variable"].str.replace("change_1yr_", "")

    return melted

In [None]:
# Grouping keys for each aggregation level (RTPA is part of every one).
agency_cols = ["NTD ID", "Agency", "RTPA"]
mode_cols = ["Mode", "RTPA"]
tos_cols = ["TOS", "RTPA"]

# Build one summed table per grouping.
by_agency, by_mode, by_tos = [
    sum_by_group(df, cols) for cols in (agency_cols, mode_cols, tos_cols)
]

In [None]:
# Inspect which columns the agency-level aggregation produced.
by_agency.columns

In [None]:
# Agency identifiers plus the single "9/2023" column, relabelled as
# upt_sep_2023. NOTE(review): the month is hard-coded here rather than
# derived from the YEAR / MONTH config values.
sep_agency = (
    by_agency
    .rename(columns={"9/2023": "upt_sep_2023"})
    [["NTD ID", "Agency", "RTPA", "upt_sep_2023"]]
)

In [None]:
# Preview the first three rows.
sep_agency >> head(3)

In [None]:
from calitp_data_analysis.tables import tbls

In [None]:
# Query dim_organizations, keep rows flagged _is_current, select the
# record id, name and NTD id, then collect() the result.
# (presumably collect() materialises the lazy query as a DataFrame -- confirm)
ntd_organziations = (tbls.mart_transit_database.dim_organizations()
                        >> filter(_._is_current)
                        >> select(_.source_record_id, _.name, _.ntd_id)
                        >> collect()
                    )

In [None]:
# Inner-join the organizations onto the September ridership table,
# matching ntd_id (left) to "NTD ID" (right).
# NOTE(review): this overwrites ntd_organziations with the joined result, so
# re-running this cell without re-running the query cell joins a second time.
ntd_organziations = ntd_organziations >> inner_join(_, sep_agency, on = {'ntd_id': 'NTD ID'})

In [None]:
# Preview the first three joined rows.
ntd_organziations >> head(3)

In [None]:
# Load the "2022 Service" workbook from the notebook's own directory.
# (presumably the NTD annual service data for 2022 -- confirm the source)
service = pd.read_excel('./2022 Service.xlsx')

In [None]:
from calitp_data_analysis.sql import to_snakecase

In [None]:
# Run the frame through to_snakecase (later cells reference snake_case
# column names), then keep only the rows whose time_period is
# 'Average Typical Weekday'.
service = to_snakecase(service)
service = service >> filter(_.time_period == 'Average Typical Weekday')

In [None]:
# Inspect the column names available in the service table.
service.columns

In [None]:
# Sum revenue hours and revenue miles per ntd_id.
# NOTE(review): the hours column is spelled "actual_vehicle_..." while the
# miles column is "actual_vehicles_..."; confirm both match the workbook headers.
service_grouped = service >> group_by('ntd_id') >> summarize(weekday_vrh = _.actual_vehicle_passenger_car_revenue_hours.sum(),
                                                            weekday_vrm = _.actual_vehicles_passenger_car_revenue_miles.sum())

In [None]:
# Check column dtypes before ntd_id is converted to a string in the next cell.
service_grouped.dtypes

In [None]:
# Cast ntd_id to string and left-pad with zeros to five characters so it can
# be joined against the string ntd_id used elsewhere.
# NOTE(review): if ntd_id was parsed as float, astype(str) yields values
# like "1234.0" -- confirm the dtype shown above is integer.
service_grouped["ntd_id"] = service_grouped["ntd_id"].astype(str).str.zfill(5)

In [None]:
# Rough estimate: floor-divide the September 2023 total by 21.
# (presumably 21 approximates the number of weekdays in the month -- confirm)
ntd_organziations["weekday_ridership"] = ntd_organziations["upt_sep_2023"] // 21

In [None]:
# Inner-join the service totals on ntd_id; organizations without a matching
# service row are dropped by the inner join.
with_service = ntd_organziations >> inner_join(_, service_grouped, on = 'ntd_id')

In [None]:
# Display the joined ridership + service table.
with_service

## Add UZA (urbanized area) information

In [None]:
# Load the "2022 Agency Information" workbook and pass it through
# to_snakecase in one step (later cells reference snake_case column names).
info = to_snakecase(pd.read_excel("2022 Agency Information.xlsx"))

In [None]:
# Inspect the column names available in the agency information table.
info.columns

In [None]:
# Keep rows where state is 'CA' and only the id, agency name and UZA columns.
# This overwrites `info` with the narrowed table.
info = info >> filter(_.state == 'CA') >> select(_.ntd_id, _.agency_name, _.primary_uza_uace_code,
                                          _.uza_name)

In [None]:
# Display the filtered agency information table.
info

In [None]:
# Cast ntd_id to string and left-pad with zeros to five characters, the same
# treatment applied to the service table's ntd_id before joining.
info["ntd_id"] = info["ntd_id"].astype(str).str.zfill(5)

In [None]:
# Inner-join the agency / UZA attributes on ntd_id; rows without a match in
# `info` are dropped by the inner join.
with_info = with_service >> inner_join(_, info, on = 'ntd_id')

In [None]:
# Sum weekday_ridership for each (primary_uza_uace_code, uza_name) pair.
uza_totals = with_info >> group_by(_.primary_uza_uace_code, _.uza_name) >> summarize(uza_ridership = _.weekday_ridership.sum())

In [None]:
# Attach each UZA's total back onto the per-agency rows, then compute every
# row's weekday ridership as a fraction of its UZA total.
market_shares = (with_info >> inner_join(_, uza_totals, on = ['uza_name', 'primary_uza_uace_code'])
           >> mutate(uza_share = _.weekday_ridership / _.uza_ridership)
)

In [None]:
# Spot-check: rows whose uza_name contains 'Los Angeles', largest share first
# (arrange on the negated share sorts descending).
market_shares >> filter(_.uza_name.str.contains('Los Angeles')) >> arrange(-_.uza_share)

In [None]:
# Save the draft market-share table.
# to_parquet does not create missing parent directories, so make sure the
# output folder exists first (a no-op when it is already there).
Path("intermediate").mkdir(parents=True, exist_ok=True)
market_shares.to_parquet('intermediate/draft_ntd_market_shares.parquet')