This notebook was used to find which are good candidates to be reference days for the holiday service analysis.  It finds the total trips for the agencies in our analysis on either weekdays, saturdays, or sundays.

In [None]:
# %load_ext autoreload



In [17]:

# %autoreload 2

from dotenv import load_dotenv
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pyairtable import Api
from sklearn.metrics import confusion_matrix

from funcs_vars import holiday_columns, holidays_plus_ref, text_data_cols, plot_confusion_matrices

load_dotenv()
api = Api(os.getenv('AIRTABLE_TOKEN'))

os.environ["CALITP_BQ_MAX_BYTES"] = str(20_000_000_000)
from calitp_data_analysis.sql import query_sql

# Trying to stay consistent with 
# https://github.com/cal-itp/data-infra/blob/main/airflow/plugins/operators/airtable_to_gcs.py
def all_rows_as_df(base_id, table_name):
    all_rows = api.table(base_id=base_id, table_name=table_name).all()

    df = pd.DataFrame(
        [
            {"id":row["id"], **row["fields"]}
            for row in all_rows
        ]
    )
    return df

def takeout_list(x):
    if x is not np.nan:
        return x[0]

CALIFORNIA_TRANSIT_ID = "appPnJWrQ7ui4UmIl"
SERVICES_ID = 'tbl9YmMrJ14D5oPSV'
GTFS_SERVICES = 'tblnVt5FZ2FZmDjDx'
services_df = all_rows_as_df(CALIFORNIA_TRANSIT_ID, SERVICES_ID)
gtfs_services = all_rows_as_df(CALIFORNIA_TRANSIT_ID, GTFS_SERVICES)

services_df['Total VOMS (NTD) (from Provider)'] = services_df['Total VOMS (NTD) (from Provider)'].apply(takeout_list)
services_df = services_df.loc[~services_df['Holiday Schedule – Veterans Day'].isnull(),]
services_df = services_df.loc[services_df['Public Currently Operating Fixed Route'] == 'Yes',]

int_entities = query_sql("""
SELECT 
gtfs_dataset_name,
service_source_record_id,
gtfs_service_data_source_record_id,
use_subfeed_for_reports,
regional_feed_type
FROM `cal-itp-data-infra.staging.int_gtfs_quality__daily_assessment_candidate_entities` 
WHERE gtfs_dataset_type = 'schedule'
AND
date = '2024-01-29'
AND
gtfs_dataset_name != "Bay Area 511 Regional Schedule"
order by organization_name, gtfs_dataset_name ASC
""", as_df=True)

int_entities = pd.merge(int_entities,gtfs_services[['id','Customer Facing']], how='left', left_on='gtfs_service_data_source_record_id',right_on='id').drop(labels=['id'],axis=1)

int_entities = int_entities.query("use_subfeed_for_reports==False | regional_feed_type=='Regional Subfeed'")

services_plus_service_names = pd.merge(services_df,int_entities[['service_source_record_id','gtfs_dataset_name','Customer Facing']]
                                       ,left_on='id',right_on='service_source_record_id',how='left',indicator=True)

services_plus_service_names = services_plus_service_names.loc[~services_plus_service_names['gtfs_dataset_name'].isnull(),]


In [18]:
days_to_consider = []
for each in holidays_plus_ref:
    days_to_consider += [each['date']]

dates_for_query = ','.join(map("'{0}'".format, days_to_consider))

#DowneyLINK replaced their GTFS name from DowneyLINK Schedule to DowneyLINK GMV Schedule on Jan 1st, 2024.  
#This will help collect all their data from both 2023 and 2024.
transit_names_for_query = ','.join(map("'{0}'".format, services_plus_service_names['gtfs_dataset_name'].values)) + ", 'DowneyLINK Schedule'"


In [81]:
start_date = pd.to_datetime('2023-10-01')
end_date = pd.to_datetime('2024-01-18')

# Create a frequency object for Saturdays
freq = 'W-SAT'  # Weekly on Saturdays

# Generate a DatetimeIndex between the start and end dates with Saturday frequency
date_range = pd.date_range(start_date, end_date, freq=freq)

# Convert the DatetimeIndex to a list of dates (modify the format if desired)
saturday_dates = date_range.strftime('%Y-%m-%d').tolist()

# Print the list of Saturdays
print(saturday_dates)

# Create a frequency object for Saturdays
freq = 'W-SUN'  # Weekly on Saturdays

# Generate a DatetimeIndex between the start and end dates with Saturday frequency
date_range = pd.date_range(start_date, end_date, freq=freq)

# Convert the DatetimeIndex to a list of dates (modify the format if desired)
sunday_dates = date_range.strftime('%Y-%m-%d').tolist()

# Print the list of Saturdays
print(sunday_dates)

date_range = pd.date_range(start_date, end_date)
weekdays_dates = date_range[date_range.dayofweek < 5].strftime('%Y-%m-%d').tolist()
print(weekdays_dates)

# weekdays_data = dates[dates.dayofweek < 5]
saturday_dates = ','.join(map("'{0}'".format, saturday_dates))
sunday_dates = ','.join(map("'{0}'".format, sunday_dates))
weekdays_dates = ','.join(map("'{0}'".format, weekdays_dates))

['2023-10-07', '2023-10-14', '2023-10-21', '2023-10-28', '2023-11-04', '2023-11-11', '2023-11-18', '2023-11-25', '2023-12-02', '2023-12-09', '2023-12-16', '2023-12-23', '2023-12-30', '2024-01-06', '2024-01-13']
['2023-10-01', '2023-10-08', '2023-10-15', '2023-10-22', '2023-10-29', '2023-11-05', '2023-11-12', '2023-11-19', '2023-11-26', '2023-12-03', '2023-12-10', '2023-12-17', '2023-12-24', '2023-12-31', '2024-01-07', '2024-01-14']
['2023-10-02', '2023-10-03', '2023-10-04', '2023-10-05', '2023-10-06', '2023-10-09', '2023-10-10', '2023-10-11', '2023-10-12', '2023-10-13', '2023-10-16', '2023-10-17', '2023-10-18', '2023-10-19', '2023-10-20', '2023-10-23', '2023-10-24', '2023-10-25', '2023-10-26', '2023-10-27', '2023-10-30', '2023-10-31', '2023-11-01', '2023-11-02', '2023-11-03', '2023-11-06', '2023-11-07', '2023-11-08', '2023-11-09', '2023-11-10', '2023-11-13', '2023-11-14', '2023-11-15', '2023-11-16', '2023-11-17', '2023-11-20', '2023-11-21', '2023-11-22', '2023-11-23', '2023-11-24', '20

In [80]:
# weekdays_dates

In [41]:
dates_for_query = ','.join(map("'{0}'".format, saturday_dates))

In [82]:
dates_for_query = sunday_dates
sunday_trips = query_sql(f"""
select name, count(trip_instance_key) as total_trips, service_date
from `cal-itp-data-infra.mart_gtfs.fct_scheduled_trips` 
WHERE
service_date in ({dates_for_query})
AND
name in ({transit_names_for_query})
group by name, service_date
""", as_df=True)

dates_for_query = saturday_dates
saturday_trips = query_sql(f"""
select name, count(trip_instance_key) as total_trips, service_date
from `cal-itp-data-infra.mart_gtfs.fct_scheduled_trips` 
WHERE
service_date in ({dates_for_query})
AND
name in ({transit_names_for_query})
group by name, service_date
""", as_df=True)

dates_for_query = weekdays_dates
weekday_trips = query_sql(f"""
select name, count(trip_instance_key) as total_trips, service_date
from `cal-itp-data-infra.mart_gtfs.fct_scheduled_trips` 
WHERE
service_date in ({dates_for_query})
AND
name in ({transit_names_for_query})
group by name, service_date
""", as_df=True)

In [83]:
weekday_trips['service_date'].value_counts()

2023-12-14    172
2023-11-30    172
2023-12-15    171
2023-11-28    170
2023-12-12    170
             ... 
2023-11-24    148
2024-01-15    131
2023-12-25     79
2023-11-23     77
2024-01-01     76
Name: service_date, Length: 79, dtype: int64

In [97]:
sunday_trips['service_date'].value_counts()

2023-12-10    100
2023-11-26    100
2023-12-17    100
2023-10-29    100
2023-11-05     99
2023-12-03     99
2023-10-15     99
2023-11-19     99
2024-01-14     98
2023-10-22     98
2024-01-07     98
2023-10-01     98
2023-11-12     98
2023-10-08     96
2023-12-31     92
2023-12-24     92
Name: service_date, dtype: int64

In [96]:
saturday_trips['service_date'].value_counts()

2023-11-25    142
2023-12-09    142
2023-12-16    142
2023-11-18    141
2023-11-04    141
2023-10-14    141
2023-12-02    140
2023-10-28    140
2023-10-07    139
2023-10-21    139
2023-12-23    138
2024-01-06    137
2024-01-13    136
2023-12-30    136
2023-11-11    133
Name: service_date, dtype: int64

In [68]:
sunday_trips.groupby('service_date').sum().sort_values(by='total_trips',ascending=False)

  sunday_trips.groupby('service_date').sum().sort_values(by='total_trips',ascending=False)


Unnamed: 0_level_0,total_trips
service_date,Unnamed: 1_level_1
2024-01-14,62105
2023-12-03,60427
2023-10-15,60273
2023-12-17,59938
2023-10-22,59933
2024-01-07,59900
2023-11-26,59894
2023-10-29,59519
2023-10-01,59383
2023-11-05,59367


In [69]:
saturday_trips.groupby('service_date').sum().sort_values(by='total_trips',ascending=False)

  saturday_trips.groupby('service_date').sum().sort_values(by='total_trips',ascending=False)


Unnamed: 0_level_0,total_trips
service_date,Unnamed: 1_level_1
2024-01-13,67535
2023-10-14,67128
2023-12-09,67097
2023-12-16,66560
2023-11-25,66547
2023-10-21,66386
2023-10-07,66379
2023-11-04,66346
2023-11-11,65741
2024-01-06,64632


In [84]:
weekday_trips.groupby('service_date').sum().sort_values(by='total_trips',ascending=False)

  weekday_trips.groupby('service_date').sum().sort_values(by='total_trips',ascending=False)


Unnamed: 0_level_0,total_trips
service_date,Unnamed: 1_level_1
2023-12-04,102366
2023-12-05,102294
2023-12-06,102258
2023-12-07,102212
2023-12-15,102211
...,...
2023-12-20,88037
2024-01-15,87537
2023-11-23,60915
2024-01-01,56175


In [None]:
#I think it's reasonable to use 12-16 and 12-17 as reference days for Saturday and Sunday.  They both have high counts of trips.
# 12-15 is a good date to use for weekdays
