Notebook to produce data regarding 2023 Holiday Service:
https://caltrans.sharepoint.com/:w:/s/DOTPMPHQ-DataandDigitalServices/EVEcAgAwsK1AhL7pQDa22TcBlLF5ZLF-SYOGORhrQrIOCA?e=BX6lkA

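Estimate the organization-level impact of missing holiday service information: average daily unlinked passenger trips (2022 NTD annual UPT / 365), grouped by each organization's `Holiday Website Status`. This uses the orgs table, not the services table.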

In [1]:
%load_ext autoreload


In [2]:
%autoreload 2

from dotenv import load_dotenv
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pyairtable import Api
from sklearn.metrics import confusion_matrix

from funcs_vars import excel_col_order, holiday_columns, holidays_plus_ref, text_data_cols, plot_confusion_matrices

# Load variables from a .env file into the process environment
# (presumably where AIRTABLE_TOKEN lives — confirm), so the token is not hard-coded here.
load_dotenv()
# Airtable client used by all_rows_as_df; os.getenv returns None when AIRTABLE_TOKEN is unset.
api = Api(os.getenv('AIRTABLE_TOKEN'))

# NOTE(review): presumably a 20 GB cap on bytes scanned per BigQuery query, and presumably
# set before the import below because calitp_data_analysis reads it at import time — confirm.
os.environ["CALITP_BQ_MAX_BYTES"] = str(20_000_000_000)
from calitp_data_analysis.sql import query_sql

In [3]:
# Trying to stay consistent with 
# https://github.com/cal-itp/data-infra/blob/main/airflow/plugins/operators/airtable_to_gcs.py
def all_rows_as_df(base_id, table_name):
    """Download every record of an Airtable table into a DataFrame.

    Uses the module-level ``api`` client.

    Parameters
    ----------
    base_id : str
        Identifier of the Airtable base.
    table_name : str
        Name (or identifier) of the table inside that base.

    Returns
    -------
    pandas.DataFrame
        One row per record: an ``id`` column holding the record id, followed by
        one column per key found in the records' ``fields`` mappings.
    """
    table = api.table(base_id=base_id, table_name=table_name)

    flattened = []
    for record in table.all():
        # Record id first, then the record's fields spread out as top-level keys.
        flat = {"id": record["id"]}
        flat.update(record["fields"])
        flattened.append(flat)

    return pd.DataFrame(flattened)

def takeout_list(x):
    """Return the first element of a list-valued cell, or None if there is none.

    Presumably meant for ``Series.apply`` on columns whose cells hold lists
    mixed with missing values — confirm against callers.

    Parameters
    ----------
    x : sequence, None, NaN or pd.NA
        The cell value.

    Returns
    -------
    object or None
        ``x[0]`` for a non-empty sequence; ``None`` when ``x`` is missing
        (``None``, any float NaN, ``pd.NA``) or is an empty sequence.
    """
    # An identity test against np.nan only recognises that one singleton object.
    # NaNs created elsewhere (float("nan"), np.float64 results) are different
    # objects and would reach x[0] and raise TypeError, so test the value instead.
    if x is None or x is pd.NA or (isinstance(x, float) and np.isnan(x)):
        return None
    try:
        return x[0]
    except IndexError:
        # Empty sequence: nothing to take out, treat it like a missing value.
        return None

# Airtable identifiers, passed to all_rows_as_df as base_id and table_name respectively.
CALIFORNIA_TRANSIT_ID = "appPnJWrQ7ui4UmIl"  # base
ORGS_ID = "tblFsd8D5oFRqep8Z"  # orgs table

In [4]:
# Load the orgs table and keep only organizations that have both a 2022 NTD ID
# and a recorded holiday-website status.
orgs_df = (
    all_rows_as_df(CALIFORNIA_TRANSIT_ID, ORGS_ID)
    .dropna(subset=['ntd_id_2022', 'Holiday Website Status'])
)

In [5]:
# Number of distinct NTD IDs among the orgs kept by the filters above.
orgs_df['ntd_id_2022'].nunique()

160

In [6]:
# How many of the filtered orgs fall under each holiday-website status.
orgs_df['Holiday Website Status'].value_counts()

Current       143
Missing        12
Old             3
Off-Season      2
Name: Holiday Website Status, dtype: int64

In [7]:
# Preview the two columns used downstream: the NTD ID (query filter and merge key)
# and the status (grouping key for the final summary).
orgs_df[['ntd_id_2022','Holiday Website Status']]

Unnamed: 0,ntd_id_2022,Holiday Website Status
1,90023,Current
7,90281,Missing
17,90027,Current
25,90267,Current
27,91088,Current
...,...,...
1349,90121,Current
1368,91093,Current
1381,99424,Current
1386,91059,Current


In [8]:
# De-duplicate the NTD IDs and render them as a quoted, comma-separated list for the
# SQL IN (...) clause. Sorting (by string form, so mixed types cannot break the sort)
# keeps the generated query text identical from run to run; plain set iteration
# order is not stable across kernel restarts.
ntd_ids = sorted(set(orgs_df['ntd_id_2022'].dropna()), key=str)
ntd_ids_for_query = ','.join(f"'{ntd_id}'" for ntd_id in ntd_ids)

In [9]:
# Query the NTD agency-service table for 2022 "Annual Total" unlinked passenger trips (UPT),
# summed per agency and restricted to the NTD IDs of the filtered orgs.
# ntd_id is aliased to ntd_id_2022 so it shares a column name with orgs_df for the later merge.
upt = query_sql(f"""
SELECT year, ntd_id as ntd_id_2022, agency_name, reporter_type, time_period, sum(unlinked_passenger_trips__upt_) as sum_unlinked_passenger_trips_upt  
FROM `cal-itp-data-infra.mart_ntd.dim_annual_ntd_agency_service` 
where ntd_id in ({ntd_ids_for_query})
and time_period = 'Annual Total'
and year = 2022
group by 1,2,3,4,5;
""", as_df=True)
upt.head()

Unnamed: 0,year,ntd_id_2022,agency_name,reporter_type,time_period,sum_unlinked_passenger_trips_upt
0,2022,90079,SunLine Transit Agency,Full Reporter,Annual Total,2298805.0
1,2022,90200,Kings County Area Public Transit Agency,Full Reporter,Annual Total,490448.0
2,2022,90196,County of Placer,Full Reporter,Annual Total,683109.0
3,2022,90233,Yuma County Intergovernmental Public Transport...,Full Reporter,Annual Total,354065.0
4,2022,90154,Los Angeles County Metropolitan Transportation...,Full Reporter,Annual Total,254688124.0


In [10]:
# Attach 2022 UPT to each org (left join keeps every org row).
# The join key is named explicitly so the merge cannot silently change if the two
# frames ever share another column name. validate='many_to_one' fails loudly if `upt`
# holds more than one row for an NTD ID, which would duplicate org rows and
# double-count trips in the per-status totals.
# indicator=True adds `_merge`, so orgs with no UPT match show up as "left_only".
df = pd.merge(
    orgs_df[['Name', 'ntd_id_2022', 'Holiday Website Status']],
    upt[['ntd_id_2022', 'agency_name', 'sum_unlinked_passenger_trips_upt']],
    how='left',
    on='ntd_id_2022',
    validate='many_to_one',
    indicator=True,
)

In [11]:
# How many orgs ended up without a 2022 UPT figure after the merge.
df['sum_unlinked_passenger_trips_upt'].isna().sum()

6

In [12]:
# Shape check: a left merge keeps every org row, so the row count should equal the number
# of filtered orgs as long as `upt` has at most one row per NTD ID.
df.shape

(160, 6)

In [13]:
# List the orgs whose NTD ID found no UPT row in the query result.
df[df['sum_unlinked_passenger_trips_upt'].isna()]

Unnamed: 0,Name,ntd_id_2022,Holiday Website Status,agency_name,sum_unlinked_passenger_trips_upt,_merge
30,City of El Segundo,99449,Off-Season,,,left_only
66,Susanville Indian Rancheria,99256,Missing,,,left_only
105,City of Duarte,90264,Current,,,left_only
106,City of Elk Grove,90205,Current,,,left_only
112,City of Sierra Madre,99447,Missing,,,left_only
124,City of Lawndale,90280,Current,,,left_only


In [14]:
# Preview the merged org / UPT frame.
df.head()

Unnamed: 0,Name,ntd_id_2022,Holiday Website Status,agency_name,sum_unlinked_passenger_trips_upt,_merge
0,Long Beach Transit,90023,Current,Long Beach Transit,17409861.0,both
1,City of Lynwood,90281,Missing,City of Lynwood,114161.0,both
2,City of Fresno,90027,Current,City of Fresno,7120464.0,both
3,City of Huntington Park,90267,Current,City of Huntington Park,143920.0,both
4,Glenn County,91088,Current,Glenn Transit Service,19210.0,both


In [15]:
# Average daily ridership per holiday-website status: total 2022 annual UPT divided by
# 365 and rounded to whole trips. Orgs without a UPT match add nothing to the totals,
# because sum() skips NaN.
(
    df.groupby('Holiday Website Status')[['sum_unlinked_passenger_trips_upt']]
    .sum()
    .div(365)
    .round()
    .astype(int)
)

Unnamed: 0_level_0,sum_unlinked_passenger_trips_upt
Holiday Website Status,Unnamed: 1_level_1
Current,1881743
Missing,23468
Off-Season,67
Old,571
