# Trips full table v1 vs v2

Use this notebook to figure out how exclusion should be handled between v1 and v2.

Is it simply a matter of excluding the 511 regional feed?

In [1]:
import os
os.environ["CALITP_BQ_MAX_BYTES"] = str(2_000_000_000_000)

import geopandas as gpd
import pandas as pd
import sys

from loguru import logger
from calitp.tables import tbls
from siuba import *

import gtfs_utils_v2
from shared_utils import rt_dates



In [10]:
# Single analysis date, looked up by its "oct2022" key in rt_dates.DATES.
analysis_date = rt_dates.DATES["oct2022"]

In [6]:
# Lazy lookup table built from dim_gtfs_datasets: keep only rows whose `data`
# column is "GTFS Schedule", expose `key` as `gtfs_dataset_key`, and reduce to
# distinct (gtfs_dataset_key, name, regional_feed_type) rows.
schedule_datasets = (
    tbls.mart_transit_database.dim_gtfs_datasets()
    >> filter(_["data"] == "GTFS Schedule")
)

dim_gtfs_datasets = (
    schedule_datasets
    >> rename(gtfs_dataset_key="key")
    >> select(_.gtfs_dataset_key, _.name, _.regional_feed_type)
    >> distinct()
)

In [8]:
# Inspect the lazy query: the cell output shows the compiled SQL followed by
# the table's preview rows.
print(dim_gtfs_datasets >> show_query())

SELECT DISTINCT `mart_transit_database.dim_gtfs_datasets_1`.`key` AS `gtfs_dataset_key`, `mart_transit_database.dim_gtfs_datasets_1`.`name`, `mart_transit_database.dim_gtfs_datasets_1`.`regional_feed_type` 
FROM `mart_transit_database.dim_gtfs_datasets` AS `mart_transit_database.dim_gtfs_datasets_1` 
WHERE `mart_transit_database.dim_gtfs_datasets_1`.`data` = 'GTFS Schedule'
# Source: lazy query
# DB Conn: Engine(bigquery://cal-itp-data-infra/?maximum_bytes_billed=2000000000000)
# Preview:
    gtfs_dataset_key                    name       regional_feed_type
0  recsBJKl0jkyqfLk4  Thousand Oaks Schedule  Regional Precursor Feed
1  reciKWkJ953NSPTtj        G Trans Schedule                     None
2  reczps9Ejby9P7Njr  SunLine Avail Schedule                     None
3  recM9LZoHwKzjuhPM      La Puente Schedule                     None
4  rec3FmQFpaxdDOpwF        Burbank Schedule                     None
# .. may have more rows


In [11]:
# Rows of fct_daily_schedule_feeds for analysis_date, inner-joined to
# dim_gtfs_datasets on gtfs_dataset_key so each row also carries the dataset
# name and regional feed type.
feeds_on_date = (
    tbls.mart_gtfs.fct_daily_schedule_feeds()
    >> filter(_.date == analysis_date)
)

fact_feeds = (
    feeds_on_date
    >> inner_join(_, dim_gtfs_datasets, on="gtfs_dataset_key")
)

In [12]:
# Print the compiled SQL for the date filter + inner join so the generated
# query can be checked by eye.
print(fact_feeds >> show_query())

SELECT `anon_1`.`key`, `anon_1`.`date`, `anon_1`.`feed_key`, `anon_1`.`base64_url`, `anon_1`.`gtfs_dataset_key`, `anon_1`.`is_future`, `anon_2`.`regional_feed_type`, `anon_2`.`name` 
FROM (SELECT `mart_gtfs.fct_daily_schedule_feeds_1`.`key` AS `key`, `mart_gtfs.fct_daily_schedule_feeds_1`.`date` AS `date`, `mart_gtfs.fct_daily_schedule_feeds_1`.`feed_key` AS `feed_key`, `mart_gtfs.fct_daily_schedule_feeds_1`.`base64_url` AS `base64_url`, `mart_gtfs.fct_daily_schedule_feeds_1`.`gtfs_dataset_key` AS `gtfs_dataset_key`, `mart_gtfs.fct_daily_schedule_feeds_1`.`is_future` AS `is_future` 
FROM `mart_gtfs.fct_daily_schedule_feeds` AS `mart_gtfs.fct_daily_schedule_feeds_1` 
WHERE `mart_gtfs.fct_daily_schedule_feeds_1`.`date` = '2022-10-12') AS `anon_1` JOIN (SELECT DISTINCT `mart_transit_database.dim_gtfs_datasets_1`.`key` AS `gtfs_dataset_key`, `mart_transit_database.dim_gtfs_datasets_1`.`name` AS `name`, `mart_transit_database.dim_gtfs_datasets_1`.`regional_feed_type` AS `regional_feed_type`

In [2]:
# Analysis dates to compare, looked up by month key in rt_dates.DATES
# (order matters: later cells index into this list by position).
dates = [rt_dates.DATES[month_key] for month_key in ("oct2022", "nov2022")]

['2022-10-12', '2022-11-16']

In [3]:
# Make sure the local cache directory exists before writing: to_parquet does
# not create missing parent directories, so on a fresh checkout the first
# write would otherwise fail.
os.makedirs("./data", exist_ok=True)

# For each analysis date, fetch the feed-to-organization table as a DataFrame
# and cache it as parquet so later cells can reload it from disk.
for d in dates:

    daily_feeds = gtfs_utils_v2.daily_feed_to_organization(
        selected_date = d,
        get_df = True
    )

    daily_feeds.to_parquet(f"./data/daily_feeds_orgs_{d}.parquet")

In [4]:
# Reload the cached feed-to-organization table for the first analysis date and
# collect its unique feed keys; these are the feeds to keep when pulling trips.
daily_feeds = pd.read_parquet(
    f"./data/daily_feeds_orgs_{dates[0]}.parquet"
)
include_feeds = daily_feeds["feed_key"].unique().tolist()


In [6]:
# Pull trips for the first analysis date, restricted to include_feeds, as a
# DataFrame, then cache the result locally as parquet.
trips = gtfs_utils_v2.get_trips(
    selected_date=dates[0], subset_feeds=include_feeds, get_df=True
)
trips.to_parquet(f"./data/trips_{dates[0]}.parquet")

In [7]:
# Same steps for the second analysis date: reload the cached
# feed-to-organization table, take its unique feed keys, pull trips for those
# feeds as a DataFrame, and cache the result as parquet.
daily_feeds = pd.read_parquet(
    f"./data/daily_feeds_orgs_{dates[1]}.parquet"
)
include_feeds = daily_feeds["feed_key"].unique().tolist()

trips = gtfs_utils_v2.get_trips(
    selected_date=dates[1], subset_feeds=include_feeds, get_df=True
)
trips.to_parquet(f"./data/trips_{dates[1]}.parquet")