# To SQL

In [1]:
import os
os.environ["CALITP_BQ_MAX_BYTES"] = str(800_000_000_000) ## 800GB?

from calitp.tables import tbl

from calitp import query_sql, magics
import branca

import shared_utils

from siuba import *
import pandas as pd

import datetime as dt
import time
from zoneinfo import ZoneInfo

import importlib

import gcsfs
fs = gcsfs.GCSFileSystem()

from tqdm import tqdm_notebook
from tqdm.notebook import trange, tqdm

from IPython.display import display, Markdown

import utils



In [2]:
pd.set_option("display.max_columns", 100)


In [3]:
analysis_date_start = '2022-05-01'
analysis_date_end = '2022-05-31'

In [4]:
#function to change to SQL
def get_pct_ran_df(itp_id, list_of_dates, gtfs_daily, rt):
    
    pcts = []
    
    # loop through list of dates
    for single_date in list_of_dates:
        
        #filter for single day
        
        gtfs_daily2 = (gtfs_daily>>filter(_.service_date == single_date))
        rt2 = (rt>>filter(_.date == single_date))
        
        #outer join schedule and rt data 
        sched_rt_df = (pd.merge(gtfs_daily2, rt2, how='outer', on='trip_id', indicator='have_rt'))

        
        day_pct_ran = {}
        day_pct_ran['date'] = single_date
        if ((len(sched_rt_df))!=0):
            day_pct_ran['pct_trips_ran'] = ((len(sched_rt_df>>filter(_.have_rt=='both')))/(len(gtfs_daily2)))
        elif ((len(sched_rt_df))==0):
            day_pct_ran['pct_trips_ran'] = ''
        pct_ran = pd.DataFrame([day_pct_ran])
        
        # add columns with counts 
        pct_ran['n_have_rt'] = (len(sched_rt_df>>filter(_.have_rt=='both')))
        pct_ran['n_missing_rt'] = (len(sched_rt_df>>filter(_.have_rt=='right_only')))
        pct_ran['unmatched_rt'] = (len(sched_rt_df>>filter(_.have_rt=='left_only')))
        
        # add columns for number of unique trip_ids
        pct_ran['nunique_sched'] = (gtfs_daily2.trip_id.nunique())
        pct_ran['nunique_rt'] = (rt2.trip_id.nunique())

        pcts.append(pct_ran)                                                    
        #code help from: https://stackoverflow.com/questions/28669482/appending-pandas-dataframes-generated-in-a-for-loop
   
    #add each date together 
    pcts = pd.concat(pcts)
    
    #arrange by date
    pcts = pcts>>arrange(_.date)
    return pd.DataFrame(pcts)


In [5]:
%%sql

SELECT trip_id,
    calitp_itp_id,
    service_date AS date,
    is_in_service
FROM `cal-itp-data-infra.views.gtfs_schedule_fact_daily_trips`
WHERE (calitp_itp_id=300
    AND service_date >= '2022-05-01'
    AND service_date <= '2022-05-02'
    AND is_in_service = True)
LIMIT 1



Unnamed: 0,trip_id,calitp_itp_id,date,is_in_service
0,882447,300,2022-05-01,True


In [6]:
%%sql
WITH 
rt_trips AS(
SELECT * 
FROM `cal-itp-data-infra-staging.natalie_views.gtfs_rt_distinct_trips`
WHERE date BETWEEN '2022-05-01' AND '2022-05-02'
),
sched_trips AS(
SELECT trip_id,
    calitp_itp_id,
    service_date AS date,
    is_in_service
FROM `cal-itp-data-infra.views.gtfs_schedule_fact_daily_trips`
WHERE (calitp_itp_id=300
    AND service_date >= '2022-05-01'
    AND service_date <= '2022-05-02'
    AND is_in_service = True)
),

joined AS (
SELECT *, 
FROM sched_trips LEFT JOIN rt_trips on(
    sched_trips.trip_id = rt_trips.trip_id AND
    sched_trips.date = rt_trips.date AND
    sched_trips.calitp_itp_id = rt_trips.calitp_itp_id)
    )

SELECT *
FROM joined




Unnamed: 0,trip_id,calitp_itp_id,date,is_in_service,calitp_itp_id_1,calitp_url_number,date_1,trip_id_1,vehicle_id
0,883313,300,2022-05-02,True,300.0,0.0,2022-05-02,883313,1562
1,882460,300,2022-05-01,True,300.0,0.0,2022-05-01,882460,2114
2,882619,300,2022-05-02,True,300.0,0.0,2022-05-02,882619,1502
3,883665,300,2022-05-02,True,,,,,
4,882605,300,2022-05-02,True,300.0,0.0,2022-05-02,882605,1354
...,...,...,...,...,...,...,...,...,...
2285,881852,300,2022-05-02,True,300.0,0.0,2022-05-02,881852,1331
2286,881806,300,2022-05-01,True,300.0,0.0,2022-05-01,881806,1346
2287,881784,300,2022-05-01,True,300.0,0.0,2022-05-01,881784,2109
2288,881800,300,2022-05-01,True,300.0,0.0,2022-05-01,881800,1722


In [7]:
%%sql
#from eric

WITH 
rt_trips AS(
SELECT date AS service_date, trip_id AS vp_trip_id, calitp_itp_id, calitp_url_number
FROM `cal-itp-data-infra-staging.natalie_views.gtfs_rt_distinct_trips`
WHERE date BETWEEN '2022-05-01' AND '2022-05-31'
),
sched_trips AS(
SELECT trip_id, service_date, calitp_itp_id, calitp_url_number
FROM `cal-itp-data-infra.views.gtfs_schedule_fact_daily_trips`
WHERE (calitp_itp_id=300
    AND service_date BETWEEN '2022-05-01' AND '2022-05-31'
    AND is_in_service = True)
),
rt_sched_joined AS(
  SELECT
  T1.calitp_itp_id,
  T1.calitp_url_number,
  T1.service_date,
    COUNT(T1.trip_id) AS num_sched,
    COUNT(T2.vp_trip_id) AS num_rt_vp,
    -- num_rt/num_sched AS pct_ran
    
  FROM sched_trips AS T1
  LEFT JOIN rt_trips AS T2
    ON
      T1.trip_id = T2.vp_trip_id
      AND T1.calitp_itp_id = T2.calitp_itp_id
      AND T1.calitp_url_number = T2.calitp_url_number
      AND T1.service_date = T2.service_date
  GROUP BY 1, 2, 3
),
with_percent AS(
  SELECT *,
  num_rt_vp/num_sched AS pct_ran
  FROM rt_sched_joined
)
SELECT * 
FROM with_percent



Unnamed: 0,calitp_itp_id,calitp_url_number,service_date,num_sched,num_rt_vp,pct_ran
0,300,0,2022-05-20,1564,1466,0.93734
1,300,0,2022-05-12,1619,1483,0.915998
2,300,0,2022-05-06,1561,1392,0.891736
3,300,0,2022-05-19,1625,1476,0.908308
4,300,0,2022-05-02,1519,1069,0.703752
5,300,0,2022-05-11,1617,1482,0.916512
6,300,0,2022-05-14,874,604,0.691076
7,300,0,2022-05-17,1621,1478,0.911783
8,300,0,2022-05-03,1624,1473,0.90702
9,300,0,2022-05-22,770,543,0.705195
