In [1]:
import os
# Set before the calitp imports below; presumably read by calitp as the cap on
# BigQuery bytes scanned per query (100 GB here) — TODO confirm.
os.environ["CALITP_BQ_MAX_BYTES"] = str(100_000_000_000)

from calitp.tables import tbl
from calitp import query_sql
import calitp.magics
import branca

import shared_utils
# NOTE(review): wildcard import — helpers used later in this notebook
# (e.g. check_cached, convert_ts, GCS_FILE_PATH) are not defined here and
# presumably come from this module; confirm and consider explicit imports.
from utils import *

from siuba import *
import pandas as pd
import geopandas as gpd
import shapely

import datetime as dt
import time
from zoneinfo import ZoneInfo

import rt_analysis as rt
import importlib

import gcsfs
# Google Cloud Storage filesystem handle (uses default credentials).
fs = gcsfs.GCSFileSystem()

# NOTE(review): `tqdm_notebook` and `trange` are not used in the visible cells.
from tqdm import tqdm_notebook
from tqdm.notebook import trange, tqdm



In [2]:
# new = query_sql('''
# SELECT * FROM `cal-itp-data-infra.gtfs_rt.vehicle_positions`
# WHERE calitp_itp_id = 170 LIMIT 1000
# ''')

In [3]:
# %%sql -o new
# SELECT calitp_itp_id, calitp_url_number,
# header.timestamp AS header_timestamp, vehicle.timestamp AS vehicle_timestamp,
# vehicle.vehicle.label AS entity_id, vehicle.vehicle.id AS vehicle_id,
# vehicle.trip.tripId AS vehicle_trip_id, vehicle.position.longitude AS vehicle_longitude,
# vehicle.position.latitude AS vehicle_latitude
# FROM `cal-itp-data-infra.gtfs_rt.vehicle_positions`
# WHERE calitp_itp_id = 170 AND vehicle.timestamp > 1644393600 AND vehicle.timestamp < 1644487200

In [4]:
# Single service date analysed throughout this notebook.
analysis_date = dt.date(2022, 2, 9)

In [5]:
def get_vehicle_positions(itp_id, analysis_date):
    '''
    Fetch GTFS-RT vehicle positions for a single operator on a single date.

    itp_id: an itp_id (string or integer)
    analysis_date: datetime.date

    Returns a pandas DataFrame with columns calitp_itp_id, calitp_url_number,
    header_timestamp, vehicle_timestamp, entity_id, vehicle_id, trip_id,
    vehicle_longitude and vehicle_latitude. When a cached parquet exists it is
    returned as-is; otherwise the warehouse is queried, the result is
    de-duplicated on (trip_id, vehicle_timestamp), rows without a vehicle
    timestamp are dropped, both timestamp columns are passed through
    convert_ts, and the result is written to the cached_views folder.

    Raises AssertionError when the query yields no usable rows.

    Interim function for getting complete vehicle positions data for a single
    operator on a single date of interest. To be replaced as RT views are
    implemented...

    The query window runs from midnight on analysis_date to 2AM on the
    following day: a temporary fix to balance capturing trips crossing
    midnight with avoiding duplicates...
    '''
    date_str = analysis_date.strftime('%Y-%m-%d')
    filename = f'vp_{itp_id}_{date_str}.parquet'

    # Serve from the parquet cache when available; no query needed.
    path = check_cached(filename)
    if path:
        print('found parquet')
        return pd.read_parquet(path)

    # Query window: [00:00 on analysis_date, 02:00 on the next day), exclusive bounds.
    # NOTE(review): `start` is a naive datetime, so .timestamp() interprets it in
    # the machine's local timezone — confirm the kernel runs in the intended zone.
    start = dt.datetime.combine(analysis_date, dt.time(0))
    end = start + dt.timedelta(days = 1, hours = 2)
    start_ts = int(start.timestamp())
    end_ts = int(end.timestamp())

    df = query_sql(f"""
        SELECT calitp_itp_id, calitp_url_number,
        header.timestamp AS header_timestamp, vehicle.timestamp AS vehicle_timestamp,
        vehicle.vehicle.label AS entity_id, vehicle.vehicle.id AS vehicle_id,
        vehicle.trip.tripId AS trip_id, vehicle.position.longitude AS vehicle_longitude,
        vehicle.position.latitude AS vehicle_latitude
        FROM `cal-itp-data-infra.gtfs_rt.vehicle_positions`
        WHERE calitp_itp_id = {itp_id} AND vehicle.timestamp > {start_ts} AND vehicle.timestamp < {end_ts}
        """)

    # The query aliases tripId as `trip_id` (not `vehicle_trip_id`), so
    # de-duplicate on the column that actually exists in the result.
    df = (df.drop_duplicates(subset=['trip_id', 'vehicle_timestamp'])
            .reset_index(drop=True))
    df = df.dropna(subset=['vehicle_timestamp'])
    assert not df.empty, f'no vehicle positions data found for {date_str}'

    # Convert the raw feed timestamps with the project helper.
    df = df.assign(
        vehicle_timestamp = df['vehicle_timestamp'].apply(convert_ts),
        header_timestamp = df['header_timestamp'].apply(convert_ts),
    )

    # Cache the cleaned result so subsequent calls skip the query.
    df.to_parquet(f'{GCS_FILE_PATH}cached_views/{filename}')
    return df

In [6]:
# lbt_test = get_vehicle_positions(170, analysis_date)

In [7]:
# Bare notebook progress bar (no iterable yet); the commented-out
# rt.OperatorDayAnalysis call in this notebook takes it as an argument.
pbar = tqdm()

0it [00:00, ?it/s]

In [9]:
# lbt = rt.OperatorDayAnalysis(170, analysis_date, pbar)

In [11]:
# Vehicle positions for ITP ID 170 on the analysis date; read from the
# parquet cache when one exists, otherwise queried and then cached.
lbt_vp = get_vehicle_positions(170, analysis_date)

found parquet


In [12]:
# Display the loaded vehicle positions (pandas truncates the middle rows).
lbt_vp

Unnamed: 0,calitp_itp_id,calitp_url_number,header_timestamp,vehicle_timestamp,entity_id,vehicle_id,trip_id,vehicle_longitude,vehicle_latitude
0,170,0,2022-02-09 11:45:43,2022-02-09 11:44:45,2507,,9370795,-118.122270,33.782166
1,170,0,2022-02-09 07:15:07,2022-02-09 07:13:42,2423,,9371534,-118.172030,33.782543
2,170,0,2022-02-09 07:15:07,2022-02-09 07:14:11,2709,,9370822,-118.122210,33.782140
3,170,0,2022-02-09 07:15:07,2022-02-09 07:14:01,2915,,9371535,-118.192520,33.781410
4,170,0,2022-02-09 07:15:07,2022-02-09 07:14:04,2922,,9371542,-118.200630,33.770638
...,...,...,...,...,...,...,...,...,...
118604,170,0,2022-02-09 23:00:27,2022-02-09 23:00:00,2918,,9369882,-118.114760,33.781967
118605,170,0,2022-02-09 19:25:23,2022-02-09 19:24:44,2918,,9369969,-118.176506,33.775425
118606,170,0,2022-02-09 20:16:54,2022-02-09 20:16:05,2918,,9369862,-118.119446,33.782080
118607,170,0,2022-02-09 19:05:52,2022-02-09 19:05:01,2918,,9369969,-118.115220,33.778023
