In [1]:
!pip install sqlalchemy-bigquery

Collecting sqlalchemy-bigquery
  Using cached sqlalchemy_bigquery-1.4.3-py2.py3-none-any.whl (32 kB)
Collecting sqlalchemy<=1.4.27,>=1.2.0
  Using cached SQLAlchemy-1.4.27-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
Installing collected packages: sqlalchemy, sqlalchemy-bigquery
  Attempting uninstall: sqlalchemy
    Found existing installation: SQLAlchemy 1.4.32
    Uninstalling SQLAlchemy-1.4.32:
      Successfully uninstalled SQLAlchemy-1.4.32
Successfully installed sqlalchemy-1.4.27 sqlalchemy-bigquery-1.4.3


In [2]:
# Imports and notebook-wide configuration.
import geopandas as gpd
import pandas as pd
import glob
import os

# Set before `calitp` is imported below.
# NOTE(review): presumably caps the bytes a single BigQuery query may scan
# (100 GB here) and is read by calitp -- confirm against the calitp docs.
os.environ["CALITP_BQ_MAX_BYTES"] = str(100_000_000_000)

from calitp.tables import tbl
from calitp import query_sql
# NOTE(review): star import; presumably the source of `_`, `filter`, `select`,
# `distinct`, `inner_join` and `collect` used in the query cells below. If so,
# `filter` shadows the Python builtin for the rest of the notebook.
from siuba import *

# Project-local helpers; later cells use prep_data.SELECTED_DATE and
# prep_data.attach_route_name.
import prep_data2 as prep_data

# Directory holding the local parquet inputs; used as an f-string prefix,
# so the trailing slash matters.
DATA_PATH = "./data/test/"

interfere with sqlalchemy_bigquery.
pybigquery should be uninstalled.
  return _bootstrap._gcd_import(name[level:], package, level)
E0406 17:34:31.874395198     319 fork_posix.cc:70]           Fork support is only compatible with the epoll1 and poll polling strategies


In [3]:
from datetime import datetime

# Start of the wall-clock timer for the load step.
time0 = datetime.now()

# Geospatial input: routes is read with geopandas, unlike the tables below.
routes = gpd.read_parquet(f"{DATA_PATH}routes.parquet")

# Tabular inputs, all read from the local parquet cache under DATA_PATH.
latest_itp_id = pd.read_parquet(f"{DATA_PATH}latest_itp_id.parquet")
route_info = pd.read_parquet(f"{DATA_PATH}route_info.parquet")
trips = pd.read_parquet(f"{DATA_PATH}trips.parquet")
stops = pd.read_parquet(f"{DATA_PATH}stops.parquet")

# Report how long the five reads took.
time1 = datetime.now()
print(f"Read in data: {time1-time0}")

Read in data: 0:00:02.251750


In [4]:
# Attach route geometry to every trip, keyed on agency + shape_id.
# Reading the `_merge` indicator:
#   - left_only: the trip's shape_id was not found in shapes.txt (no row in
#     `routes`). Open question: can these be assembled from the stop sequence?
#   - right_only: a shape in `routes` that no trip refers to. With how="left"
#     these rows are never kept, so this count is always 0.
# Open question: should how="left" stay?
# validate="m:1" makes pandas raise if `routes` repeats a
# (calitp_itp_id, shape_id) pair.
routes1 = trips.merge(
    routes,
    how="left",
    on=["calitp_itp_id", "shape_id"],
    validate="m:1",
    indicator=True,
)


In [5]:
# Tally how many trips found a shape ("both") versus not ("left_only").
routes1["_merge"].value_counts()

both          7482
left_only       51
right_only       0
Name: _merge, dtype: int64

In [6]:
# Trips whose shape had no match in `routes`. The geometry column is empty for
# these rows and the merge indicator is no longer needed, so both are dropped.
missing_shapes = (
    routes1.loc[routes1["_merge"] == "left_only"]
    .drop(columns=["geometry", "_merge"])
    .reset_index(drop=True)
)

missing_shapes.head(n=2)

Unnamed: 0,calitp_itp_id,route_id,shape_id
0,323,Ventura County Line,
1,13,37329,


In [7]:
# Columns selected from dim_trips alongside trip_key.
trip_cols = ["calitp_itp_id", "route_id", "shape_id"]

# Query over the dim_trips view, restricted to the agencies and shape_ids seen
# in missing_shapes. It is not collected here; it is passed into the join below.
dim_trips = (tbl.views.gtfs_schedule_dim_trips()
             # filter first to just the smaller set of IDs in missing_shapes
             >> filter(_.calitp_itp_id.isin(missing_shapes.calitp_itp_id))
             # Now find those shape_ids and trips associated
             # NOTE(review): the two isin filters are independent, so a listed
             # agency paired with a shape_id listed for a different agency also
             # passes. The displayed head of missing_shapes shows blank shape_id
             # values, so nulls may be in this list -- confirm how they render.
             >> filter(_.shape_id.isin(missing_shapes.shape_id))
             >> select(*trip_cols, _.trip_key)
             >> distinct()
            )

# In-service trips on prep_data.SELECTED_DATE from fact_daily_trips, joined to
# the dim_trips query on trip_key. collect() ends the pipeline.
# NOTE(review): collect() presumably executes the query and returns a pandas
# DataFrame -- the next cells treat missing_trips as one.
missing_trips = (
    tbl.views.gtfs_schedule_fact_daily_trips()
    >> filter(_.service_date == prep_data.SELECTED_DATE, 
           _.is_in_service==True)
    >> select(_.trip_key, _.trip_id)
    >> inner_join(_, dim_trips, on = "trip_key")
    >> distinct()
    >> collect()
)

In [8]:
# Inspect the collected trips tied to the agencies/shape_ids in missing_shapes.
# NOTE(review): this displays the whole frame; switch to .head() if it grows.
missing_trips

Unnamed: 0,trip_key,trip_id,calitp_itp_id,route_id,shape_id
0,-2358869420915140874,t4E8-sl2-p11E-rC4,203,24,126_shp
1,-1930744495137898147,t4E6-sl2-p11E-r9E,203,24,126_shp


In [9]:
# Several trips can share one (agency, route, shape). Sort deterministically,
# with trip_id as the tie-breaker, and keep the first trip of each group as its
# representative.
group_cols = ["calitp_itp_id", "route_id", "shape_id"]
missing_trips2 = (
    missing_trips
    .sort_values(by=[*group_cols, "trip_id"])
    .drop_duplicates(subset=group_cols, keep="first")
    .reset_index(drop=True)
)

In [None]:
# Number of representative trips kept (one per agency/route/shape group).
missing_trips2.shape[0]

In [None]:
# Stop-level rows, with stop coordinates, for the trips kept in missing_trips2.
stop_info_trips = (
    tbl.views.gtfs_schedule_dim_stop_times()
    # Restrict to the agencies and trip_ids present in missing_trips2.
    # NOTE(review): the two isin filters are independent, so a trip_id reused
    # by another listed agency would also pass.
    >> filter(_.calitp_itp_id.isin(missing_trips2.calitp_itp_id))
    >> filter(_.trip_id.isin(missing_trips2.trip_id))
    >> distinct()
    # Bring in stop attributes from dim_stops, matched on agency + stop_id.
    >> inner_join(_,
                  tbl.views.gtfs_schedule_dim_stops(), 
                  on = ["calitp_itp_id", "stop_id"])
    >> select(_.calitp_itp_id, _.trip_id, 
              _.stop_id, _.stop_sequence,
              _.stop_lon, _.stop_lat)
    >> distinct()
    >> collect()
    # Want to merge back route_id on, but need to collect first
    # NOTE(review): no `on=` is given; presumably this joins on the columns the
    # two frames share (calitp_itp_id, trip_id) -- confirm, and consider
    # passing `on` explicitly.
    >> inner_join(_, missing_trips2)
)

In [None]:
# Peek at the first two stop rows to check the joined columns.
stop_info_trips.head(n=2)

In [None]:
# The query can return more than one point for the same (agency, trip, stop);
# the cause is not yet understood. Keep one row per key after a deterministic
# sort, and add calitp_url_number=0 for the geometry helper called next.
# Note: group_cols is rebound here to a different key than the trip-level cell.
group_cols = ["calitp_itp_id", "trip_id", "stop_id"]
stop_info_trips = (
    stop_info_trips
    .sort_values(by=group_cols)
    .drop_duplicates(subset=group_cols, keep="first")
    .reset_index(drop=True)
    .assign(calitp_url_number=0)
)

In [None]:
# Row count after de-duplicating on (agency, trip, stop).
stop_info_trips.shape[0]

In [None]:
# NOTE(review): imported mid-notebook; presumably a project-local module.
# Consider moving this to the import cell at the top.
import utils

# Build geometry for the shapes that were missing, from the stop-level rows.
# NOTE(review): the helper is not visible here; presumably it strings each
# trip's stops into a line by stop_sequence -- confirm in utils.
missing_routes = utils.make_routes_line_geom_for_missing_shapes(stop_info_trips)

In [None]:
# Look at the input frame again after it has been passed through the helper.
stop_info_trips.head(n=5)

In [None]:
# How many route geometries were rebuilt, followed by a sample of them.
print(len(missing_routes))
missing_routes.head(n=5)

In [None]:
# route_id is lost when the data passes through
# make_routes_line_geom_for_missing_shapes, so re-attach it from the stop-level
# table. calitp_url_number is dropped on the way.
# validate="1:1" makes pandas raise if either side repeats a
# (calitp_itp_id, shape_id) pair, e.g. when one shape serves several routes.
missing_routes2 = pd.merge(
    left=missing_routes.drop(columns="calitp_url_number"),
    right=stop_info_trips[["calitp_itp_id", "shape_id", "route_id"]].drop_duplicates(),
    how="inner",
    on=["calitp_itp_id", "shape_id"],
    validate="1:1",
)

In [None]:
# Attach route names to the rebuilt routes using route_info.
# NOTE(review): attach_route_name lives in prep_data2 and is not visible here;
# confirm its join keys and output columns there.
routes_part2 = prep_data.attach_route_name(missing_routes2, route_info)
routes_part2.head()