In [1]:
!pip install sqlalchemy-bigquery



In [2]:
import geopandas as gpd
import pandas as pd
import glob
import os

os.environ["CALITP_BQ_MAX_BYTES"] = str(100_000_000_000)

from calitp.tables import tbl
from calitp import query_sql
from siuba import *

import prep_data2 as prep_data

DATA_PATH = "./data/test/"

interfere with sqlalchemy_bigquery.
pybigquery should be uninstalled.
  return _bootstrap._gcd_import(name[level:], package, level)
E0406 00:18:24.016299705     241 fork_posix.cc:70]           Fork support is only compatible with the epoll1 and poll polling strategies


In [3]:
from datetime import datetime

time0 = datetime.now()

# Read in local parquets
stops = pd.read_parquet(f"{DATA_PATH}stops.parquet")
trips = pd.read_parquet(f"{DATA_PATH}trips.parquet")
route_info = pd.read_parquet(f"{DATA_PATH}route_info.parquet")
routes = gpd.read_parquet(f"{DATA_PATH}routes.parquet")
latest_itp_id = pd.read_parquet(f"{DATA_PATH}latest_itp_id.parquet")

time1 = datetime.now()
print(f"Read in data: {time1-time0}")

Read in data: 0:00:01.541914


In [4]:
# Left only means in trips, but shape_id not found in shapes.txt
# right only means in routes, but no route that has that shape_id 
# We probably should keep how = "left"?
# left only means we can assemble from stop sequence?
routes1 = pd.merge(
        trips,
        routes,
        on = ["calitp_itp_id", "shape_id"],
        how = "left",
        validate = "m:1",
        indicator=True
    )


In [5]:
routes1._merge.value_counts()

both          7832
left_only      150
right_only       0
Name: _merge, dtype: int64

In [6]:
missing_shapes = (routes1[routes1._merge=="left_only"]
      .drop(columns = ["geometry", "_merge"])
      .reset_index(drop=True)
     )

missing_shapes.head(2)

Unnamed: 0,calitp_itp_id,route_id,shape_id
0,301,8,1232_shp
1,301,6,1205_shp


In [7]:
SELECTED_DATE = "2022-3-15"

trip_cols = ["calitp_itp_id", "route_id", "shape_id"]

dim_trips = (tbl.views.gtfs_schedule_dim_trips()
             # filter first to just the smaller set of IDs in missing_shapes
             >> filter(_.calitp_itp_id.isin(missing_shapes.calitp_itp_id))
             # Now find those shape_ids and trips associated
             >> filter(_.shape_id.isin(missing_shapes.shape_id))
             >> select(*trip_cols, _.trip_key)
             >> distinct()
            )

missing_trips = (
    tbl.views.gtfs_schedule_fact_daily_trips()
    >> filter(_.service_date == SELECTED_DATE, 
           _.is_in_service==True)
    >> select(_.trip_key, _.trip_id)
    >> inner_join(_, dim_trips, on = "trip_key")
    >> distinct()
    >> collect()
)


In [8]:
# Since there are multiple trips, we'll sort the same way, and keep the first one
group_cols = ["calitp_itp_id", "route_id", "shape_id"]
missing_trips2 = (missing_trips.sort_values(group_cols + ["trip_id"])
                  .drop_duplicates(subset=group_cols)
                  .reset_index(drop=True)
)

In [9]:
len(missing_trips2)

104

In [10]:
stop_info_trips = (
    tbl.views.gtfs_schedule_dim_stop_times()
    >> filter(_.calitp_itp_id.isin(missing_trips2.calitp_itp_id))
    >> filter(_.trip_id.isin(missing_trips2.trip_id))
    >> distinct()
    >> inner_join(_,
                  tbl.views.gtfs_schedule_dim_stops(), 
                  on = ["calitp_itp_id", "stop_id"])
    >> select(_.calitp_itp_id, _.trip_id, 
              _.stop_id, _.stop_sequence,
              _.stop_lon, _.stop_lat)
    >> distinct()
    >> collect()
    # Want to merge back route_id on, but need to collect first
    >> inner_join(_, missing_trips2)
)

In [11]:
stop_info_trips.head(2)

Unnamed: 0,calitp_itp_id,trip_id,stop_id,stop_sequence,stop_lon,stop_lat,trip_key,route_id,shape_id
0,296,3762615-202220D-vs20222D-Weekday-09,1535R,28,-122.084068,37.071644,-8946882807225820515,35,350082
1,296,3762615-202220D-vs20222D-Weekday-09,1239R,42,-122.1409,37.135456,-8946882807225820515,35,350082


In [12]:
# Somehow, getting back some multiple points for same trip_id, stop_id
group_cols = ["calitp_itp_id", "trip_id", "stop_id"]
stop_info_trips = (stop_info_trips.sort_values(group_cols)
                   .drop_duplicates(subset=group_cols)
                   .reset_index(drop=True)
                   .assign(calitp_url_number=0)
                  )

In [13]:
len(stop_info_trips)

9471

In [14]:
import utils

missing_routes = utils.make_routes_line_geom_for_missing_shapes(stop_info_trips)

In [15]:
stop_info_trips.head()

Unnamed: 0,calitp_itp_id,trip_id,stop_id,stop_sequence,stop_lon,stop_lat,trip_key,route_id,shape_id,calitp_url_number,geometry
0,4,10001020,1430,5,-122.078886,37.674847,2608222843436200649,18,shp-18-18,0,POINT (-122.07889 37.67485)
1,4,10001020,1460,6,-122.077416,37.676528,2608222843436200649,18,shp-18-18,0,POINT (-122.07742 37.67653)
2,4,10001020,1461,7,-122.075464,37.677914,2608222843436200649,18,shp-18-18,0,POINT (-122.07546 37.67791)
3,4,10001020,1463,8,-122.07415,37.678469,2608222843436200649,18,shp-18-18,0,POINT (-122.07415 37.67847)
4,4,10001020,1465,9,-122.072285,37.679264,2608222843436200649,18,shp-18-18,0,POINT (-122.07228 37.67926)


In [16]:
print(len(missing_routes))
missing_routes.head()

104


Unnamed: 0,calitp_itp_id,calitp_url_number,shape_id,geometry
0,4,0,shp-18-18,"LINESTRING (-122.08695 37.66989, -122.26581 37..."
1,4,0,shp-21-13,"LINESTRING (-122.08709 37.67010, -122.21236 37..."
2,4,0,shp-215-67,"LINESTRING (-121.97561 37.55791, -122.29511 37..."
3,4,0,shp-239-57,"LINESTRING (-122.08650 37.66972, -121.97549 37..."
4,4,0,shp-28-09,"LINESTRING (-122.08695 37.66989, -122.08695 37..."


In [17]:
# Merge route_id back in, which is lost when it passes through make_routes_line_geom_for_missing_shapes
# Also, get rid of calitp_url_number
missing_routes2 = pd.merge(
    missing_routes.drop(columns="calitp_url_number"),
    stop_info_trips[["calitp_itp_id", "shape_id", "route_id"]].drop_duplicates(),
    on = ["calitp_itp_id", "shape_id"],
    how = "inner",
    validate = "1:1",
)

In [18]:
routes_part2 = prep_data.attach_route_name(missing_routes2, route_info)
routes_part2.head()

Unnamed: 0,calitp_itp_id,shape_id,geometry,route_id,route_short_name,route_long_name
0,4,shp-18-18,"LINESTRING (-122.08695 37.66989, -122.26581 37...",18,18,Solano - Shattuck - MLK Jr.
1,4,shp-21-13,"LINESTRING (-122.08709 37.67010, -122.21236 37...",21,21,Dimond - Fruitvale - Bay Farm
2,4,shp-215-67,"LINESTRING (-121.97561 37.55791, -122.29511 37...",215,215,Osgood - Warm Springs - Landing
3,4,shp-239-57,"LINESTRING (-122.08650 37.66972, -121.97549 37...",239,239,Grimmer - Warm Springs
4,4,shp-28-09,"LINESTRING (-122.08695 37.66989, -122.08695 37...",28,28,Alvarado - Castro Valley - B St.
