# Create routes and bus stops shapefiles

Create 2 datasets to upload to ArcGIS:
1. Every stop, along with the routes that serve that stop.
2. Every route, with a line representing the route, taken from `shapes.txt` where available or otherwise built from `stops.txt`.

In [None]:
import geopandas as gpd
import pandas as pd
import os

from shared_utils import geography_utils

# Store 100_000_000_000 (as a string) in the CALITP_BQ_MAX_BYTES environment variable.
# NOTE(review): presumably the byte cap for BigQuery queries issued through `calitp`;
# it is set before the `calitp` imports below -- confirm before reordering.
os.environ["CALITP_BQ_MAX_BYTES"] = str(100_000_000_000)
# Cap the number of rows pandas prints when displaying a DataFrame at 20.
pd.set_option("display.max_rows", 20)

from calitp.tables import tbl
from calitp import query_sql
from siuba import *

In [None]:
# Disabled code, kept as a string literal so it does not run.
# When enabled it queries the gtfs_schedule stops, trips, routes and agency tables
# and caches the selected, de-duplicated columns as local parquet files
# (./stops.parquet, ./trips.parquet, ./route_info.parquet, ./agencies.parquet).
'''
stops = (
    tbl.gtfs_schedule.stops()
    >> select(_.calitp_itp_id, _.stop_id, 
              _.stop_lat, _.stop_lon, 
              _.stop_name, _.stop_code
             )
    >> distinct()
    >> collect()
).to_parquet("./stops.parquet")


trips = (
    tbl.gtfs_schedule.trips()
    >> select(_.calitp_itp_id, _.route_id, _.shape_id)
    >> distinct()
    >> collect()
).to_parquet("./trips.parquet")


route_info = (
    tbl.gtfs_schedule.routes()
    # NB/SB may share same route_id, but different short/long names
    >> select(_.calitp_itp_id, _.route_id, 
              _.route_short_name, _.route_long_name)
    >> distinct()
    >> collect()
).to_parquet("./route_info.parquet")


agencies = (
    tbl.gtfs_schedule.agency()
    >> select(_.calitp_itp_id, _.agency_id, _.agency_name)
    >> distinct()
    >> collect()
).to_parquet("./agencies.parquet")
'''

# Load the cached tabular extracts from local parquet files, in the same
# order as the names on the left-hand side.
stops, trips, route_info, agencies = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "./stops.parquet",
        "./trips.parquet",
        "./route_info.parquet",
        "./agencies.parquet",
    )
)
# The routes file is read with geopandas rather than pandas.
routes = gpd.read_parquet("./routes.parquet")

## Routes

Create a routes shapefile with line geometry, each row representing a route for an operator.

* Link `shape_id` column back to route identifiers (route_id, route_name).
* Link `calitp_itp_id` back to operator name.

Traffic Signals - Arc gdb (geodatabase) with all traffic signals maintained by Caltrans. They want to know which signals should have preemption, and what percentage of the signals that should have preemption already have it enabled.

### Add operator-routes that exist in `shapes.txt`

In [None]:
# Helper that attaches route names to any frame keyed by operator and route_id
def attach_route_name(df, route_info_df):
    """Attach route name columns from `route_info_df` to `df`.

    The two frames are inner-joined on `calitp_itp_id` and `route_id`, so
    rows without a match on the other side are dropped.

    Parameters
    ----------
    df : DataFrame
        Must contain `calitp_itp_id` and `route_id`.
    route_info_df : DataFrame
        Must contain the same two key columns plus the columns to attach.

    Returns
    -------
    DataFrame
        The merged frame; a key that matches several rows in
        `route_info_df` yields one output row per match.
    """
    join_keys = ["calitp_itp_id", "route_id"]

    # A route_id can have several long/short names, so duplicate keys are
    # allowed on both sides ("m:m"). The original author noted that no rows
    # were left-only, so the inner join was not losing data at the time.
    return pd.merge(
        left=df,
        right=route_info_df,
        how="inner",
        on=join_keys,
        validate="m:m",
    )

In [None]:
# Disabled code, kept as a string literal so it does not run.
# When enabled it calls geography_utils.make_routes_shapefile for every calitp_itp_id
# in `agencies` and caches the result to ./routes.parquet.
'''
# Use geography_utils to assemble routes from shapes.txt
ITP_ID_LIST = list(agencies.calitp_itp_id.unique())

routes = (geography_utils.make_routes_shapefile(
            ITP_ID_LIST, CRS = geography_utils.WGS84, 
            alternate_df=None)
          .to_parquet("./routes.parquet")
         )
'''

In [None]:
# Pair each shape geometry with the route_id(s) recorded for it in `trips`.
# The inner join drops shape_ids that have no matching trip row (about 1,600
# left-only observations were noted when this was written). A shape is
# expected once on the left but may match many trip rows ("1:m").
routes1 = pd.merge(
    left=routes,
    right=trips,
    how="inner",
    on=["calitp_itp_id", "shape_id"],
    validate="1:m",
)

# Add the route short/long names, then preview the first two rows.
routes_part1 = attach_route_name(routes1, route_info)
routes_part1.head(2)

### Add operator-routes that aren't found in `shapes.txt`
For agencies that don't publish `shapes.txt`, go to their `stops.txt`, string together the stop sequence for each trip, and draw a line through those stops.

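What's the difference between `list(missing_trips.calitp_itp_id.unique())` and `missing_trips.calitp_itp_id.unique().tolist()`?

Calling `list(...)` on the NumPy array returned by `.unique()` keeps each element as a NumPy scalar (e.g. `numpy.int64`), whereas `.tolist()` converts the elements to native Python types (e.g. `int`). `list(missing_trips.calitp_itp_id.unique())` doesn't work with the loop, presumably because the query filter can't handle NumPy scalars, so the loop uses `.tolist()` instead.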


In [None]:
# Find the stops that aren't in `shapes.txt`
# Find the trips whose shape_id is not among the routes built from `shapes.txt`.
# Step 1: pull the distinct operator / route / shape / trip combinations.
trips_with_ids = (
    tbl.gtfs_schedule.trips()
    >> select(_.calitp_itp_id, _.route_id, _.shape_id, _.trip_id)
    >> distinct()
    >> collect()
)

# Step 2: on the collected frame, keep rows whose shape_id is absent from routes_part1.
missing_trips = trips_with_ids >> filter(
    ~_.shape_id.isin(routes_part1.shape_id)
)

missing_trips.head(2)

In [None]:
# Report how many operators appear in `missing_trips`, then list their IDs.
print(
    f"# operators with missing trips: {missing_trips.calitp_itp_id.nunique()}",
    f"operators: {list(missing_trips.calitp_itp_id.unique())}",
    sep="\n",
)

In [None]:
def grab_missing_stops(ITP_ID, missing_trips_df, stops_df):
    """Return one operator's stop_times rows for the given trips, with stop columns attached.

    Parameters
    ----------
    ITP_ID
        Value compared against `calitp_itp_id` to restrict the stop_times query.
    missing_trips_df : DataFrame
        Trips to keep; joined on `calitp_itp_id` and `trip_id`.
    stops_df : DataFrame
        Stop attributes; joined on `calitp_itp_id` and `stop_id`.

    Returns
    -------
    DataFrame
        Collected stop_times rows inner-joined to both input frames.
    """
    # The query is restricted to a single operator before collect() because
    # collecting the whole table was restarting the kernel. The joins cannot
    # be done before collect(), so callers loop over operators instead.
    operator_stop_times = (
        tbl.gtfs_schedule.stop_times()
        >> filter(_.calitp_itp_id == ITP_ID)
        >> select(_.calitp_itp_id, _.stop_id, _.stop_sequence, _.trip_id)
        >> distinct()
        >> collect()
    )

    # Keep only stop_times that belong to the requested trips.
    stop_times_for_trips = operator_stop_times >> inner_join(
        _, missing_trips_df, ["calitp_itp_id", "trip_id"]
    )

    # Attach the stop-level columns.
    return stop_times_for_trips >> inner_join(
        _, stops_df, ["calitp_itp_id", "stop_id"]
    )

In [None]:
# Unique calitp_itp_id values present in `missing_trips`, converted with .tolist().
LOOP_ME = missing_trips.calitp_itp_id.unique().tolist()
# Disabled code, kept as a string literal so it does not run.
# When enabled it calls grab_missing_stops once per operator in LOOP_ME, accumulates
# the sorted results, and caches them to ./missing_trips_stops.parquet.
# NOTE(review): the disabled loop uses DataFrame.append, which was removed in pandas 2.0;
# switch to pd.concat before re-enabling it on a newer pandas.
'''
missing_trips_stops = pd.DataFrame()
for ITP_ID in LOOP_ME:
    df = grab_missing_stops(ITP_ID, missing_trips, stops)
    
    missing_trips_stops = (missing_trips_stops.append(df)
                           .sort_values(["calitp_itp_id", "trip_id", "stop_sequence"])
                           .reset_index(drop=True)
                          )

missing_trips_stops.to_parquet("./missing_trips_stops.parquet")    
'''

In [None]:
# Disabled code, kept as a string literal so it does not run.
# When enabled it reloads ./missing_trips_stops.parquet, renames the stop columns to the
# shape_pt_* names, passes the frame to geography_utils.make_routes_shapefile as
# `alternate_df`, and caches the result to ./missing_routes.parquet.
# NOTE(review): `missing_trips_stops` is only assigned inside disabled code, so it is
# not defined in memory unless that code is re-enabled.
'''
missing_trips_stops = pd.read_parquet("./missing_trips_stops.parquet")

# Rename colums to match what's used in geography_utils
missing_trips_stops = missing_trips_stops.rename(
    columns = {"stop_lon": "shape_pt_lon", 
              "stop_lat": "shape_pt_lat",
               "stop_sequence": "shape_pt_sequence",
              }
)


missing_trips_stops2 = geography_utils.make_routes_shapefile(
    LOOP_ME, CRS = geography_utils.WGS84, alternate_df=missing_trips_stops)


missing_trips_stops2.to_parquet("./missing_routes.parquet")
'''

# Load the cached result of the disabled step above.
missing_trips_stops2 = gpd.read_parquet("./missing_routes.parquet")

In [None]:
# Operator / route_id / shape_id keys for the stop-derived route lines.
# These are read from the cached parquet file instead of an in-memory
# `missing_trips_stops`: that variable is only assigned inside disabled
# (string-literal) code, so referencing it on a fresh run raises NameError.
# The key columns read here are not touched by the column renaming in the
# disabled code, so the cached file carries them under the same names.
missing_route_keys = pd.read_parquet(
    "./missing_trips_stops.parquet",
    columns=["calitp_itp_id", "route_id", "shape_id"],
).drop_duplicates()

# Pair each stop-derived line with its route_id(s); a shape is expected once
# on the left but may match many key rows ("1:m").
routes2 = pd.merge(
    missing_trips_stops2,
    missing_route_keys,
    on = ["calitp_itp_id", "shape_id"],
    how = "inner",
    validate = "1:m",
)

# Add the route short/long names, then preview the first two rows.
routes_part2 = attach_route_name(routes2, route_info)
routes_part2.head(2)

In [None]:
# Use `trips` table to merge `shape_id` to `route_id`

## Stops

Create a stops shapefile with point geometry, each row representing a stop for an operator-route.

* Keep `lat`, `lon` columns as numeric, as well as `geometry` column
* Link `stop_id` column back to stop identifiers (route_id, route_name, stop_code, stop_name).
* Link calitp_itp_id back to operator name.

In [None]:
# Disabled code, kept as a string literal so it does not run.
# When enabled it builds point geometry from stop_lon / stop_lat, keeps one row per
# (calitp_itp_id, stop_id) after sorting by stop_code, and caches the result to
# ./stops_with_geom.parquet.
'''
stops = pd.read_parquet("./stops.parquet")

stops = geography_utils.create_point_geometry(stops, 
                        longitude_col = "stop_lon", 
                        latitude_col = "stop_lat", 
                        crs = geography_utils.WGS84
                        )

# There are a couple of duplicates when looking at ID-stop_id (but diff stop_code)
# Drop these, since stop_id is used to merge with route_id
stops = (stops
         .sort_values(["calitp_itp_id", "stop_id", "stop_code"])
         .drop_duplicates(subset=["calitp_itp_id", "stop_id"])
         .reset_index(drop=True)
)

stops.to_parquet("./stops_with_geom.parquet")
'''

# Load the cached stops with geometry; this rebinds `stops`, replacing the frame
# loaded from ./stops.parquet earlier in the notebook.
stops = gpd.read_parquet("./stops_with_geom.parquet")
stops.head(2)

In [None]:
# Disabled code, kept as a string literal so it does not run.
# When enabled it joins stop_times to trips on (calitp_itp_id, trip_id), keeps the
# distinct (calitp_itp_id, stop_id, route_id) combinations, and caches them.
# NOTE(review): the cache path "./stops_with_route_id" has no .parquet extension, unlike
# the other cache files; the write and the read below use the same path, so it works.
'''
stops_with_route = (
    tbl.gtfs_schedule.stop_times()    
    >> select(_.calitp_itp_id, _.stop_id, _.trip_id)
    # join on trips table using trip_id to get route_id
    >> inner_join(_, 
                  (tbl.gtfs_schedule.trips()
                   >> select(_.calitp_itp_id, _.route_id, _.trip_id)
                  ),
                  ["calitp_itp_id", "trip_id"]
                 )
    # Keep stop_id and route_id, no longer need trip info
    >> select(_.calitp_itp_id, _.stop_id, _.route_id)
    >> distinct()
    >> collect()
)
stops_with_route.to_parquet("./stops_with_route_id")

'''
# Load the cached stop-to-route lookup and preview it.
stops_with_route = pd.read_parquet("./stops_with_route_id")

stops_with_route.head(2)

In [None]:
# Attach route_id(s) to each stop. The inner join drops the stops that are
# not linked to any route (about 6,000 left-only rows were noted), because
# only stops with full information are wanted. Each stop is expected once on
# the left and may match many routes ("1:m").
stops_with_route2 = pd.merge(
    left=stops,
    right=stops_with_route,
    how="inner",
    on=["calitp_itp_id", "stop_id"],
    validate="1:m",
)

stops_with_route2.head(2)



In [None]:
# Attach route short/long names to each stop-route row. The left join keeps
# the rows whose route_id has no names (121 left-only observations were
# noted). Many-to-many is expected: the left side repeats routes across
# stops, and some route_ids carry several long/short names on the right.
stops_with_route3 = pd.merge(
    left=stops_with_route2,
    right=route_info,
    how="left",
    on=["calitp_itp_id", "route_id"],
    validate="m:m",
)

stops_with_route3.head(2)

In [None]:
# Read the cached agencies table from disk and preview the first two rows.
# NOTE(review): the same file is already read near the top of the notebook; this
# re-read looks redundant unless this section is run on its own -- confirm.
agencies = pd.read_parquet("./agencies.parquet")
agencies.head(2)

In [None]:
# Collapse the long agencies table to one row per calitp_itp_id, gathering
# the remaining columns' values into lists. Users will look up stops by ID
# but want to see every agency associated with that ID.
agency_lists = agencies.groupby("calitp_itp_id").agg(pd.Series.tolist)

# Bring calitp_itp_id back as a column and mark the list-valued columns.
agencies2 = agency_lists.reset_index().rename(
    columns={
        "agency_id": "agency_id_list",
        "agency_name": "agency_name_list",
    }
)

agencies2.head(2)

In [None]:
# Attach the per-operator agency lists to every stop-route row. The left
# join keeps rows without agency info; each calitp_itp_id must appear at
# most once in agencies2 ("m:1").
stops_with_route4 = pd.merge(
    left=stops_with_route3,
    right=agencies2,
    how="left",
    on="calitp_itp_id",
    validate="m:1",
)

stops_with_route4.head(2)