# Create routes and bus stops shapefiles

Create 2 datasets to upload to ArcGIS:
1. every stop, along with the routes that serve that stop
1. every route, with a line representing the route, either taken from `shapes.txt` or constructed from `stops.txt`

In [1]:
import geopandas as gpd
import pandas as pd
import os

from shared_utils import geography_utils

os.environ["CALITP_BQ_MAX_BYTES"] = str(100_000_000_000)
pd.set_option("display.max_rows", 20)

from calitp.tables import tbl
from calitp import query_sql
from siuba import *



In [2]:
# The triple-quoted block below is a bare string literal, so the warehouse
# queries inside it are NOT executed. It is kept as a record of how the cached
# parquet files read at the bottom of this cell were produced.
# NOTE(review): every assignment inside the string ends in `.to_parquet(...)`;
# pandas' `to_parquet` returns None when given a path, so if this code is
# re-enabled the names are bound to None until the reads below rebind them.
'''
stops = (
    tbl.gtfs_schedule.stops()
    >> select(_.calitp_itp_id, _.stop_id, 
              _.stop_lat, _.stop_lon, 
              _.stop_name, _.stop_code
             )
    >> distinct()
    >> collect()
).to_parquet("./stops.parquet")


trips = (
    tbl.gtfs_schedule.trips()
    >> select(_.calitp_itp_id, _.route_id, _.shape_id)
    >> distinct()
    >> collect()
).to_parquet("./trips.parquet")


route_info = (
    tbl.gtfs_schedule.routes()
    # NB/SB may share same route_id, but different short/long names
    >> select(_.calitp_itp_id, _.route_id, 
              _.route_short_name, _.route_long_name)
    >> distinct()
    >> collect()
).to_parquet("./route_info.parquet")


agencies = (
    tbl.gtfs_schedule.agency()
    >> select(_.calitp_itp_id, _.agency_id, _.agency_name)
    >> distinct()
    >> collect()
).to_parquet("./agencies.parquet")
'''

# Load the cached tables from local parquet files (paths are relative to the
# notebook's working directory).
stops = pd.read_parquet("./stops.parquet")
trips = pd.read_parquet("./trips.parquet")
route_info = pd.read_parquet("./route_info.parquet")
agencies = pd.read_parquet("./agencies.parquet")
# `routes` holds line geometry, so it is read with geopandas. The disabled code
# that writes ./routes.parquet sits in a later cell (make_routes_shapefile).
routes = gpd.read_parquet("./routes.parquet")

## Routes

Create a routes shapefile with line geometry, each row representing a route for an operator.

* Link `shape_id` column back to route identifiers (route_id, route_name).
* Link `calitp_itp_id` back to operator name.

Traffic Signals - ArcGIS geodatabase (gdb) with all traffic signals maintained by Caltrans. They want to know which signals should have preemption, and what percentage of the signals that should have it already have it enabled.

### Add operator-routes that exist in `shapes.txt`

In [3]:
# Helper: add route short/long names to operator-route rows
def attach_route_name(df, route_info_df):
    """Attach route names from ``gtfs_schedule.routes`` to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame (or GeoDataFrame)
        Rows to enrich; must contain ``calitp_itp_id`` and ``route_id``.
    route_info_df : pandas.DataFrame
        Route information keyed by ``calitp_itp_id`` and ``route_id``.

    Returns
    -------
    The inner merge of the two frames on ``calitp_itp_id`` + ``route_id``.
    """
    merge_keys = ["calitp_itp_id", "route_id"]

    # Inner join: the author observed that no rows are left_only.
    # Many-to-many is allowed because a route_id can carry several
    # long/short names.
    return pd.merge(
        left=df,
        right=route_info_df,
        how="inner",
        on=merge_keys,
        validate="m:m",
    )

In [4]:
# Disabled (bare string literal, not executed): one-off build of
# ./routes.parquet with geography_utils.make_routes_shapefile for every
# calitp_itp_id found in `agencies`. Because the string is the only expression
# in this cell, the notebook echoes it back as the cell output.
'''
# Use geography_utils to assemble routes from shapes.txt
ITP_ID_LIST = list(agencies.calitp_itp_id.unique())

routes = (geography_utils.make_routes_shapefile(
            ITP_ID_LIST, CRS = geography_utils.WGS84, 
            alternate_df=None)
          .to_parquet("./routes.parquet")
         )
'''

'\n# Use geography_utils to assemble routes from shapes.txt\nITP_ID_LIST = list(agencies.calitp_itp_id.unique())\n\nroutes = (geography_utils.make_routes_shapefile(\n            ITP_ID_LIST, CRS = geography_utils.WGS84, \n            alternate_df=None)\n          .to_parquet("./routes.parquet")\n         )\n'

In [5]:
# Bring route_id onto every shape through the trips table.
# Inner join: there are shape_ids that are left_only (1,600 obs) and
# those rows are dropped. Each shape row may match many trips rows (1:m).
routes1 = pd.merge(
    left=routes,
    right=trips,
    how="inner",
    on=["calitp_itp_id", "shape_id"],
    validate="1:m",
)

# Add the route short/long names, then preview the result
routes_part1 = attach_route_name(routes1, route_info)
routes_part1.head(2)

Unnamed: 0,calitp_itp_id,shape_id,geometry,route_id,route_short_name,route_long_name
0,0,25224,"LINESTRING (-118.47129 33.98836, -118.47160 33...",3304,1,Main St & Santa Monica Blvd/UCLA
1,0,25225,"LINESTRING (-118.47129 33.98836, -118.47160 33...",3304,1,Main St & Santa Monica Blvd/UCLA


### Add operator-routes that aren't found in `shapes.txt`
For agencies that don't publish `shapes.txt`, go to their `stops.txt`, string together the stop sequences, and draw a line through them.

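What's the difference between `list(missing_trips.calitp_itp_id.unique())` and `missing_trips.calitp_itp_id.unique().tolist()`?

`list(missing_trips.calitp_itp_id.unique())` doesn't work with the loop. A likely cause (to be confirmed): `.unique()` returns a NumPy array, so wrapping it in `list()` leaves NumPy scalars (e.g. `numpy.int64`) as the list elements, whereas `.tolist()` converts them to native Python `int`s.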


In [6]:
# Use `trips` table to merge `shape_id` to `route_id`

## Stops

Create a stops shapefile with point geometry, each row representing a stop for an operator-route.

* Keep `lat`, `lon` columns as numeric, as well as `geometry` column
* Link `stop_id` column back to stop identifiers (route_id, route_name, stop_code, stop_name).
* Link `calitp_itp_id` back to operator name.

In [7]:
# Disabled (bare string literal, not executed): one-off step that turned the
# cached stops table into point geometry, de-duplicated it on
# calitp_itp_id + stop_id, and wrote ./stops_with_geom.parquet.
'''
stops = pd.read_parquet("./stops.parquet")

stops = geography_utils.create_point_geometry(stops, 
                        longitude_col = "stop_lon", 
                        latitude_col = "stop_lat", 
                        crs = geography_utils.WGS84
                        )

# There are a couple of duplicates when looking at ID-stop_id (but diff stop_code)
# Drop these, since stop_id is used to merge with route_id
stops = (stops
         .sort_values(["calitp_itp_id", "stop_id", "stop_code"])
         .drop_duplicates(subset=["calitp_itp_id", "stop_id"])
         .reset_index(drop=True)
)

stops.to_parquet("./stops_with_geom.parquet")
'''

# Read the cached stops-with-geometry file with geopandas; this rebinds
# `stops`, which an earlier cell loaded from ./stops.parquet.
stops = gpd.read_parquet("./stops_with_geom.parquet")
# Preview the first two rows.
stops.head(2)

Unnamed: 0,calitp_itp_id,stop_id,stop_lat,stop_lon,stop_name,stop_code,geometry
0,0,1017,34.051842,-118.384115,ROBERTSON NB & CASHIO NS,2474,POINT (-118.38411 34.05184)
1,0,1021,34.031395,-118.396278,NATIONAL WB & BAGLEY FS,2477,POINT (-118.39628 34.03140)


In [8]:
# Disabled (bare string literal, not executed): warehouse query that joins
# stop_times to trips on calitp_itp_id + trip_id to get the distinct
# (calitp_itp_id, stop_id, route_id) combinations, then caches them locally.
# Note the cache path has no ".parquet" extension; the read below uses the
# same extension-less path.
'''
stops_with_route = (
    tbl.gtfs_schedule.stop_times()    
    >> select(_.calitp_itp_id, _.stop_id, _.trip_id)
    # join on trips table using trip_id to get route_id
    >> inner_join(_, 
                  (tbl.gtfs_schedule.trips()
                   >> select(_.calitp_itp_id, _.route_id, _.trip_id)
                  ),
                  ["calitp_itp_id", "trip_id"]
                 )
    # Keep stop_id and route_id, no longer need trip info
    >> select(_.calitp_itp_id, _.stop_id, _.route_id)
    >> distinct()
    >> collect()
)
stops_with_route.to_parquet("./stops_with_route_id")

'''
# Load the cached stop-to-route lookup.
stops_with_route = pd.read_parquet("./stops_with_route_id")

# Preview the first two rows.
stops_with_route.head(2)

Unnamed: 0,calitp_itp_id,stop_id,route_id
0,260,260,BCT109 SB
1,120,247,3


In [9]:
# Pair every stop (with its geometry) with the route_ids that serve it.
# Inner join on purpose: about 6,000 rows are left_only (a stop_id not
# linked with any route); those are dropped because we want full information.
# Each stop row may match many stop-route rows (1:m).
stops_with_route2 = pd.merge(
    left=stops,
    right=stops_with_route,
    how="inner",
    on=["calitp_itp_id", "stop_id"],
    validate="1:m",
)

stops_with_route2.head(2)


Unnamed: 0,calitp_itp_id,stop_id,stop_lat,stop_lon,stop_name,stop_code,geometry,route_id
0,0,1017,34.051842,-118.384115,ROBERTSON NB & CASHIO NS,2474,POINT (-118.38411 34.05184),3310
1,0,1021,34.031395,-118.396278,NATIONAL WB & BAGLEY FS,2477,POINT (-118.39628 34.03140),3318


In [10]:
# Attach route short/long names to every stop-route row.
# Left join on purpose: 121 obs are left_only (they have a route_id but no
# names associated) and are kept.
# m:m is allowed: many on the left is expected, and many on the right occurs
# because some route_ids have different long/short names.
stops_with_route3 = pd.merge(
    left=stops_with_route2,
    right=route_info,
    how="left",
    on=["calitp_itp_id", "route_id"],
    validate="m:m",
)

stops_with_route3.head(2)

Unnamed: 0,calitp_itp_id,stop_id,stop_lat,stop_lon,stop_name,stop_code,geometry,route_id,route_short_name,route_long_name
0,0,1017,34.051842,-118.384115,ROBERTSON NB & CASHIO NS,2474,POINT (-118.38411 34.05184),3310,7,Pico Blvd
1,0,1021,34.031395,-118.396278,NATIONAL WB & BAGLEY FS,2477,POINT (-118.39628 34.03140),3318,17,Culver City Sta - UCLA


In [11]:
# Re-read the agency table from the cached parquet file (the same path is
# also loaded into `agencies` in the first data-loading cell).
agencies = pd.read_parquet("./agencies.parquet")
# Preview the first two rows.
agencies.head(2)

Unnamed: 0,calitp_itp_id,agency_id,agency_name
0,8,Monterey-Salinas Transit,Monterey-Salinas Transit
1,208,Monterey-Salinas Transit,Monterey-Salinas Transit


In [12]:
# Collapse the long agency table to one row per calitp_itp_id, gathering that
# ID's agency_id and agency_name values into lists.
# Users will look up stops by ID, but want to see the list of agencies the
# ID is associated with.
agencies2 = (
    agencies
    .groupby("calitp_itp_id")
    .agg(pd.Series.tolist)
    .reset_index()
    .rename(columns={"agency_id": "agency_id_list",
                     "agency_name": "agency_name_list"})
)

agencies2.head(2)

Unnamed: 0,calitp_itp_id,agency_id_list,agency_name_list
0,0,[6216179],[Big Blue Bus]
1,1,"[Long Beach Transit, 1, 36, FS, 78, 261]","[Long Beach Transit, Glendale Beeline, Fairfie..."


In [13]:
# Add the per-ID agency lists to every stop-route row.
# agencies2 has one row per calitp_itp_id (it was grouped on that column),
# hence the m:1 check; the left join keeps every stop-route row.
stops_with_route4 = pd.merge(
    left=stops_with_route3,
    right=agencies2,
    how="left",
    on="calitp_itp_id",
    validate="m:1",
)

stops_with_route4.head(2)

Unnamed: 0,calitp_itp_id,stop_id,stop_lat,stop_lon,stop_name,stop_code,geometry,route_id,route_short_name,route_long_name,agency_id_list,agency_name_list
0,0,1017,34.051842,-118.384115,ROBERTSON NB & CASHIO NS,2474,POINT (-118.38411 34.05184),3310,7,Pico Blvd,[6216179],[Big Blue Bus]
1,0,1021,34.031395,-118.396278,NATIONAL WB & BAGLEY FS,2477,POINT (-118.39628 34.03140),3318,17,Culver City Sta - UCLA,[6216179],[Big Blue Bus]
