# Debug transit routes on SHN

Some routes aren't in `shapes.txt`, or they don't get picked up when we merge `trips` and `dim_trips`.

Foothill Transit is one example (id 112).

It has routes that run on the SHN (State Highway Network), but they don't show up in the service increase estimator, probably because the feed doesn't have `shapes` info.

Bigger question: `create_routes_data` builds the routes from `gtfs_schedule`, which is the latest data. That is what Hunter wants, especially if it's going to be pushed to the open data portal.

But sometimes we want to assemble the routes shapefile for a selected date. Should we allow for that? If it's not a date in the past, then just proceed with `gtfs_schedule.trips`.

The issue: `trips` lets us select a service date, but Foothill Transit doesn't appear there with a `trip_key`. `dim_trips` does have `trip_key` for Foothill Transit, but it doesn't allow `service_date` selection.

In [1]:
import branca
import datetime as dt
import geopandas as gpd
import intake
import os
import pandas as pd

# Set CALITP_BQ_MAX_BYTES to 100 billion before the calitp imports below run.
# NOTE(review): presumably this raises the BigQuery bytes-scanned cap used by
# the calitp tooling -- confirm.
os.environ["CALITP_BQ_MAX_BYTES"] = str(100_000_000_000)

from calitp_data_analysis.tables import tbls
from calitp_data_analysis.sql import query_sql
from siuba import *

import create_parallel_corridors
from bus_service_utils import utils
from calitp_data_analysis import portfolio_utils
from calitp_data_analysis import calitp_color_palette as cp

IMG_PATH = create_parallel_corridors.IMG_PATH
DATA_PATH = create_parallel_corridors.DATA_PATH

catalog = intake.open_catalog("*.yml")



In [2]:
# Service date used to filter the daily trips table in the next cell.
SELECTED_DATE = dt.date(2022, 1, 6)
# Operator under investigation: Foothill Transit (ITP ID 112).
ITP_ID = 112

In [3]:
# Daily trips for the selected operator on the selected service date,
# pulled into a DataFrame by the final `collect()`.
# Uses `tbls`, the name this notebook imports from
# calitp_data_analysis.tables; `tbl` is never imported from that package here.
# NOTE(review): confirm `views.gtfs_schedule_fact_daily_trips` is still
# exposed through `tbls`.
trips = (
    tbls.views.gtfs_schedule_fact_daily_trips()
    >> filter(_.service_date == SELECTED_DATE)
    >> filter(_.calitp_itp_id == ITP_ID)
    # >> select(_.calitp_itp_id, _.service_date,
    #           _.route_id, _.trip_key, _.trip_id)
    >> collect()
)

#dim_trips = (tbl.views.gtfs_schedule_dim_trips()
#             >> filter(_.calitp_itp_id==ITP_ID)
         #>> select(_.calitp_itp_id, _.route_id, 
         #          _.shape_id, _.trip_key, _.trip_id)
#             >> collect()
#            )
'''
shapes = (trips 
      >> inner_join(_, dim_trips, 
                    on = ["calitp_itp_id", "trip_key", "route_id", "trip_id"]
                   )
      >> select(_.calitp_itp_id, _.route_id, _.shape_id, _.trip_id)
      >> distinct()
      >> collect()
     )
'''



'\nshapes = (trips \n      >> inner_join(_, dim_trips, \n                    on = ["calitp_itp_id", "trip_key", "route_id", "trip_id"]\n                   )\n      >> select(_.calitp_itp_id, _.route_id, _.shape_id, _.trip_id)\n      >> distinct()\n      >> collect()\n     )\n'

In [5]:
trips

Unnamed: 0,feed_key,trip_key,trip_id,route_id,calitp_itp_id,calitp_url_number,service_id,service_date,service_indicator,service_start_date,...,service_inclusion,service_exclusion,is_in_service,calitp_extracted_at,calitp_deleted_at,n_stops,n_stop_times,trip_first_departure_ts,trip_last_arrival_ts,service_hours


In [4]:
trips.service_date.max()

nan

In [None]:
# NOTE(review): the `dim_trips` assignment earlier in this notebook is
# commented out, so this cell only works if `dim_trips` is defined another way.
dim_trips.columns

In [None]:
full_routes = gpd.read_parquet("../traffic_ops/data/routes_assembled.parquet")

In [None]:
foothill = full_routes[full_routes.itp_id==112][["itp_id", "shape_id", "route_id"]]

In [None]:
# Latest service date present in `t1`.
# NOTE(review): `t1` is assigned in a later cell (`t1 = trips >> collect()`);
# run that cell first.
t1.service_date.max()

In [None]:
# Attach Foothill's assembled shape_ids to the daily trips, matching on the
# operator ID (named differently on each side) and route_id.
t1.merge(
    foothill,
    left_on=["calitp_itp_id", "route_id"],
    right_on=["itp_id", "route_id"],
)

In [None]:
# Materialize `trips` as `t1`.
# NOTE(review): the cell that builds `trips` already ends with `>> collect()`,
# so this looks like collecting an already-collected result -- confirm it is
# still needed.
t1 = trips >> collect()

In [None]:
t1.service_date.min()

In [None]:
# Materialize `dim_trips` as `t2`.
# NOTE(review): depends on `dim_trips`, whose assignment is commented out
# earlier in this notebook.
t2 = dim_trips >> collect()

In [None]:
# Join daily trips to dim_trips on the shared keys (pandas' default inner
# join) to see which trips carry over.
t1.merge(
    t2,
    on=["calitp_itp_id", "route_id", "trip_key", "trip_id"],
)

In [None]:
'''
# Need route_ids for parallel corridors
# Add this info on and use alternate_df in make_analysis_data()
SELECTED_DATE = dt.date(2022, 1, 6)

trips = (tbl.views.gtfs_schedule_fact_daily_trips()
         >> filter(_.service_date == SELECTED_DATE, _.is_in_service == True)
         >> select(_.calitp_itp_id, _.service_date, 
                   _.route_id, _.trip_key)
)

dim_trips = (tbl.views.gtfs_schedule_dim_trips()
             >> select(_.calitp_itp_id, _.route_id, 
                       _.shape_id, _.trip_key)
)

shapes = (trips 
          >> inner_join(_, dim_trips, 
                        on = ["calitp_itp_id", "trip_key", "route_id"]
                       )
          >> select(_.calitp_itp_id, _.route_id, _.shape_id)
          >> distinct()
          >> collect()
         )
'''

In [None]:
'''
transit_routes = catalog.transit_routes.read()

df = pd.merge(transit_routes,
              shapes,
              on = ["calitp_itp_id", "shape_id"],
              # Outer join shows there are left_only and right_only obs
              # But, can only do stuff with full info
              how = "inner",
              # There are some obs where same shape_id is linked to multiple route_id
              # Allow for 1:m
              validate = "1:m",
).rename(columns = {"calitp_itp_id": "itp_id"})
'''

In [None]:
'''
create_parallel_corridors.make_analysis_data(
    hwy_buffer_feet=50, alternate_df = df,
    pct_route_threshold = 0.3, pct_highway_threshold = 0.1,
    DATA_PATH = DATA_PATH, FILE_NAME = "routes_on_shn"
)
'''

In [None]:
transit_routes = catalog.transit_routes.read()


In [None]:
gdf = gpd.read_parquet("./data/transit_routes.parquet")

In [None]:
utils.GCS_FILE_PATH

In [None]:
gdf[gdf.itp_id==112]

In [None]:
# Routes compared against the SHN: load, then keep only rows flagged parallel
df = (
    gpd.read_parquet(f"{DATA_PATH}routes_on_shn.parquet")
    .loc[lambda routes: routes.parallel == 1]
    .reset_index(drop=True)
)

# Highway geometries
# NOTE(review): the original note says these are drawn with a 250 ft buffer;
# nothing in this cell applies one -- confirm where the buffer comes from.
highways = gpd.read_parquet(f"{DATA_PATH}highways.parquet")

# Service hours increase data; rename the operator ID column to `itp_id`
# so it lines up with `df`
service = pd.read_parquet(
    f"{utils.GCS_FILE_PATH}2022_Jan/service_increase.parquet"
).rename(columns={"calitp_itp_id": "itp_id"})

Address duplicates.

Duplicates mean the same `shape_id` is linked to multiple `route_id` values.

This has been allowed up until now, but should it be allowed for aggregation?
Would it double-count? Leave for now, but we might need to drop the duplicates.

In [None]:
check_shapes = ["107"]
check_ids = [194]

df[(df.itp_id.isin(check_ids)) & (df.shape_id.isin(check_shapes))]

In [None]:
service[(service.itp_id.isin(check_ids)) & (service.shape_id.isin(check_shapes))]

In [None]:
# Outer merge with the indicator column, to count how many (itp_id, shape_id)
# pairs appear in both tables versus only one of them.
# validate="m:m" permits duplicate keys on both sides.
service_increase = service.merge(
    df,
    how="outer",
    on=["itp_id", "shape_id"],
    validate="m:m",
    indicator=True,
)

service_increase["_merge"].value_counts()

In [None]:
# Keep only the (itp_id, shape_id) pairs present in both the service increase
# data and the parallel routes; duplicate keys are permitted on both sides.
service_increase = service.merge(
    df,
    how="inner",
    on=["itp_id", "shape_id"],
    validate="m:m",
)

In [None]:
print(f"# unique route ids originally: {df.route_id.nunique()}")
print(f"# unique route ids with service hrs info: {service_increase.route_id.nunique()}")

In [None]:
service_increase.columns

In [None]:
# Service columns passed as `sum_cols` to the aggregation below
sum_cols = [
    "additional_trips",
    "service_hrs",
    "addl_service_hrs",
    "service_hours_annual",
    "addl_service_hrs_annual",
]

# Aggregate by operator, day of week and tract type
# (presumably sums `sum_cols` within each group -- confirm against
# portfolio_utils.aggregate_by_geography)
a1 = portfolio_utils.aggregate_by_geography(
    service_increase,
    group_cols=["itp_id", "day_name", "tract_type"],
    sum_cols=sum_cols,
)

In [None]:
# Can't find Foothill Transit (itp_id 112)

In [None]:
service_increase[(service_increase.itp_id==182)].route_id.value_counts()

In [None]:
service_increase[(service_increase.itp_id==182) & 
                 (service_increase.route_id.str.contains("910")) & 
                 (service_increase.day_name=="Thursday")
                ][["itp_id", "day_name", "tract_type", "departure_hour"] + sum_cols]

In [None]:
# Operators to spot-check, keyed by ITP ID
MAP_ME = {
    182: "LA Metro",
    294: "SJ Valley Transportation Authority",
    279: "BART",
    282: "SF Muni",
    278: "SD Metropolitan Transit System",
    112: "Foothill Transit",
}

# Columns displayed for each operator's parallel routes
cols = [
    "route_id", "total_routes", "Route", "RouteType",
    "County", "District",
    "pct_route", "pct_highway",
]

# Print a header per operator, then show that operator's rows from `df`
for itp_id, operator in MAP_ME.items():
    subset = df.loc[df["itp_id"] == itp_id]
    print(
        f"{itp_id}: {operator}",
        "**************************************************",
        sep="\n",
    )
    display(subset[cols])

## Make map of these parallel routes for CA

Double-check that these are as expected before calculating additional trips, buses, capital expenditures, etc.

In [None]:
def data_to_plot(df):
    """Trim the routes data to the map columns and thicken the geometry.

    Keeps the operator/route IDs, the highway attributes, the overlap
    percentages, the `parallel` flag and `geometry`, resets the index, then
    replaces each geometry with its 200-unit buffer simplified at a
    tolerance of 100.

    NOTE(review): the buffer and tolerance units follow the CRS of `df`
    (presumably feet) -- confirm.

    Returns a new frame; `df` itself is not modified.
    """
    map_cols = [
        "itp_id", "route_id",
        "Route", "County", "District", "RouteType",
        "pct_route", "pct_highway", "parallel",
        "geometry",
    ]
    trimmed = df[map_cols].reset_index(drop=True)

    # Buffer first, then simplify the resulting polygons
    thickened = trimmed.geometry.buffer(200).simplify(tolerance=100)

    return trimmed.assign(geometry=thickened)

# Data for the two layers: simplified transit routes and the highways
to_map = data_to_plot(df)
hwy_df = highways

# Column name -> display label, used for both popups and tooltips
hwys_popup_dict = dict(
    Route="Highway Route",
    RouteType="Route Type",
    County="County",
)

transit_popup_dict = dict(
    itp_id="Operator ITP ID",
    route_id="Route ID",
    pct_route="% overlapping route",
    pct_highway="% overlapping highway",
)

# Color scales for each layer
hwys_color = branca.colormap.StepColormap(colors=["black", "gray"])

colorscale = branca.colormap.StepColormap(
    colors=[
        cp.CALITP_CATEGORY_BRIGHT_COLORS[0],  # blue
        cp.CALITP_CATEGORY_BRIGHT_COLORS[1],  # orange
    ],
)

# One entry per map layer; each names its data, the column to plot,
# the popup/tooltip labels and the color scale
LAYERS_DICT = {
    "Highways": dict(
        df=hwy_df,
        plot_col="Route",
        popup_dict=hwys_popup_dict,
        tooltip_dict=hwys_popup_dict,
        colorscale=hwys_color,
    ),
    "Transit Routes": dict(
        df=to_map,
        plot_col="parallel",
        popup_dict=transit_popup_dict,
        tooltip_dict=transit_popup_dict,
        colorscale=colorscale,
    ),
}

# Legend image hosted in the data-analyses repo
LEGEND_URL = (
    "https://raw.githubusercontent.com/"
    "cal-itp/data-analyses/main/"
    "bus_service_increase/img/"
    "legend_intersecting_parallel.png"
)

# Legend image location plus its bottom/left placement values
LEGEND_DICT = dict(
    legend_url=LEGEND_URL,
    legend_bottom=85,
    legend_left=5,
)


# Build the statewide (CA) folium map from both layers plus the legend image.
# NOTE(review): `map_utils` is not imported in any cell visible in this
# notebook; import it before running this cell.
fig = map_utils.make_folium_multiple_layers_map(
    LAYERS_DICT,
    fig_width=700,
    fig_height=700,
    zoom=map_utils.REGION_CENTROIDS["CA"]["zoom"],
    centroid=map_utils.REGION_CENTROIDS["CA"]["centroid"],
    # Plain string: the previous f-prefix had no placeholders to fill
    title="Parallel Transit Routes to SHN",
    legend_dict=LEGEND_DICT,
)
    
#display(fig)
#fig.save(f"{IMG_PATH}parallel_{operator_name}.html")
#print(f"{operator_name} map saved")