# Refactor bus corridors

In [1]:
import datetime as dt
import geopandas as gpd
import numpy as np
import pandas as pd

from siuba import *

import B1_bus_corridors as bus_corridors
from A1_rail_ferry_brt import analysis_date
from utilities import GCS_FILE_PATH
from shared_utils import rt_utils

import dask.dataframe as dd
import dask_geopandas
import dask_utils

# analysis_date rendered as a string with rt_utils.FULL_DATE_FMT;
# interpolated into the cached-view file names read later in the notebook.
date_str = analysis_date.strftime(rt_utils.FULL_DATE_FMT)



## Debug ValueError

`merged = merge_routes_to_trips(routelines, trips)` throws the following error:

`ValueError: You are trying to merge on object and int32 columns. If you wish to proceed you should use pd.concat (one of them is empty)`

In [None]:
# Operator whose cached views are loaded for this debugging session.
itp_id = 323

# Cached views are read from the rt_utils bucket prefix
# (not the GCS_FILE_PATH imported from utilities).
FILE_PATH = f"{rt_utils.GCS_FILE_PATH}cached_views/"

# routelines and stops go through dask_geopandas; trips and stop_times
# are read as plain dask dataframes.
routelines = dask_geopandas.read_parquet(
    f"{FILE_PATH}routelines_{itp_id}_{date_str}.parquet"
)
trips = dd.read_parquet(
    f"{FILE_PATH}trips_{itp_id}_{date_str}.parquet"
)
stop_times = dd.read_parquet(
    f"{FILE_PATH}st_{itp_id}_{date_str}.parquet"
)
stops = dask_geopandas.read_parquet(
    f"{FILE_PATH}stops_{itp_id}_{date_str}.parquet"
)

# Full single-operator run and local export, left disabled:
#gdf = bus_corridors.single_operator_hqta(routelines, trips, stop_times, stops)
#gdf.to_parquet("./data/182_bus2.parquet")


## Debug FileNotFound error

The `routelines` file was not found. Confirmed: it is not in GCS.

## Debug utilities.create_segment error -- taking too long

This is Amtrak. Already excluded by Eric.

In [2]:
from calitp.tables import tbl

In [3]:
# Distinct calitp_itp_id values from the gtfs_schedule agency table,
# excluding ID 200, collected and converted to a plain Python list.
# NOTE(review): the reason ID 200 is excluded is not stated here -- confirm.
ITP_IDS = (tbl.gtfs_schedule.agency()
           >> distinct(_.calitp_itp_id)
           >> filter(_.calitp_itp_id != 200)
           >> collect()
).calitp_itp_id.tolist()

In [4]:
# Concatenation of the four ID collections defined in B1_bus_corridors.
# Going by the constant names, these are the operators that hit a ValueError,
# a FileNotFound error, ran too long, or already have output in GCS -- TODO confirm.
CAPTURED_IDS = (bus_corridors.VALUE_ERROR_IDS + 
                bus_corridors.FILE_NOT_FOUND_IDS + 
                bus_corridors.TOO_LONG_IDS + 
                bus_corridors.ITP_IDS_IN_GCS
               )

In [5]:
# IDs in the agency table that none of the bus_corridors collections contain.
set(ITP_IDS) - set(CAPTURED_IDS)

{481, 485}

In [10]:
# Show the agency rows for the two IDs (481, 485) that the set difference reported.
(tbl.gtfs_schedule.agency()
 >>filter((_.calitp_itp_id == 481) | (_.calitp_itp_id == 485) )
)

Unnamed: 0,calitp_itp_id,calitp_url_number,agency_id,agency_name,agency_url,agency_timezone,agency_lang,agency_phone,agency_fare_url,agency_email,calitp_extracted_at,calitp_hash,agency_key
0,485,0,TF,Treasure Island Ferry,https://tisf.com/,America/Los_Angeles,en,,,,2022-06-27,ApApxK5gs7GnGYMR4BiIHA==,1882703198881820869
1,481,0,SS,City of South San Francisco,http://www.ssf.net/SCS,Canada/Pacific,en,650-877-8550,,shuttle@ssf.net,2022-06-27,Afi4nrdkjauvRqasmm0aGA==,-5033576204119143470


In [6]:
# IDs listed in the bus_corridors collections that the agency table query did not return.
set(CAPTURED_IDS) - set(ITP_IDS)

{203}

In [None]:
#https://stackoverflow.com/questions/71688126/groupby-map-partitions-in-dask
def _count_trips_per_stop(df):
    """Count trip_id rows per (calitp_itp_id, stop_id) within one partition."""
    grouped = df.groupby(["calitp_itp_id", "stop_id"], as_index=False)
    return grouped.agg({"trip_id": "count"})


# The groupby runs independently on each partition.
# NOTE(review): a (calitp_itp_id, stop_id) pair that spans partitions would
# produce one row per partition -- verify partitioning if global counts are needed.
trips_by_stop_hour = stop_times.map_partitions(
    _count_trips_per_stop,
    meta={
        "calitp_itp_id": int,
        "stop_id": str,
        "trip_id": int,
    },
)

In [None]:
# Existing bus corridor output read from GCS, used as the comparison baseline below.
# Full path (presumably what GCS_FILE_PATH expands to -- confirm):
# calitp-analytics-data/data-analyses/high_quality_transit_areas/bus_corridors/182_bus.parquet
eric = gpd.read_parquet(f"{GCS_FILE_PATH}bus_corridors/182_bus.parquet")

In [None]:
# Local parquet at the same path the commented-out `gdf.to_parquet(...)` call
# earlier in this notebook writes to; compared against `eric` below.
tiff = gpd.read_parquet(f"./data/182_bus2.parquet")

In [None]:
def stats(df):
    """Print summary diagnostics for a bus corridor dataframe.

    Parameters
    ----------
    df : DataFrame
        Must expose the columns stop_id, am_max_trips, pm_max_trips
        and hqta_segment_id.

    Prints the column index, the dtypes, the row count, the number of unique
    stop_ids, the sums of am_max_trips and pm_max_trips, and the number of
    unique hqta_segment_ids. Returns None.
    """
    print(df.columns)
    print(df.dtypes)
    print(f"# obs: {len(df)}")
    print(f"# unique stops: {df.stop_id.nunique()}")
    print(f"sum am_max stops: {df.am_max_trips.sum()}")
    # The PM line previously summed am_max_trips a second time.
    print(f"sum pm_max stops: {df.pm_max_trips.sum()}")
    print(f"# unique hqta_segment_id: {df.hqta_segment_id.nunique()}")

    
def compare(eric, tiff):
    """Print the `stats` report for each dataframe under a labelled banner.

    `eric` is reported first, then `tiff`.
    """
    labelled_frames = (
        ("***********Eric************", eric),
        ("***********Tiff************", tiff),
    )
    for banner, frame in labelled_frames:
        print(banner)
        stats(frame)

In [None]:
# Print the summary diagnostics of both outputs one after the other for a visual comparison.
compare(eric, tiff)

In [None]:
# Reduce both outputs to the stop identifier plus the two max-trip columns,
# dropping exact duplicate rows so the later merge can be checked as 1:1 on stop_id.
keep_cols = ["stop_id", "am_max_trips", "pm_max_trips"]
eric2 = eric[keep_cols].drop_duplicates()
tiff2 = tiff[keep_cols].drop_duplicates()

In [None]:
# Outer-join the two per-stop tables on stop_id. validate="1:1" makes the
# merge raise if stop_id is duplicated on either side, and indicator=True adds
# the `_merge` column recording which side(s) each row came from.
m1 = pd.merge(
    eric2,
    tiff2,
    on="stop_id",
    how="outer",
    validate="1:1",
    indicator=True,
)

# Tally of rows by merge origin.
m1["_merge"].value_counts()

In [None]:
# For the stops that are in both outputs, show the rows whose AM max trip
# counts disagree (an empty result means both generated the same number of trips).
# One combined mask is used instead of filtering the "both" subset with a mask
# built on all of m1, which makes pandas reindex the mask and emit a UserWarning.
m1[(m1._merge == "both") & (m1.am_max_trips_x != m1.am_max_trips_y)]

In [None]:
# Stops present in both outputs whose PM max trip counts disagree.
# One combined mask is used instead of filtering the "both" subset with a mask
# built on all of m1, which makes pandas reindex the mask and emit a UserWarning.
m1[(m1._merge == "both") & (m1.pm_max_trips_x != m1.pm_max_trips_y)]

In [None]:
# Remove trips that only run during AM/PM peak, because those don't qualify as HQTA
def invalid_trips_only_peak_hours(df):
    """Return the trip_ids flagged as running only in peak periods.

    Parameters
    ----------
    df : dask dataframe
        Needs the columns calitp_itp_id, trip_id and time_of_day. The function
        calls `.compute()` on the filtered trip_id column, so a lazy (dask)
        dataframe is expected.

    Returns
    -------
    list
        Unique trip_id values of trips that have at least one "AM Peak" row,
        at least one "PM Peak" row, and no row in any other time_of_day.
    """
    trip_cols = ["calitp_itp_id", "trip_id"]

    # Vectorized 0/1 flags instead of a per-element lambda for each column.
    in_am_peak = df.time_of_day == "AM Peak"
    in_pm_peak = df.time_of_day == "PM Peak"

    df = df.assign(
        is_am_peak = in_am_peak.astype(int),
        is_pm_peak = in_pm_peak.astype(int),
        is_other = (~(in_am_peak | in_pm_peak)).astype(int),
    )

    # A flag's max per trip is 1 when any row of the trip falls in that period.
    # String aggregation names replace the np.max callables, which pandas
    # deprecates in favour of the named aggregation.
    df2 = (df.groupby(trip_cols)
           .agg({"is_am_peak": "max",
                 "is_pm_peak": "max",
                 "is_other": "max"})
           .reset_index()
          )

    # Drop trips that only run AM/PM
    # NOTE(review): this requires BOTH AM and PM peak rows; a trip that runs
    # only in the AM peak (or only in the PM peak) is not flagged. Confirm
    # whether `is_other == 0` alone was intended.
    peak_only_mask = ((df2.is_am_peak == 1) &
                      (df2.is_pm_peak == 1) &
                      (df2.is_other == 0))
    only_peak = df2[peak_only_mask].trip_id.compute()

    # Return list of trip_ids that are invalid
    return list(only_peak.unique())

In [None]:
# NOTE(review): `trip_stops_by_hour` is not defined in any cell shown in this
# notebook. An earlier cell defines `trips_by_stop_hour`, but its meta lists only
# calitp_itp_id, stop_id and trip_id (no time_of_day) -- confirm the intended input.
invalid_trips = invalid_trips_only_peak_hours(trip_stops_by_hour)