In [5]:
import os
os.environ["CALITP_BQ_MAX_BYTES"] = str(100_000_000_000)

import pandas as pd
import numpy as np
import geopandas as gpd
import fiona
import datetime as dt

#import utils

import calitp
from calitp.tables import tbl
from siuba import *

# Service dates are pinned explicitly (this replaces get_recent_dates()),
# so every run covers the same Thursday / Saturday / Sunday.
dates = dict(
    thurs=dt.date(2021, 10, 7),
    sat=dt.date(2021, 10, 9),
    sun=dt.date(2021, 10, 10),
)

# Earliest and latest of the selected service dates
min_date, max_date = min(dates.values()), max(dates.values())

In [6]:
# Temporary storage location for the intermediate files
DATA_PATH = "gs://calitp-analytics-data/data-analyses/bus_service_increase/test/"

# The combined file read below was built once by stacking the three per-day
# extracts. The build steps are kept (disabled) for provenance:
#thurs = pd.read_parquet(f"{DATA_PATH}trips_thurs.parquet")
#sat = pd.read_parquet(f"{DATA_PATH}trips_sat.parquet")
#sun = pd.read_parquet(f"{DATA_PATH}trips_sun.parquet")
#df = pd.concat([thurs, sat, sun], axis=0, ignore_index=True)
#df.to_parquet(f"{DATA_PATH}all_days_st2.parquet")

df = pd.read_parquet(DATA_PATH + "all_days_st2.parquet")

In [None]:
def get_time_calculations(df):
    """
    Parse the date / departure time columns and derive the departure hour.

    - `date` is converted with `pd.to_datetime`.
    - Non-null `departure_time` values are passed through
      `utils.fix_gtfs_time`, then parsed with `pd.to_datetime`.
    - `departure_hour` is the hour of the parsed departure time.
    - Rows whose parsed `departure_time` is NaT are dropped and the
      index is reset.

    Returns a new DataFrame; the input is not modified.

    NOTE(review): this needs `utils` in scope, but the `import utils` line
    in the notebook's first cell is currently commented out.
    """
    parsed_dates = pd.to_datetime(df.date)
    # Only non-null times are cleaned; assign() realigns on the index,
    # so rows that had no departure time stay null.
    cleaned_times = df.departure_time.dropna().apply(utils.fix_gtfs_time)
    df = df.assign(
        date = parsed_dates,
        departure_time = cleaned_times,
    )

    # Something weird comes up trying to generate a full departure datetime:
    # pd.to_datetime() fills in today's date, datetime.strptime gives year 1900.
    # Either way, the service date is available separately, and the later
    # subsetting between 5am-9pm only needs the time of day.
    parsed_times = pd.to_datetime(df.departure_time)
    df = df.assign(
        departure_time = parsed_times,
        departure_hour = parsed_times.dt.hour,
    )

    # Observations with NaT departure times are dropped here, because they
    # would create issues later when grouping by departure hour.
    has_departure = df.departure_time.notna()
    return df[has_departure].reset_index(drop=True)


def calculate_runtime_hourlytrips(df):
    """
    Derive trips per hour and mean runtime for each shape / hour / day.

    Steps:
    1. Per (trip_key, day_name): runtime_seconds is the seconds component of
       (latest - earliest departure_time); middle_stop is the median
       stop_sequence cast to int64.
    2. Keep only the row of each trip where stop_sequence == middle_stop.
    3. Per (calitp_itp_id, route_id, shape_id, departure_hour, day_name):
       mean runtime (converted to minutes) and number of trips.
    4. Collapse to (calitp_itp_id, shape_id, departure_hour, day_name),
       taking max route_id, summed trips_per_hour, mean mean_runtime_min.
    5. Drop rows with calitp_itp_id == 200.

    Prints the first rows whose mean runtime is NaN as a debugging aid.
    Returns the aggregated DataFrame from step 5.
    """
    # Run time for a trip, and the median stop (that observation is kept)
    trip_cols = ['trip_key', 'day_name']
    trips = df.groupby(trip_cols)
    df = df.assign(
        mindt = trips["departure_time"].transform("min"),
        maxdt = trips["departure_time"].transform("max"),
        middle_stop = trips["stop_sequence"].transform("median"),
    ).astype({"middle_stop": "int64"})

    runtime = (df.maxdt - df.mindt).dt.seconds
    df = df.drop(columns = ["mindt", "maxdt"]).assign(runtime_seconds = runtime)

    # Drop any trips with runtime of NaN calculated
    df = df.dropna(subset = ["runtime_seconds"]).reset_index(drop=True)

    # One row per trip: its middle stop. Used to merge on the mean runtime info.
    middle_stops = df >> filter(_.stop_sequence == _.middle_stop)

    hourly_cols = ["calitp_itp_id", "route_id", "shape_id",
                   "departure_hour", "day_name"]
    mean_runtime_sec = (middle_stops.groupby(hourly_cols)
                        ["runtime_seconds"].transform("mean"))
    middle_stops = middle_stops.assign(mean_runtime_min = mean_runtime_sec)

    # Why are there some NaNs from this, when NaNs were dropped before?
    # Some are due to no departure_time (handled upstream by dropping NaTs)
    missing_mean = middle_stops.mean_runtime_min.isna()
    debug_me = middle_stops.loc[missing_mean,
                                ["calitp_itp_id", "shape_id", "trip_key"]]
    print("Debug errors for NaN mean runtimes")
    print(debug_me.head())

    def seconds_to_minutes(seconds):
        # Round to whole seconds, divide by 60, truncate to an int
        return int(round(seconds) / 60)

    # NaN means are skipped by dropna(); assign() realigns on the index,
    # leaving them as NaN in the column.
    middle_stops = middle_stops.assign(
        mean_runtime_min = (middle_stops.mean_runtime_min.dropna()
                            .apply(seconds_to_minutes))
    )

    # Add trips per hour column
    hourly_counts = (
        middle_stops
        >> count(_.calitp_itp_id, _.route_id,
                 _.shape_id, _.departure_hour, _.day_name, sort = True)
        >> rename(trips_per_hour = "n")
    )
    shape_frequency = (
        hourly_counts
        >> inner_join(_, middle_stops,
                      on = ["calitp_itp_id", "day_name",
                            "shape_id", "departure_hour", "route_id"])
    )

    # Data is still at the trip-level here (trip_key present).
    # Drop duplicates without aggregating: trips_per_hour and mean_runtime_min
    # were already generated at the route-level, across trips in that
    # departure hour.
    shape_frequency = shape_frequency.drop_duplicates(subset = hourly_cols)

    # Multiple route_ids can share the same shape_id.
    # Where that happens, aggregate (sum / mean) within operator and shape.
    shape_cols = ["calitp_itp_id", "shape_id", "departure_hour", "day_name"]
    shape_frequency2 = (
        shape_frequency.groupby(shape_cols)
        .agg({"route_id": "max",
              "trips_per_hour": "sum",
              "mean_runtime_min": "mean"})
        .reset_index()
    )

    # Drop ITP_ID==200 to use individual operator feeds
    return shape_frequency2 >> filter(_.calitp_itp_id != 200)


def attach_funding(all_operators_df):
    """
    Attach funding columns from `tbl.views.transitstacks()` to operators.

    Selected transitstacks columns are right-joined onto `all_operators_df`
    on `calitp_itp_id`, so every row of `all_operators_df` is kept. The
    funding columns (`_5307_funds`, `_5311_funds`,
    `operating_expenses_total_2019`) are then converted from strings such as
    "$1,234" to ints; any value that is not a `str` becomes None.

    Returns the joined DataFrame.
    """
    def fix_funds(value):
        # Only exact `str` values are parsed; everything else maps to None
        if type(value) == str:
            return int(value.replace('$', '').replace(',', ''))
        return None

    # This is a small query, can leave it here.
    # NOTE(review): `_.calitp_itp_id == _.itp_id` inside select() is presumably
    # siuba's rename-on-select syntax (itp_id -> calitp_itp_id) — confirm.
    funding = (tbl.views.transitstacks()
               >> select(_.calitp_itp_id == _.itp_id, _.ntd_id,
                         _.transit_provider, _._5307_funds, _._5311_funds,
                         _.operating_expenses_total_2019)
               >> collect()
              )
    with_funding = funding >> right_join(_, all_operators_df, on = 'calitp_itp_id')

    for col in ["_5307_funds", "_5311_funds", "operating_expenses_total_2019"]:
        with_funding[col] = with_funding[col].apply(fix_funds)

    return with_funding

In [None]:
## Debug and check some of the cases
# Just look at LA Metro (calitp_itp_id 182) for now
# See if the 30 trips per hour comes up again
df = (
    pd.read_parquet(f"{DATA_PATH}all_days_st2.parquet")
    .loc[lambda trips: trips.calitp_itp_id == 182]
    .pipe(get_time_calculations)
)

In [None]:
def to_debug(df):
    """
    Debugging variant of `calculate_runtime_hourlytrips` that stops right
    after the trips-per-hour join, so trip-level rows (trip_key) stay visible.

    Steps:
    1. Per (trip_key, day_name): runtime_seconds is the seconds component of
       (latest - earliest departure_time); middle_stop is the median
       stop_sequence cast to int64.
    2. Keep only the row of each trip where stop_sequence == middle_stop.
    3. Per (calitp_itp_id, route_id, shape_id, departure_hour, day_name):
       mean runtime converted to minutes, and the number of trips
       (trips_per_hour), joined back onto the middle-stop rows.

    Unlike `calculate_runtime_hourlytrips`, no de-duplication and no
    shape-level aggregation is applied to the result.

    Returns the joined, trip-level DataFrame.
    """
    group_cols = ['trip_key', 'day_name']
    df = df.assign(
        mindt = df.groupby(group_cols)["departure_time"].transform("min"),
        maxdt = df.groupby(group_cols)["departure_time"].transform("max"),
        middle_stop = df.groupby(group_cols)["stop_sequence"].transform("median"),
    ).astype({"middle_stop": "int64"})

    df = df.assign(
        runtime_seconds = (df.maxdt - df.mindt).dt.seconds
    ).drop(columns = ["mindt", "maxdt"])

    # Still want to use this to merge on the mean runtime info
    middle_stops = df >> filter(_.stop_sequence == _.middle_stop)

    # dropna() before converting: round() raises on NaN, and the group mean
    # can be NaN for some rows. assign() realigns on the index, so those rows
    # simply keep NaN instead of crashing the whole cell.
    middle_stops = middle_stops.assign(
        mean_runtime_min = (middle_stops.groupby(["calitp_itp_id",
                                                  "route_id", "shape_id",
                                                  "departure_hour", "day_name"])
                            ["runtime_seconds"].transform("mean")
                            .dropna()
                            # whole seconds -> minutes, truncated to int
                            .apply(lambda x: int(round(x) / 60)))
    )

    # Add trips per hour column
    shape_frequency = (
        middle_stops
        >> count(_.calitp_itp_id, _.route_id,
                 _.shape_id, _.departure_hour, _.day_name, sort = True)
        >> rename(trips_per_hour = "n")
        >> inner_join(_, middle_stops,
                      on = ["calitp_itp_id", "day_name",
                            "shape_id", "departure_hour", "route_id"])
    )

    return shape_frequency

In [None]:
group_cols = ["calitp_itp_id", "shape_id", "departure_hour", "day_name", "route_id"]

# obs: running 1-based position of each row within its group
# max_obs: the largest obs in the group, repeated on every row of the group
# (assign() evaluates the callables in order, so max_obs can read obs)
df2 = to_debug(df).assign(
    obs = lambda trips: trips.groupby(group_cols)["trip_key"].cumcount() + 1,
    max_obs = lambda trips: trips.groupby(group_cols)["obs"].transform("max"),
)

In [None]:
# Largest number of rows found in any single
# (operator, shape, departure hour, day, route) group
df2.max_obs.max()

In [None]:
# Show the rows belonging to groups with 19 observations
# NOTE(review): 19 is hard-coded — presumably the value the previous cell
# returned when this was last run; confirm it is still the maximum.
df2[df2.max_obs==19]

In [None]:
# Duplicates exist because several trips share the same departure hour.
# trips_per_hour is already generated correctly for the group, and the mean
# runtime is already derived across all of these trips, so the duplicates are
# simply dropped: there is no need to sum trips_per_hour.
dedup_keys = ["calitp_itp_id", "shape_id", "departure_hour",
              "day_name", "route_id"]
df3 = df2.drop_duplicates(subset=dedup_keys)

In [None]:
## Aggregation to deal with multiple route_ids that share the same shape_id
# Inspect one shape_id after de-duplication.
# Observation from the original run: it makes 3 trips per hour
# (NOTE(review): the original note ends with "same" — presumably meaning the
# value is the same across the route_ids sharing this shape; confirm).
df3[df3.shape_id=="964395_shp"]


In [None]:
# Two specific trips to trace through the calculations
keep_trips = [
    -7711476650844868921,
    -6084928573786923571,
]

# Rows of those trips whose route_id contains "910" and whose
# departure hour is 9
silver = df2.loc[
    lambda trips: trips.route_id.str.contains("910")
    & (trips.departure_hour == 9)
    & trips.trip_key.isin(keep_trips)
]

In [None]:
# Distinct runtime_seconds values among the selected rows, with their counts
silver.runtime_seconds.value_counts()

In [None]:
# Display the selected rows in full
silver

In [None]:
# Hand-computed average of two values (3450 and 5100)
# NOTE(review): presumably the runtime_seconds of the two `silver` trips,
# used to cross-check the grouped mean — confirm against the output above.
(3450 +5100)/2

In [None]:
# Still want to use this to merge on the mean runtime info.
# `middle_stop` and `runtime_seconds` are otherwise only created on local
# copies inside to_debug() / calculate_runtime_hourlytrips(), never on the
# notebook-level `df`. They are derived here with the same logic, so this
# cell (and the ones after it) run on a fresh kernel.
trip_groups = df.groupby(["trip_key", "day_name"])
middle_stops = (
    df.assign(
        # median stop_sequence of the trip (cast to int64 below)
        middle_stop = trip_groups["stop_sequence"].transform("median"),
        # seconds component of (latest - earliest departure) for the trip
        runtime_seconds = (
            trip_groups["departure_time"].transform("max")
            - trip_groups["departure_time"].transform("min")
        ).dt.seconds,
    )
    .astype({"middle_stop": "int64"})
    >> filter(_.stop_sequence == _.middle_stop)
)

In [None]:
# Middle-stop rows for the two trips being traced
middle_stops[middle_stops.trip_key.isin(keep_trips)]

In [None]:
# Mean runtime (minutes) per operator / route / shape / departure hour / day.
# dropna() before converting: round() raises on NaN, and the group mean can be
# NaN for some rows. assign() realigns on the index, so those rows keep NaN
# instead of crashing the cell.
middle_stops = middle_stops.assign(
    mean_runtime_min = (middle_stops.groupby(["calitp_itp_id", "route_id", "shape_id",
                          "departure_hour", "day_name"])
                        ["runtime_seconds"].transform("mean")
                        .dropna()
                        # whole seconds -> minutes, truncated to int
                        .apply(lambda x: int(round(x) / 60)))
)

In [None]:
# With these 2 trips, the starting stop has departure hour = 9, but by the
# middle stop the departure hour = 10.
# Since we want to stick with the middle stop, allow a trip_key's departure
# hour here to differ from the hour of its first stop.
middle_stops[middle_stops.trip_key.isin(keep_trips)]

In [None]:
# Number of middle-stop rows (one per trip) in each
# operator / route / shape / departure hour / day group
hourly_counts = (
    middle_stops
    >> count(_.calitp_itp_id, _.route_id,
             _.shape_id, _.departure_hour, _.day_name, sort = True)
    >> rename(trips_per_hour = "n")
)

# Attach that count back onto every trip-level row of its group
shape_frequency = (
    hourly_counts
    >> inner_join(_, middle_stops,
                  on = ["calitp_itp_id", "day_name",
                        "shape_id", "departure_hour", "route_id"])
)
    

In [None]:
# Joined rows for the two trips being traced
shape_frequency[shape_frequency.trip_key.isin(keep_trips)]

In [None]:
# Number of distinct route_ids using each shape_id, per day and departure hour
multiple_shapes = (
    shape_frequency
    .groupby(["shape_id", "day_name", "departure_hour"])["route_id"]
    .nunique()
    .reset_index()
)
                  

In [None]:
# Load the shape_frequency output stored under utils.GCS_FILE_PATH
# NOTE(review): this needs `utils` in scope, but the `import utils` line in
# the notebook's first cell is currently commented out.
all_operators_shape_frequency = pd.read_parquet(f"{utils.GCS_FILE_PATH}shape_frequency.parquet")

In [None]:
# Subset to a single operator (calitp_itp_id 279)
single = all_operators_shape_frequency[all_operators_shape_frequency.calitp_itp_id==279]

In [None]:
# Rows of that operator for one shape_id, keeping only rows with no missing values
(single >> filter(_.shape_id == '964395_shp')).dropna()

In [None]:
# trip_keys to inspect; the Saturday ones are kept for reference but disabled
error_trips = [
    #thurs
    4561616254186924304,
    -3573784532994111184,
    -5890052631007121734,
    # sat
    #8542179829811914215,
    #4330243390808932084,
    #-5417826296843621461
]

# Thursday time-calculation output, restricted to those trips
debug = (
    pd.read_parquet("./data/test/timecalc_thurs.parquet")
    .loc[lambda trips: trips.trip_key.isin(error_trips)]
)
debug

In [None]:
# Reload the stored output, then show rows with a missing mean runtime whose
# operator and route both appear among the `debug` trips.
# NOTE(review): this needs `utils` in scope, but the `import utils` line in
# the notebook's first cell is currently commented out.
all_operators_shape_frequency = pd.read_parquet(f"{utils.GCS_FILE_PATH}shape_frequency.parquet")
all_operators_shape_frequency.loc[
    lambda freq: freq.calitp_itp_id.isin(debug.calitp_itp_id)
    & freq.route_id.isin(debug.route_id)
    & freq.mean_runtime_min.isna()
]

In [None]:
# Load the locally saved shape_frequency test output
my_shape_frequency = pd.read_parquet("./data/test/shape_frequency.parquet")

In [None]:
# Same check on the local output: rows with a missing mean runtime whose
# operator and route both appear among the `debug` trips
my_shape_frequency.loc[
    lambda freq: freq.calitp_itp_id.isin(debug.calitp_itp_id)
    & freq.route_id.isin(debug.route_id)
    & freq.mean_runtime_min.isna()
]

In [None]:

# Summary statistics of trips_per_hour in the local output
my_shape_frequency.trips_per_hour.describe()