In [1]:
import datetime as dt
import geopandas as gpd
import pandas as pd

#import create_calenviroscreen_lehd_data
import utils
import shared_utils

from siuba import *



## Compare `shape_frequency`

Break down each step and figure out why going into `shape_frequency_funding` the observations differ so much.

In [2]:
DATA_PATH = "./data/test/"

In [3]:
def compare_shape_frequency(DATA_PATH):
    df = pd.read_parquet(f"{DATA_PATH}shape_frequency.parquet")
    print(f"# obs: {len(df)}")
    
    group_cols = ["calitp_itp_id", "day_name"]
    df2 = df.groupby(group_cols).agg({"shape_id": "nunique"}).reset_index()
    print(f"# unique id-shape_id-day-name: {len(df2)}")
    
    group_cols = ["calitp_itp_id", "day_name"]
    df3 = df.groupby(group_cols).agg({"route_id": "nunique"}).reset_index()
    print(f"# unique id-route-id-day-name: {len(df2)}")

    check_ids = [182, 4, 279]
    print(f"check specific ids: {check_ids}")
    print("Unique shape_id-day_name")
    
    display(df2[df2.calitp_itp_id.isin(check_ids)])
    print("Unique route_id-day_name")
    display(df3[df3.calitp_itp_id.isin(check_ids)])
    
    return df2, df3

In [4]:
tiff_shape, tiff_route = compare_shape_frequency("./data/test/")

# obs: 95933
# unique id-shape_id-day-name: 363
# unique id-route-id-day-name: 363
check specific ids: [182, 4, 279]
Unique shape_id-day_name


Unnamed: 0,calitp_itp_id,day_name,shape_id
0,4,Saturday,150
1,4,Sunday,150
2,4,Thursday,323
171,182,Saturday,590
172,182,Sunday,590
173,182,Thursday,683
258,279,Saturday,21
259,279,Sunday,13
260,279,Thursday,24


Unique route_id-day_name


Unnamed: 0,calitp_itp_id,day_name,route_id
0,4,Saturday,59
1,4,Sunday,59
2,4,Thursday,128
171,182,Saturday,113
172,182,Sunday,111
173,182,Thursday,112
258,279,Saturday,10
259,279,Sunday,6
260,279,Thursday,10


In [5]:
eric_shape, eric_route = compare_shape_frequency(f"{utils.GCS_FILE_PATH}")

# obs: 423216
# unique id-shape_id-day-name: 441
# unique id-route-id-day-name: 441
check specific ids: [182, 4, 279]
Unique shape_id-day_name


Unnamed: 0,calitp_itp_id,day_name,shape_id
0,4,Saturday,330
1,4,Sunday,330
2,4,Thursday,330
207,182,Saturday,734
208,182,Sunday,734
209,182,Thursday,734
327,279,Saturday,28
328,279,Sunday,28
329,279,Thursday,28


Unique route_id-day_name


Unnamed: 0,calitp_itp_id,day_name,route_id
0,4,Saturday,128
1,4,Sunday,128
2,4,Thursday,128
207,182,Saturday,118
208,182,Sunday,118
209,182,Thursday,118
327,279,Saturday,10
328,279,Sunday,10
329,279,Thursday,10


Interestingly, Eric pulls same number of route_ids for each day_name, but mine fluctuates.

Check which line in the code does that...must be something like `groupby.agg` that includes a `day_name` in there and counts or finds nunique `route_id` or a `drop_duplicates` that gives this.

In [6]:
tiff_shape = pd.read_parquet(f"{DATA_PATH}shape_frequency.parquet")
eric_shape = pd.read_parquet(f"{utils.GCS_FILE_PATH}shape_frequency.parquet")

In [7]:
len(tiff_shape)

95933

In [8]:
tiff_shape[tiff_shape.mean_runtime_min.notna()].shape

(95933, 7)

In [9]:
len(eric_shape)

423216

In [10]:
eric_shape[eric_shape.mean_runtime_min.notna()].shape

(94453, 7)

### Go up to time calculations.

Look at LA Metro on Thurs.

In [11]:
# Run the query from warehouse_queries to get just LA Metro on Thurs
DATA_PATH = "./data/test/"

eric = pd.read_parquet(f"{DATA_PATH}eric_trips_joined_thurs.parquet")
tiff = pd.read_parquet(f"{DATA_PATH}trips_thurs.parquet")
tiff = tiff[tiff.calitp_itp_id==182].reset_index(drop=True)

In [12]:
# Check if trip keys are the same...they really should be
# --> So we have the same trip_keys

trip_merge = pd.merge(eric[["trip_key"]].drop_duplicates(),
                      tiff[["trip_key"]].drop_duplicates(),
                      on = "trip_key",
                      how = "outer",
                      validate = "1:1",
                      indicator=True)

trip_merge._merge.value_counts()

both          14118
left_only         0
right_only        0
Name: _merge, dtype: int64

In [13]:
def get_time_calculations(df):
    ## time calculations
    df = df.assign(
        date = pd.to_datetime(df.date),
        departure_time = df.departure_time.dropna().apply(utils.fix_gtfs_time),
    )

    # Something weird comes up trying to generate departure_dt
    # pd.to_datetime() gives today's date
    # datetime.strptime gives year 1900
    # Either way, we have the service date, and later subsetting between 5am-9pm will address this
    df = df.assign(
        departure_time = pd.to_datetime(df.departure_time),
        departure_hour = pd.to_datetime(df.departure_time).dt.hour,
    )
    
    # Any observation with NaTs for departure time get dropped
    # Will create issue later when grouping is done with departure hour
    df = df[df.departure_time.notna()].reset_index(drop=True)
    
    return df


In [14]:
eric2 = get_time_calculations(eric)
tiff2 = get_time_calculations(tiff)

In [15]:
from shared_utils import geography_utils

# Aggregate by departure hour, these results should also be the same
eric_hour = geography_utils.aggregate_by_geography(eric2,
                                       group_cols = ["departure_hour"],
                                       nunique_cols = ["route_id", "trip_key"]
                                      )

tiff_hour = geography_utils.aggregate_by_geography(tiff2,
                                       group_cols = ["departure_hour"],
                                       nunique_cols = ["route_id", "trip_key"]
                                      )

In [16]:
hour_merge = pd.merge(eric_hour,
                      tiff_hour,
                      on = "departure_hour",
                      how = "outer",
                      validate = "1:1",
                      indicator=True
                     )

hour_merge._merge.value_counts()

both          24
left_only      0
right_only     0
Name: _merge, dtype: int64

In [17]:
# These assert statements pass, as they should
assert (hour_merge.route_id_x == hour_merge.route_id_y).all()

In [18]:
assert (hour_merge.trip_key_x == hour_merge.trip_key_y).all()

### Check calculation of runtimes 

Probably somewhere in this code is where the datasets differ

Break out each step and double check

In [19]:
## Eric's code

# Don't think swapping this out will have affected anything
# But just leave this intermediate step in anyway,
# just to get the find_runtime function to run, then double check results

def beginning_time_cleaning(df):
    df = df.dropna(subset=['departure_time'])
    # _st_trips_joined = st_trips_joined
    df.departure_time = df.departure_time.apply(utils.fix_gtfs_time)
    df['departure_dt'] = (df['departure_time']
                                 .apply(lambda x:
                                        dt.datetime.strptime(x, '%H:%M:%S'))
                                      )
    df['departure_hour'] = df['departure_dt'].apply(lambda x: x.hour)
    
    return df

eric_time = beginning_time_cleaning(eric)

In [20]:
def find_runtime(df):
    mindt = df[df.stop_sequence == df.stop_sequence.min()].departure_dt.iloc[0]
    maxdt = df[df.stop_sequence == df.stop_sequence.max()].departure_dt.iloc[0]
    td = (maxdt - mindt)
    df['runtime_seconds'] = td.seconds
    return df
    
# try:
st_with_runtimes = eric_time.groupby(['trip_key', 'day_name']).apply(find_runtime)
eric3 = st_with_runtimes >> select(_.trip_key, _.day_name, _.runtime_seconds)

In [21]:
## Tiff's code

def calculate_runtime_hourlytrips_part1(df):
    # Calculate run time for a trip
    # Find the median stop (keep that observation)
    group_cols = ['trip_key', 'day_name']
    df = df.assign(
        mindt = df.groupby(group_cols)["departure_time"].transform("min"),
        maxdt = df.groupby(group_cols)["departure_time"].transform("max"),
        middle_stop = df.groupby(["trip_key", "day_name"])["stop_sequence"].transform("median"),
    ).astype({"middle_stop": "int64"})

    df = df.assign(
        runtime_seconds = (df.maxdt - df.mindt).dt.seconds
    ).drop(columns = ["mindt", "maxdt"])
    
    # Drop any trips with runtime of NaN calculated
    df = df[df.runtime_seconds.notna()].reset_index(drop=True)
    
    return df

In [22]:
tiff3 = calculate_runtime_hourlytrips_part1(tiff2)
tiff3  = tiff3 >> select(_.trip_key, _.day_name, _.runtime_seconds) 

In [23]:
runtime_merge = pd.merge(eric3.drop_duplicates(), 
                         tiff3.drop_duplicates(), 
                         on = ["trip_key", "day_name"],
                         how = "outer",
                         validate = "1:1",
                         indicator=True
                        )

runtime_merge._merge.value_counts()

both          14118
left_only         0
right_only        0
Name: _merge, dtype: int64

In [24]:
# This assert statement does not pass
# This would be due to the fact that we have different approaches
# Dig into this
assert (runtime_merge.runtime_seconds_x == runtime_merge.runtime_seconds_y).all()

AssertionError: 

In [25]:
runtime_debug = runtime_merge[runtime_merge.runtime_seconds_x != 
                              runtime_merge.runtime_seconds_y
                             ]

print(f"# obs that differ: {len(runtime_debug)}")
print(f"% obs that differ: {len(runtime_debug) / len(runtime_merge)}")

# obs that differ: 166
% obs that differ: 0.011758039382348774


In [26]:
# 1.2% of the observations differ...that's not a big deal at all
# BUT, why are my runtime_seconds SO HUGE
# Pull out these trip_keys

In [27]:
runtime_debug.head()

Unnamed: 0,trip_key,day_name,runtime_seconds_x,runtime_seconds_y,_merge
28,-5414324691387759528,Thursday,2340,86220,both
334,3410421768507213757,Thursday,480,86340,both
656,450205548115823018,Thursday,1560,86160,both
870,-927572044940070085,Thursday,2580,86340,both
886,-1095567724896778272,Thursday,900,85980,both


In [28]:
# Ahh, this has to do with crossing over the midnight hour
# So, it's not correctly deriving the time
tiff2[tiff2.trip_key.isin(runtime_debug.trip_key)].departure_hour.value_counts()

23    5895
0     4844
22     221
1      117
Name: departure_hour, dtype: int64

In [29]:
keep_trip = [-5414324691387759528]
one_trip = tiff2[tiff2.trip_key.isin(keep_trip)]

one_trip[(one_trip.stop_sequence==one_trip.stop_sequence.max()) |
         (one_trip.stop_sequence==one_trip.stop_sequence.min())
        ]

Unnamed: 0,calitp_itp_id,date,trip_key,trip_id,is_in_service,day_name,stop_sequence,departure_time,stop_id,shape_id,route_id,departure_hour
596732,182,2021-10-07,-5414324691387759528,53917413-SEPT21-D07CAR-1_Weekday,True,Thursday,54,2022-02-03 00:02:00,16770,6020010_SEPT21,602-13149,0
605400,182,2021-10-07,-5414324691387759528,53917413-SEPT21-D07CAR-1_Weekday,True,Thursday,1,2022-02-03 23:23:00,6428,6020010_SEPT21,602-13149,23


In [30]:
def find_runtime_modified(df):
    mindt = df[df.stop_sequence == df.stop_sequence.min()].departure_time.iloc[0]
    maxdt = df[df.stop_sequence == df.stop_sequence.max()].departure_time.iloc[0]
    td = (maxdt - mindt)
    df['runtime_seconds'] = td.seconds
    return df
    

In [31]:
# Go back to this method, because this correctly deals with crossing midnight
one_trip2 = find_runtime_modified(one_trip)
one_trip2[(one_trip2.stop_sequence==one_trip2.stop_sequence.max()) |
         (one_trip2.stop_sequence==one_trip2.stop_sequence.min())
        ]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,calitp_itp_id,date,trip_key,trip_id,is_in_service,day_name,stop_sequence,departure_time,stop_id,shape_id,route_id,departure_hour,runtime_seconds
596732,182,2021-10-07,-5414324691387759528,53917413-SEPT21-D07CAR-1_Weekday,True,Thursday,54,2022-02-03 00:02:00,16770,6020010_SEPT21,602-13149,0,2340
605400,182,2021-10-07,-5414324691387759528,53917413-SEPT21-D07CAR-1_Weekday,True,Thursday,1,2022-02-03 23:23:00,6428,6020010_SEPT21,602-13149,23,2340


In [32]:
def calculate_runtime_hourlytrips_part1_modified(df):
    # Calculate run time for a trip
    # Find the median stop (keep that observation)
    
    def find_runtime(df):
        mindt = df[df.stop_sequence == df.stop_sequence.min()].departure_time.iloc[0]
        maxdt = df[df.stop_sequence == df.stop_sequence.max()].departure_time.iloc[0]
        td = (maxdt - mindt)
        df['runtime_seconds'] = td.seconds
        return df

    group_cols = ['trip_key', 'day_name']
    df = df.groupby(group_cols).apply(find_runtime)

    df = df.assign(
        middle_stop = df.groupby(group_cols)["stop_sequence"].transform("median"),
    ).astype({"middle_stop": "int64"})
    
    return df

In [33]:
tiff3 = calculate_runtime_hourlytrips_part1_modified(tiff2)
tiff3  = tiff3 >> select(_.trip_key, _.day_name, _.runtime_seconds) 

In [34]:
runtime_merge = pd.merge(eric3.drop_duplicates(), 
                         tiff3.drop_duplicates(), 
                         on = ["trip_key", "day_name"],
                         how = "outer",
                         validate = "1:1",
                         indicator=True
                        )

runtime_merge._merge.value_counts()

both          14118
left_only         0
right_only        0
Name: _merge, dtype: int64

In [35]:
# Now the assert passes
assert (runtime_merge.runtime_seconds_x == runtime_merge.runtime_seconds_y).all()

### Go onto middle stops

Make sure the `mean(runtime)` is the same

In [36]:
## Tiffany's code
def calculate_runtime_hourlytrips_part2(df):
    
    # Incorporate part 1 from above
    
    # Calculate run time for a trip
    # Find the median stop (keep that observation)
    
    def find_runtime(df):
        
        mindt = df[df.stop_sequence == df.stop_sequence.min()].departure_time.iloc[0]
        maxdt = df[df.stop_sequence == df.stop_sequence.max()].departure_time.iloc[0]
        td = (maxdt - mindt)
        df['runtime_seconds'] = td.seconds
        return df

    group_cols = ['trip_key', 'day_name']
    df = df.groupby(group_cols).apply(find_runtime)

    df = df.assign(
        middle_stop = df.groupby(group_cols)["stop_sequence"].transform("median"),
    ).astype({"middle_stop": "int64"})
    
    df = df[df.runtime_seconds.notna()].reset_index(drop=True)

    # Still want to use this to merge on the mean runtime info
    middle_stops = df >> filter(_.stop_sequence == _.middle_stop)

    middle_stops = middle_stops.assign(
        mean_runtime_min = (middle_stops.groupby(["calitp_itp_id", 
                                                  "route_id", "shape_id", 
                                                  "departure_hour", "day_name"])
                            ["runtime_seconds"].transform("mean")
                           )
    )
    
    return middle_stops    

In [37]:
tiff4 = calculate_runtime_hourlytrips_part2(tiff)
tiff4.head()

TypeError: unsupported operand type(s) for -: 'str' and 'str'

In [None]:
## Eric's code

def eric_part2(st_trips_joined):
    st_trips_joined = st_trips_joined.dropna(subset=['departure_time'])
    # _st_trips_joined = st_trips_joined
    st_trips_joined.departure_time = st_trips_joined.departure_time.apply(utils.fix_gtfs_time)
    st_trips_joined['departure_dt'] = (st_trips_joined['departure_time']
                                 .apply(lambda x:
                                        dt.datetime.strptime(x, '%H:%M:%S'))
                                      )
    st_trips_joined['departure_hour'] = st_trips_joined['departure_dt'].apply(lambda x: x.hour)
    
    ## calculate runtimes for each trip, if possible
    def find_runtime(df):
        mindt = df[df.stop_sequence == df.stop_sequence.min()].departure_dt.iloc[0]
        maxdt = df[df.stop_sequence == df.stop_sequence.max()].departure_dt.iloc[0]
        td = (maxdt - mindt)
        df['runtime_seconds'] = td.seconds
        return df
    
    st_with_runtimes = st_trips_joined.groupby(['trip_key', 'day_name']).apply(find_runtime)
    st_with_runtimes = st_with_runtimes >> select(_.trip_key, _.day_name, _.runtime_seconds)
    
    
    ## find middle stop for each trip to calculate frequencies
    middle_stops = (st_trips_joined
                    >> group_by(_.calitp_itp_id, _.shape_id)
                    >> summarize(middle_stop = _.stop_sequence.median())
                   )


    middle_stops.middle_stop = middle_stops.middle_stop.astype('int64')
    
    
    ## this logic here is confusing, with middle_st and middle_st_runtimes both having inner joins with st_trips_joined
    # When this is abstracted to all operators, creates more issues
    # probably need more groupby columns
    # error is because cal_itp_id_x, date_x, etc exist, so can't call the columns directly
    # probably works fine for individual operator, 
    # but across all, need to throw more columns in the on=[] for the join and in groupby?
    
    # try:
    middle_st = (middle_stops
                 >> select(_.stop_sequence == _.middle_stop, _.shape_id)
                 >> inner_join(_, st_trips_joined, on=['shape_id', 'stop_sequence'])
                )
    
    
    ## if multiple trips within the hour, calculate mean runtime
    middle_st_runtimes = (middle_stops
         >> inner_join(_, st_trips_joined, on=['trip_key', 'day_name'
                                              ])
         >> group_by(_.calitp_itp_id, _.route_id, _.shape_id, 
                     _.departure_hour, _.day_name)
         >> summarize(mean_runtime_sec = _.runtime_seconds.mean())
         )

    middle_st_runtimes['mean_runtime_min'] = (middle_st_runtimes.mean_runtime_sec
                                              .apply(lambda x: int(round(x) / 60))
                                             )
    
    return middle_st_runtimes
    

In [None]:
eric4 = eric_part2(eric)

In [None]:
eric4.head()

In [None]:
# Modify this as you find errors

def calculate_runtime_hourlytrips_NEW(df):
    # Calculate run time for a trip
    # Find the median stop (keep that observation)
    
    def find_runtime(df):
        mindt = df[df.stop_sequence == df.stop_sequence.min()].departure_time.iloc[0]
        maxdt = df[df.stop_sequence == df.stop_sequence.max()].departure_time.iloc[0]
        td = (maxdt - mindt)
        df['runtime_seconds'] = td.seconds
        return df

    group_cols = ['trip_key', 'day_name']
    df = df.groupby(group_cols).apply(find_runtime)

    df = df.assign(
        middle_stop = df.groupby(group_cols)["stop_sequence"].transform("median"),
    ).astype({"middle_stop": "int64"})
    
    ## Left off here
    
    
    # Drop any trips with runtime of NaN calculated
    df = df[df.runtime_seconds.notna()].reset_index(drop=True)

    # Still want to use this to merge on the mean runtime info
    middle_stops = df >> filter(_.stop_sequence == _.middle_stop)
    
    middle_stops = middle_stops.assign(
        mean_runtime_min = (middle_stops.groupby(["calitp_itp_id", 
                                                  "route_id", "shape_id", 
                                                  "departure_hour", "day_name"])
                            ["runtime_seconds"].transform("mean")
                           )
    )
    
    debug_me = middle_stops[middle_stops.mean_runtime_min.isna()][
        ["calitp_itp_id", "shape_id", "trip_key"]]
    print("Debug errors for NaN mean runtimes")
    print(debug_me.head())
    # Why are there some NaNs from this, when NaNs were dropped before?
    # Some are due to no departure_time (handle it above by dropping NaTs)
    
    middle_stops = middle_stops.assign(
        mean_runtime_min = (middle_stops.mean_runtime_min.dropna()     
                            .apply(lambda x: int(round(x) / 60))
                           )
    )   
    
    # Add trips per hour column
    shape_frequency = (
        middle_stops
        >> count(_.calitp_itp_id, _.route_id,
                 _.shape_id, _.departure_hour, _.day_name, sort = True)
        >> rename(trips_per_hour = "n")
        >> inner_join(_, middle_stops, 
                      on = ["calitp_itp_id", "day_name", 
                            "shape_id", "departure_hour", "route_id"])
    )
    
    # Now, data is at the trip-level (trip_key) still present
    # Drop duplicates, but no aggregation because trips_per_hour and mean_runtime 
    # are already correctly generated at the route-level, across trips in that departure hour
    shape_frequency = shape_frequency.drop_duplicates(subset=[
        "calitp_itp_id", "shape_id", "departure_hour",
        "day_name", "route_id"])
    
    # There's an aggregation to deal with multiple route_ids that share same shape_id
    # If there are still multiple route_ids, then aggregate and sum / mean
    # Modify this to include itp_id into the groupby
    shape_frequency2 = (shape_frequency.groupby(
        ["calitp_itp_id", "shape_id", "departure_hour", "day_name"])
                        .agg({"route_id": "max", 
                              "trips_per_hour": "sum", 
                              "mean_runtime_min": "mean"
                             }).reset_index()
                       )
    
    # Now, drop ITP_ID==200 to use individual operator feeds
    shape_frequency3 = shape_frequency2 >> filter(_.calitp_itp_id != 200)
    
    return shape_frequency3


def attach_funding(all_operators_df):
    # This is a small query, can leave it here
    with_funding = (tbl.views.transitstacks()
                    >> select(_.calitp_itp_id == _.itp_id, _.ntd_id, 
                              _.transit_provider, _._5307_funds, _._5311_funds,
                              _.operating_expenses_total_2019)
                    >> collect()
                    >> right_join(_, all_operators_df, on = 'calitp_itp_id')
                   )
    
    def fix_funds(value):
        if type(value) != str:
            return None
        else:
            return int(value.replace('$', '').replace(',', ''))
        
    funding_cols = ["_5307_funds", "_5311_funds", "operating_expenses_total_2019"] 
    for c in funding_cols:
        with_funding[c] = with_funding[c].apply(fix_funds)
    
    return with_funding
