# Sanity Check

In [1]:
import datetime as dt
import geopandas as gpd
import numpy as np
import pandas as pd

import utils
import shared_utils

from calitp.tables import tbl
from siuba import *



## shapes_initial

Still big differences in `shapes_initial` observations. Dig into sources of differences, since now the df is expanded to include 0's even where there is no service.

In [2]:
DATA_PATH = "./data/test/"

def check_shapes_initial(DATA_PATH):
    """Count the distinct `shape_id` values per operator in `shapes_initial`.

    `DATA_PATH` is concatenated directly with the file name
    `shapes_initial.parquet`, so it is used as a string prefix.

    Returns a DataFrame with one row per `calitp_itp_id` and a `shape_id`
    column holding the number of unique shape_ids for that operator.
    """
    shapes = gpd.read_parquet(f"{DATA_PATH}shapes_initial.parquet")

    shapes_per_operator = (
        shapes.groupby(["calitp_itp_id"])["shape_id"]
        .nunique()
        .reset_index()
    )
    return shapes_per_operator

In [3]:
# Left side (`shape_id_x`) = counts from the GCS copy; right side
# (`shape_id_y`) = counts from the local DATA_PATH rerun.
# The outer join plus indicator shows which operators exist on only one side.
m1 = check_shapes_initial(utils.GCS_FILE_PATH).merge(
    check_shapes_initial(DATA_PATH),
    how="outer",
    on="calitp_itp_id",
    validate="1:1",
    indicator=True,
)

In [4]:
m1._merge.value_counts()

both          129
right_only     18
left_only      13
Name: _merge, dtype: int64

Down from 59 operators that were in my `shapes_initial` but not Eric's: now 18 are in mine but not in Eric's, and 13 are in Eric's but not in mine.

In [5]:
m1[m1._merge=="right_only"].calitp_itp_id.unique()

array([  6,  48,  56,  61, 106, 110, 127, 170, 183, 194, 208, 238, 278,
       280, 290, 295, 346, 350])

In [6]:
m1[m1._merge=="left_only"].calitp_itp_id.unique()

array([ 35,  36,  37, 137, 142, 154, 167, 192, 235, 265, 294, 339, 361])

In [7]:
# Restrict to operators present on both sides before comparing shape counts.
in_both = m1[m1._merge=="both"]

# Label each operator by how the left count (shape_id_x) compares with the
# right count (shape_id_y). np.select evaluates the conditions in order and
# falls through to "more", matching the equal / less / else chain, but it is
# vectorized and also works when `in_both` is empty (a row-wise
# apply(axis=1) on an empty frame does not return a Series).
in_both = in_both.assign(
    category = np.select(
        [
            in_both.shape_id_x == in_both.shape_id_y,
            in_both.shape_id_x < in_both.shape_id_y,
        ],
        ["equal", "less"],
        default="more",
    )
)

In [8]:
print(f"# shape_ids in Eric's for `both`: {in_both.shape_id_x.sum()}")
print(f"# shape_ids in Tiff's for `both`: {in_both.shape_id_y.sum()}")

# These numbers are much closer, and this is reasonable

# shape_ids in Eric's for `both`: 6280.0
# shape_ids in Tiff's for `both`: 6263.0


In [9]:
print(f"# shape_ids in Tiff's for `right_only`: {m1[m1._merge=='right_only'].shape_id_y.sum()}")

# shape_ids in Tiff's for `right_only`: 2055.0


In [10]:
print(f"# shape_ids in Eric's for `left_only`: {m1[m1._merge=='left_only'].shape_id_x.sum()}")

# shape_ids in Eric's for `left_only`: 936.0


For the operators that are in common to both, the unique `shape_ids` are in the same reasonable ballpark. They will differ because `shapes.txt` only shows the most recent, so cannot extract the exact same shapes now as when Eric initially ran this.

Ideally, have a `dim_shapes` table to grab the `shape_id` for the actual date of service.

So, the 59 operators that show up in my df are contributing the other 7k observations, and that explains the difference. But, looking at that list of ITP_IDs, some of these are not in the current `agencies.yml`, so will have to remove them. Hopefully, only a handful of agencies are left.

After rerunning:
* Operators that are in my df, not in Eric's: Alhambra Community Transit, B-Line, Capitol Corridor, County Connection
* Operators in Eric's df, not in mine: La Campana, Bell Gardens, Bellflower Bus, Huntington Park Express

In [11]:
m1[m1._merge=="right_only"].calitp_itp_id.unique()

array([  6,  48,  56,  61, 106, 110, 127, 170, 183, 194, 208, 238, 278,
       280, 290, 295, 346, 350])

In [12]:
m1[m1._merge=="left_only"].calitp_itp_id.unique()

array([ 35,  36,  37, 137, 142, 154, 167, 192, 235, 265, 294, 339, 361])

Referring to `traffic_ops/prep_data` to see how to get the latest_itp_ids from `views.gtfs_schedule_dim_feeds`. This could be put together with `views.gtfs_schedule_fact_daily_feed_files`, which allows you to grab the `feed_key` for a certain date.

But, should `calitp_id_is_in_latest == True` be used from `views.gtfs_schedule_dim_feeds`? Latest is the latest version of `agencies.yml`, but not necessarily for the Oct date of analysis.

Somehow, the list of operators needs to be pared down, but it is not clear whether all 59 operators should be dropped. If those operators weren't in Eric's, they wouldn't have made it through the inner join to get the line geometry. It's possible that right now they're in mine even though not all of them should be, and that once the dataset is expanded to hold 0's for no service, it grows much larger than it should.


This shows that 54 out of the 59 are operators that don't appear in `shape_frequency`, and once we get rid of these, the operators list should be much more similar. Only 5 obs different.

In [13]:
# Operators found on only one side of m1, tagged with the version that has them.
eric = m1.loc[m1._merge == "left_only", ["calitp_itp_id"]].assign(status="eric")
tiff = m1.loc[m1._merge == "right_only", ["calitp_itp_id"]].assign(status="tiff")

# Stack both lists into one lookup table with a fresh 0..n-1 index.
debug = pd.concat([eric, tiff], axis=0, ignore_index=True)

debug.head(2)

Unnamed: 0,calitp_itp_id,status
0,35,eric
1,36,eric


In [14]:
# Pull every distinct (calitp_itp_id, calitp_id_in_latest) pair from the
# gtfs_schedule_dim_feeds view into a local DataFrame.
# The `calitp_id_in_latest` filter is left disabled, so rows with both True
# and False flags come back and an operator can appear more than once.
latest_itp_id = (tbl.views.gtfs_schedule_dim_feeds()
                 #>> filter(_.calitp_id_in_latest==True)
                 >> select(_.calitp_itp_id, _.calitp_id_in_latest)
                 >> distinct()
                 >> collect()
                )

latest_itp_id.head(2)



Unnamed: 0,calitp_itp_id,calitp_id_in_latest
0,1,False
1,2,False


In [15]:
# Attach each mismatched operator's status to its in-latest flag(s).
# m:1 because `latest_itp_id` may hold several rows per operator while
# `debug` holds one.
m2 = latest_itp_id.merge(
    debug,
    how="outer",
    on="calitp_itp_id",
    validate="m:1",
    indicator=True,
)

m2._merge.value_counts()

left_only     176
both           32
right_only      0
Name: _merge, dtype: int64

In [16]:
# Only Alhambra (id=6) had 2 different statuses
# It's possible that other differences arose because of coercing datetime,
# dropping when there were NaT or mean runtimes couldn't be calculated
m2[m2._merge=="both"].sort_values(["calitp_itp_id", "calitp_id_in_latest"])

Unnamed: 0,calitp_itp_id,calitp_id_in_latest,status,_merge
50,6,False,tiff,both
51,6,True,tiff,both
110,35,True,eric,both
111,36,True,eric,both
112,37,True,eric,both
5,48,True,tiff,both
6,56,True,tiff,both
7,61,True,tiff,both
54,106,True,tiff,both
55,110,True,tiff,both


In [17]:
m2[m2._merge=="both"].status.value_counts()

tiff    19
eric    13
Name: status, dtype: int64

## Use old `shape_id`

In [18]:
def compare_service_tract_type(DATA_PATH):
    """Summarize service by operator, day, and tract type for one data version.

    Reads `shape_frequency_funding.parquet` and `shapes_processed.parquet`
    from `DATA_PATH` (used as a string prefix), inner-joins the service rows
    to each shape's `tract_type`, and returns one row per
    (`calitp_itp_id`, `day_name`, `tract_type`) with the number of unique
    `shape_id`s and the means of `mean_runtime_min` and `trips_per_hour`.
    """
    service_cols = [
        "calitp_itp_id", "shape_id",
        "day_name", "departure_hour",
        "trips_per_hour", "mean_runtime_min",
    ]
    shape_cols = ["calitp_itp_id", "shape_id", "tract_type"]
    group_cols = ["calitp_itp_id", "day_name", "tract_type"]

    service = pd.read_parquet(f"{DATA_PATH}shape_frequency_funding.parquet")
    processed_shapes = gpd.read_parquet(f'{DATA_PATH}shapes_processed.parquet')

    # Many service rows (day/hour combinations) map onto one processed shape.
    service_tract_type = service[service_cols].merge(
        processed_shapes[shape_cols],
        on=['calitp_itp_id', 'shape_id'],
        how="inner",
        validate="m:1",
    )

    summary = (
        service_tract_type
        .groupby(group_cols)
        .agg({
            "shape_id": "nunique",
            "mean_runtime_min": "mean",
            "trips_per_hour": "mean",
        })
        .reset_index()
    )

    return summary


In [19]:
# Same summary for the two versions: the GCS copy and the local rerun.
eric = compare_service_tract_type(utils.GCS_FILE_PATH)
tiff = compare_service_tract_type(DATA_PATH)

In [20]:
# One row per operator/day/tract type on each side; the indicator flags
# combinations that exist in only one version.
m1 = eric.merge(
    tiff,
    how="outer",
    on=["calitp_itp_id", "day_name", "tract_type"],
    validate="1:1",
    indicator=True,
)

In [21]:
both = m1[m1._merge=="both"]
len(both[both.shape_id_x != both.shape_id_y])

255

In [22]:
m1[m1._merge=="left_only"].calitp_itp_id.unique()

array([ 35,  36,  37,  75,  93, 137, 142, 154, 167, 182, 192, 232, 235,
       264, 265, 282, 284, 294, 300, 301, 308, 310, 327, 339, 361, 368,
       381])

In [23]:
m1[m1._merge=="right_only"].calitp_itp_id.unique()

array([  6,  56, 106, 110, 127, 170, 183, 194, 208, 238, 247, 278, 280,
       290, 295, 346, 350])

## Create correct data with right `shape_id`

In [24]:
# Keep original shapes_initial and shapes_processed
# Take correct shape_frequency_funding edits
# Put it with original shapes_processed
# Rerun to get to the increase_by_operator parquet

def merge_routes_with_tract_type(SERVICE_PATH, SHAPES_PATH):
    """Join hourly service to tract types and derive the runtime used for analysis.

    `shape_frequency_funding.parquet` is read from `SERVICE_PATH` and
    `shapes_processed.parquet` from `SHAPES_PATH` (both used as string
    prefixes), so the two inputs may come from different runs.

    Returns the inner-joined rows whose `departure_hour` is greater than 4
    and less than 21, excluding rows missing `tract_type` or
    `min_runtime_min`, with two added columns: `min_runtime_min` and
    `runtime`.
    """
    service_cols = [
        "calitp_itp_id", "shape_id",
        "day_name", "departure_hour",
        "trips_per_hour", "mean_runtime_min",
    ]
    shape_cols = ["calitp_itp_id", "shape_id", "tract_type"]

    service = pd.read_parquet(f"{SERVICE_PATH}shape_frequency_funding.parquet")
    processed_shapes = gpd.read_parquet(f'{SHAPES_PATH}shapes_processed.parquet')

    joined = service[service_cols].merge(
        processed_shapes[shape_cols],
        on=['calitp_itp_id', 'shape_id'],
        how="inner",
        validate="m:1",
    )

    # Keep the 5am-9pm window only (also trims the frame for performance).
    in_window = (joined.departure_hour > 4) & (joined.departure_hour < 21)
    joined = joined.loc[in_window, :]

    # Smallest mean runtime seen for each operator/shape within the kept hours,
    # broadcast back onto every row of that shape.
    shortest_runtime = (
        joined.groupby(["calitp_itp_id", "shape_id"])["mean_runtime_min"]
        .transform("min")
    )
    joined = joined.assign(min_runtime_min=shortest_runtime)

    joined = joined.dropna(subset=["tract_type", "min_runtime_min"])
    joined = joined.reset_index(drop=True)

    # Runtime for analysis: the row-wise max of the two columns. The shape
    # minimum never exceeds a row's own mean, so this is the mean runtime
    # where one exists (existing service) and the shape minimum where the
    # mean is missing (new service).
    runtime = (
        joined[["mean_runtime_min", "min_runtime_min"]]
        .max(axis=1)
        .astype(int)
    )

    return joined.assign(runtime=runtime)

In [25]:
# Service frequencies come from the local rerun; processed shapes come from
# the GCS copy.
service_tract_type = merge_routes_with_tract_type(
    SERVICE_PATH=DATA_PATH,
    SHAPES_PATH=utils.GCS_FILE_PATH,
)

In [26]:
# NOTE(review): project-local module imported mid-notebook; its behavior is
# not visible from this notebook.
import setup_service_increase

# Presumably derives additional trips and service hours from the joined
# service/tract-type frame (per the function name) — TODO confirm.
service_combined = setup_service_increase.calculate_additional_trips_service_hours(
    service_tract_type)

# Written under a "_new" name in the local test folder.
service_combined.to_parquet(f"{DATA_PATH}service_increase_new.parquet")

In [27]:
def calculate_operator_capex(service_increase_df, ntd_joined,
                             bus_cost=776_941, bus_service_life=14):
    """Estimate additional buses and bus capital cost per operator and tract type.

    Parameters
    ----------
    service_increase_df : pd.DataFrame
        Needs `calitp_itp_id`, `tract_type` and `addl_service_hrs_annual`.
    ntd_joined : pd.DataFrame
        Needs `vrh_per_bus`; the median of that column is applied to every
        operator.
    bus_cost : number, default 776_941
        Assumed cost of one bus, taken from
        https://ww2.arb.ca.gov/resources/documents/transit-fleet-cost-model
    bus_service_life : number, default 14
        Assumed service life of a bus; the capex is divided by this value to
        annualize it.

    Returns
    -------
    pd.DataFrame
        Indexed by (`calitp_itp_id`, `tract_type`) with the columns
        `addl_service_hrs_annual` (summed), `additional_buses`, `bus_capex`
        and `bus_capex_annualized`.
    """
    # Total additional annual service hours per operator and tract type.
    by_operator = (service_increase_df.groupby(['calitp_itp_id', 'tract_type'])
                   [['addl_service_hrs_annual']].sum()
                  )

    median_vrh_per_bus = ntd_joined['vrh_per_bus'].median()

    # Buses needed = added hours / hours a typical (median) bus provides.
    additional_buses = by_operator.addl_service_hrs_annual / median_vrh_per_bus

    return by_operator.assign(
        additional_buses=additional_buses,
        bus_capex=additional_buses * bus_cost,
        bus_capex_annualized=additional_buses * bus_cost / bus_service_life,
    )

In [28]:
# Vehicle data is read from GCS; calculate_operator_capex only uses its
# `vrh_per_bus` column.
ntd_joined = pd.read_parquet(f"{utils.GCS_FILE_PATH}vehicles_ntd_joined.parquet")

hours_by_operator = calculate_operator_capex(service_combined, ntd_joined)
# Saved in the local test folder.
hours_by_operator.to_parquet(f'{DATA_PATH}increase_by_operator.parquet')

In [29]:
# Copy the two local outputs into the GCS `test/` folder. The "_new" suffix
# on the service-increase file is dropped on the way up; the other file keeps
# its name.
for file in ("service_increase_new", "increase_by_operator"):
    new_file = "service_increase" if file == "service_increase_new" else file

    df = pd.read_parquet(f"{DATA_PATH}{file}.parquet")
    df.to_parquet(f"{utils.GCS_FILE_PATH}test/{new_file}.parquet")

In [30]:
hours_by_operator.groupby('tract_type').sum()

Unnamed: 0_level_0,addl_service_hrs_annual,additional_buses,bus_capex,bus_capex_annualized
tract_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
rural,5652156.0,2308.70491,1793728000.0,128123400.0
suburban,1744042.0,712.37908,553476500.0,39534040.0
urban,41967830.0,17142.365627,13318610000.0,951329000.0


In [31]:
hours_by_operator.sum()

addl_service_hrs_annual    4.936403e+07
additional_buses           2.016345e+04
bus_capex                  1.566581e+10
bus_capex_annualized       1.118986e+09
dtype: float64

In [32]:
ix = pd.IndexSlice
hours_by_operator.loc[ix[279,:]]

Unnamed: 0_level_0,addl_service_hrs_annual,additional_buses,bus_capex,bus_capex_annualized
tract_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
urban,501460.266667,204.828686,159139800.0,11367130.0


## Compare hours_by_operator

In [33]:
# `eric`: the file at the GCS root; `tiff`: the file in the GCS `test/`
# folder (the upload loop earlier in this notebook writes to that path).
eric = pd.read_parquet(f"{utils.GCS_FILE_PATH}increase_by_operator.parquet")
tiff = pd.read_parquet(f"{utils.GCS_FILE_PATH}test/increase_by_operator.parquet")

In [34]:
cols = list(eric.columns)

In [35]:
# Prefix the columns by source so they stay distinguishable after the merge.
# The rename is guarded so that re-running this cell does not stack a second
# prefix (`e_e_...` / `t_t_...`), which would break the `e_<col>` / `t_<col>`
# lookups further down.
eric = eric.rename(
    columns=lambda c: c if str(c).startswith("e_") else f"e_{c}"
)
tiff = tiff.rename(
    columns=lambda c: c if str(c).startswith("t_") else f"t_{c}"
)

In [36]:
# Align the two result sets on their shared index; the indicator marks rows
# that exist in only one of them.
m2 = eric.merge(
    tiff,
    how="outer",
    left_index=True,
    right_index=True,
    validate="1:1",
    indicator=True,
)
m2._merge.value_counts()

both          230
left_only      27
right_only      1
Name: _merge, dtype: int64

In [37]:
# Add `diff_<col>` = Eric's value minus Tiff's value for every compared column.
# Rows present on only one side of the outer merge get NaN here.
# Keyed assign overwrites an existing `diff_<col>`, so re-running this cell
# cannot leave duplicate column names behind.
m2 = m2.assign(**{
    f"diff_{c}": m2[f"e_{c}"] - m2[f"t_{c}"]
    for c in cols
})

In [38]:
# For every compared column, report the row count, the number of rows whose
# difference is non-zero, and the summed difference: first for all rows, then
# for rows present on both sides only.
# Rows on one side only have NaN differences: NaN != 0 is True, so they count
# as "with differences", while .sum() skips them.
for c in cols:
    difference_col = f"diff_{c}"
    both = m2[m2._merge=='both']

    print(f"***************{c}*******************")
    for label, subset in (("Overall", m2), ("Both Only", both)):
        n_different = int((subset[difference_col] != 0).sum())

        print(label)
        print(f"# obs: {len(subset)}")
        print(f"# obs with differences: {n_different}")
        print(subset[difference_col].sum())

***************addl_service_hrs_annual*******************
Overall
# obs: 258
# obs with differences: 106
2270729.9333333317
Both Only
# obs: 230
# obs with differences: 78
2270729.9333333317
***************additional_buses*******************
Overall
# obs: 258
# obs with differences: 106
927.5124243462368
Both Only
# obs: 230
# obs with differences: 78
927.5124243462368
***************bus_capex*******************
Overall
# obs: 258
# obs with differences: 106
720622430.4839896
Both Only
# obs: 230
# obs with differences: 78
720622430.4839896
***************bus_capex_annualized*******************
Overall
# obs: 258
# obs with differences: 106
51473030.74885637
Both Only
# obs: 230
# obs with differences: 78
51473030.748856366
