# Sanity Check

In [1]:
import datetime as dt
import geopandas as gpd
import numpy as np
import pandas as pd

import utils
import shared_utils

from calitp.tables import tbl
from siuba import *



## shapes_initial

Still big differences in `shapes_initial` observations. Dig into sources of differences, since now the df is expanded to include 0's even where there is no service.

In [2]:
DATA_PATH = "./data/test/"


def check_shapes_initial(DATA_PATH):
    """Count unique shape_ids per operator in a `shapes_initial.parquet` file.

    Parameters
    ----------
    DATA_PATH : str
        Directory prefix holding `shapes_initial.parquet`. It is concatenated
        directly with the file name, so it must end with a path separator.

    Returns
    -------
    pandas.DataFrame
        One row per `calitp_itp_id`, with `shape_id` holding the number of
        distinct shape_ids found for that operator.
    """
    shapes = gpd.read_parquet(f"{DATA_PATH}shapes_initial.parquet")

    # Number of distinct shape_ids for each operator
    shapes_per_operator = (
        shapes.groupby("calitp_itp_id")["shape_id"]
        .nunique()
        .reset_index()
    )
    return shapes_per_operator

In [3]:
# Left  (`shape_id_x`): counts from the GCS copy (Eric's run).
# Right (`shape_id_y`): counts from the local test copy (Tiff's run).
# Outer merge keeps operators that appear in only one of the two.
m1 = check_shapes_initial(utils.GCS_FILE_PATH).merge(
    check_shapes_initial(DATA_PATH),
    how="outer",
    on="calitp_itp_id",
    validate="1:1",
    indicator=True,
)

In [4]:
# How many operators are in both runs vs. only one of them
m1["_merge"].value_counts()

both          129
right_only     18
left_only      13
Name: _merge, dtype: int64

From 59 operators that were in my `shapes_initial` but not Eric's, we are down to 18 not in Eric's, and Eric's has 13 that are not in mine.

In [5]:
# Operators present only in Tiff's (right-hand) df
m1.loc[m1["_merge"] == "right_only", "calitp_itp_id"].unique()

array([  6,  48,  56,  61, 106, 110, 127, 170, 183, 194, 208, 238, 278,
       280, 290, 295, 346, 350])

In [6]:
# Operators present only in Eric's (left-hand) df
m1.loc[m1["_merge"] == "left_only", "calitp_itp_id"].unique()

array([ 35,  36,  37, 137, 142, 154, 167, 192, 235, 265, 294, 339, 361])

In [7]:
# Keep operators found in both runs and label how Eric's unique shape_id
# count (`shape_id_x`) compares with Tiff's (`shape_id_y`).
# np.select evaluates the conditions in order on whole columns, so it gives
# the same labels as a row-wise apply without the per-row Python overhead,
# and it still returns a plain array when no operators are in common.
in_both = m1[m1._merge == "both"].assign(
    category=lambda df: np.select(
        [
            df.shape_id_x == df.shape_id_y,  # same number of shapes
            df.shape_id_x < df.shape_id_y,   # Eric's has fewer than Tiff's
        ],
        ["equal", "less"],
        default="more",                      # Eric's has more than Tiff's
    )
)

In [8]:
# Total unique shape_ids across operators present in both runs
# (`shape_id_x` = Eric's count, `shape_id_y` = Tiff's count).
print(f"# shape_ids in Eric's for `both`: {in_both['shape_id_x'].sum()}")
print(f"# shape_ids in Tiff's for `both`: {in_both['shape_id_y'].sum()}")

# The two totals are now much closer, which is reasonable.

# shape_ids in Eric's for `both`: 6280.0
# shape_ids in Tiff's for `both`: 6263.0


In [9]:
# Unique shape_ids contributed by operators found only in Tiff's df
print(f"# shape_ids in Tiff's for `right_only`: {m1.loc[m1['_merge'] == 'right_only', 'shape_id_y'].sum()}")

# shape_ids in Tiff's for `right_only`: 2055.0


In [11]:
# Unique shape_ids contributed by operators found only in Eric's df
print(f"# shape_ids in Eric's for `left_only`: {m1.loc[m1['_merge'] == 'left_only', 'shape_id_x'].sum()}")

# shape_ids in Eric's for `left_only`: 936.0


For the operators that are in common to both, the unique `shape_ids` are in the same reasonable ballpark. They will differ because `shapes.txt` only shows the most recent, so cannot extract the exact same shapes now as when Eric initially ran this.

Ideally, have a `dim_shapes` table to grab the `shape_id` for the actual date of service.

So, the 59 operators that show up in my df are contributing the other 7k observations, and that explains the difference. But, looking at that list of ITP_IDs, some of these are not in the current `agencies.yml`, so will have to remove them. Hopefully, only a handful of agencies are left.

After rerunning:
* Operators that are in my df, not in Eric's: Alhambra Community Transit, B-Line, Capitol Corridor, County Connection
* Operators in Eric's df, not in mine: La Campana, Bell Gardens, Bellflower Bus, Huntington Park Express

In [12]:
# After rerunning: operators present only in Tiff's (right-hand) df
m1.loc[m1["_merge"] == "right_only", "calitp_itp_id"].unique()

array([  6,  48,  56,  61, 106, 110, 127, 170, 183, 194, 208, 238, 278,
       280, 290, 295, 346, 350])

In [13]:
# After rerunning: operators present only in Eric's (left-hand) df
m1.loc[m1["_merge"] == "left_only", "calitp_itp_id"].unique()

array([ 35,  36,  37, 137, 142, 154, 167, 192, 235, 265, 294, 339, 361])

Referring to `traffic_ops/prep_data` to see how to get the latest_itp_ids from `views.gtfs_schedule_dim_feeds`. This could be put together with `views.gtfs_schedule_fact_daily_feed_files`, which allows you to grab the `feed_key` for a certain date.

But, should `calitp_id_in_latest == True` be used from `views.gtfs_schedule_dim_feeds`? "Latest" refers to the latest version of `agencies.yml`, which is not necessarily the version that applied on the Oct date of analysis.

Somehow, the list of operators needs to be pared down? But I'm not sure whether all 59 operators should be dropped. If those operators weren't in Eric's, they wouldn't have made it through the inner join to get the line geometry. But, it's possible that right now, they're in mine, but not all should be, and after the dataset gets expanded to hold 0's for no service, it expands the dataset to be much larger than it should be?


This shows that 54 out of the 59 are operators that don't appear in `shape_frequency`, and once we get rid of these, the operators list should be much more similar. Only 5 obs different.

In [29]:
# Tag each mismatched operator with the df it appears in, then stack the
# two lists into one lookup table.
eric = m1.loc[m1["_merge"] == "left_only", ["calitp_itp_id"]].assign(status="eric")
tiff = m1.loc[m1["_merge"] == "right_only", ["calitp_itp_id"]].assign(status="tiff")

debug = pd.concat([eric, tiff], axis=0, ignore_index=True)

debug.head(2)

Unnamed: 0,calitp_itp_id,status
0,35,eric
1,36,eric


In [28]:
# Pull the distinct (calitp_itp_id, calitp_id_in_latest) pairs from
# views.gtfs_schedule_dim_feeds.
# The filter on `calitp_id_in_latest` is intentionally left disabled so that
# both True and False rows come back and can be compared against the
# mismatched operators; an ID can therefore appear on more than one row.
latest_itp_id = (tbl.views.gtfs_schedule_dim_feeds()
                 #>> filter(_.calitp_id_in_latest==True)
                 >> select(_.calitp_itp_id, _.calitp_id_in_latest)
                 >> distinct()
                 >> collect()
                )

latest_itp_id.head(2)

Unnamed: 0,calitp_itp_id,calitp_id_in_latest
0,1,False
1,2,False


In [32]:
# Attach the `calitp_id_in_latest` flag(s) to the mismatched operators.
# `latest_itp_id` may hold several rows per ID while `debug` must hold
# exactly one, hence validate="m:1".
m2 = latest_itp_id.merge(
    debug,
    how="outer",
    on="calitp_itp_id",
    validate="m:1",
    indicator=True,
)

m2["_merge"].value_counts()

left_only     176
both           32
right_only      0
Name: _merge, dtype: int64

In [35]:
# Only Alhambra (id=6) shows up with two different `calitp_id_in_latest` values.
# Other differences may have arisen from coercing datetimes and dropping rows
# with NaT, or rows where mean runtimes couldn't be calculated.
m2.loc[m2["_merge"] == "both"].sort_values(
    by=["calitp_itp_id", "calitp_id_in_latest"]
)

Unnamed: 0,calitp_itp_id,calitp_id_in_latest,status,_merge
50,6,False,tiff,both
51,6,True,tiff,both
110,35,True,eric,both
111,36,True,eric,both
112,37,True,eric,both
5,48,True,tiff,both
6,56,True,tiff,both
7,61,True,tiff,both
54,106,True,tiff,both
55,110,True,tiff,both


In [36]:
# Counts matched *rows* per source, not unique operators: id=6 has both a
# True and a False `calitp_id_in_latest` row, so it is counted twice under "tiff".
m2.loc[m2["_merge"] == "both", "status"].value_counts()

tiff    19
eric    13
Name: status, dtype: int64