# Double check A1 functions 

Dataset produced at end of A1 goes into A2.

The values won't exactly line up, since dates are slightly different, and the cleaning is different.

But, going into A2, will the tract categorization work as intended?

In [1]:
import datetime as dt
import geopandas as gpd
import intake
import numpy as np
import pandas as pd

from calitp.tables import tbl
from siuba import *

import test_create_calenviroscreen_lehd_data
#import shared_utils
import utils

catalog = intake.open_catalog("./catalog.yml")



## Compare `shapes_processed`

We are moving to `geography_utils` to create transit routes, but `shapes.txt` always reflects the most recent version of the file.

What should have been a simple swap isn't, and even rerunning the A2 notebook will result in a different dataset.

In [None]:
def merge_and_filter(DATA_PATH, keep_ids=None):
    """
    Report how well `shape_frequency_funding` lines up with `shapes_processed`.

    Reads both parquet files from `DATA_PATH`, outer-merges the service rows
    with the tract categorization of each shape, and prints the merge
    indicator counts followed by row counts and unique `shape_id` counts
    for a subset of operators.

    Parameters
    ----------
    DATA_PATH : str
        Path prefix the parquet file names are appended to directly, so it
        must include the trailing slash.
    keep_ids : list of int, optional
        `calitp_itp_id` values to break out in detail. Defaults to `[182]`.

    Returns
    -------
    None
        Everything is printed; nothing is returned.
    """
    if keep_ids is None:
        keep_ids = [182]

    shapes = gpd.read_parquet(f"{DATA_PATH}shapes_processed.parquet")
    service = pd.read_parquet(f"{DATA_PATH}shape_frequency_funding.parquet")

    # Only the merge keys and the tract category are needed from shapes
    tracts_categorized = shapes >> select(_.calitp_itp_id, _.shape_id, _.tract_type)

    print(f"# obs in service: {len(service)}")
    print(f"# obs in tracts_categorized: {len(tracts_categorized)}")

    # Outer merge keeps unmatched rows from both sides, so the indicator
    # counts below show how many observations fail to find a match
    # (i.e. what would be lost with an inner merge).
    merged = pd.merge(
        service,
        tracts_categorized,
        on=["calitp_itp_id", "shape_id"],
        how="outer",
        validate="m:1",
        indicator=True,
    )
    print("Merge results")
    print(merged._merge.value_counts())

    # Same comparison, restricted to the operators of interest
    service2 = service[service.calitp_itp_id.isin(keep_ids)]
    tracts_categorized2 = tracts_categorized[
        tracts_categorized.calitp_itp_id.isin(keep_ids)
    ]
    merged2 = merged[merged.calitp_itp_id.isin(keep_ids)]

    print(keep_ids)
    print(f"# obs in service2: {len(service2)}")
    print(f"unique shape_id in service2: {service2.shape_id.nunique()}")

    print(f"# obs in tracts_categorized2: {len(tracts_categorized2)}")
    print(f"unique shape_id in tracts_categorized2: {tracts_categorized2.shape_id.nunique()}")

    print(f"# obs in gdf2: {len(merged2)}")
    print(f"unique shape_id in gdf2: {merged2.shape_id.nunique()}")


In [None]:
merge_and_filter(f"{utils.GCS_FILE_PATH}")

In [None]:
merge_and_filter("./data/test/")

## Compare `shapes_initial`

Observations differ by too much.

Back up another step and go back to `shapes_initial`. `shapes_initial` is produced after stringing together bus stop sequence into route, which will now be replaced with a `geography_utils` function.

In [None]:
DATA_PATH = "./data/test/"
shapes_initial = gpd.read_parquet(f'{utils.GCS_FILE_PATH}shapes_initial.parquet')
my_shapes_initial = gpd.read_parquet(f'{DATA_PATH}shapes_initial.parquet')

In [None]:
len(shapes_initial)

In [None]:
shapes_initial.head(2)

In [None]:
len(my_shapes_initial)

In [None]:
my_shapes_initial.head(2)

In [None]:
# Compare a subset: keep only calitp_itp_id 182 from each version
eric_metro = shapes_initial.loc[shapes_initial["calitp_itp_id"] == 182]
tiff_metro = my_shapes_initial.loc[my_shapes_initial["calitp_itp_id"] == 182]

In [None]:
len(eric_metro)

In [None]:
len(tiff_metro)

In [None]:
# Outer merge on operator + shape_id; the indicator column records whether
# each shape_id appears in both subsets or in only one of them.
m1 = pd.merge(
    eric_metro, tiff_metro,
    on=["calitp_itp_id", "shape_id"],
    how="outer", validate="1:1", indicator=True,
)

m1["_merge"].value_counts()

In [None]:
eric_metro.head()

Does `shape_id` change when we grab different dates? Here we are grabbing the same date, but the `shapes.txt` file has since changed, and with that change the `shape_id` values changed too. Still, we are grabbing around 700 observations, which is good.

Ideally, the `shape_id` value and geometry need to reflect the date of service that was grabbed (here, dates are in Oct).

In [None]:
display(eric_metro.head())
display(tiff_metro.head())

In [None]:
# Add each shape's length, measured after reprojecting to EPSG:3310
tiff_metro = tiff_metro.assign(
    length=lambda gdf: gdf.geometry.to_crs("EPSG:3310").length
)

# Inspect the shapes whose shape_id contains "910"
tiff_metro.loc[tiff_metro["shape_id"].str.contains("910")]

## Compare `shape_frequency_funding`

Go back another step and see if the number of `shape_ids` and `route_ids` being grabbed are the same.

They're not... and it seems the numbers differ.

There shouldn't be, because calculating the mean runtimes and trips per hour shouldn't be massively dropping observations, except for edge cases of NaT. But, the number of routes operating for a Thursday for an operator and the number of trips it's making should be the same, because we aggregated up to that level (while adjusting the mean runtimes to get the correct average)

In [13]:
DATA_PATH = "./data/test/"

In [25]:
def compare_shape_frequency_funding(DATA_PATH, check_ids=None):
    """
    Summarize `shape_frequency_funding.parquet` by operator and day.

    Counts the unique `shape_id` and unique `route_id` values for every
    `calitp_itp_id`-`day_name` combination, prints the number of rows in
    each summary, and displays the summaries for a few operators.

    Parameters
    ----------
    DATA_PATH : str
        Path prefix the parquet file name is appended to directly, so it
        must include the trailing slash.
    check_ids : list of int, optional
        `calitp_itp_id` values to display. Defaults to `[182, 4, 279]`.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The unique `shape_id` counts and the unique `route_id` counts,
        each with one row per `calitp_itp_id`-`day_name`.
    """
    if check_ids is None:
        check_ids = [182, 4, 279]

    df = pd.read_parquet(f"{DATA_PATH}shape_frequency_funding.parquet")
    print(f"# obs: {len(df)}")

    # Both summaries share the same grouping, so build it once
    group_cols = ["calitp_itp_id", "day_name"]
    by_operator_day = df.groupby(group_cols)

    shapes_per_day = by_operator_day.agg({"shape_id": "nunique"}).reset_index()
    print(f"# unique id-shape_id-day-name: {len(shapes_per_day)}")

    routes_per_day = by_operator_day.agg({"route_id": "nunique"}).reset_index()
    # Report the route summary's own length (the shape summary's length was
    # printed here before)
    print(f"# unique id-route-id-day-name: {len(routes_per_day)}")

    print(f"check specific ids: {check_ids}")
    print("Unique shape_id-day_name")

    display(shapes_per_day[shapes_per_day.calitp_itp_id.isin(check_ids)])
    print("Unique route_id-day_name")
    display(routes_per_day[routes_per_day.calitp_itp_id.isin(check_ids)])

    return shapes_per_day, routes_per_day

In [26]:
shape1, route1 = compare_shape_frequency_funding(f"{utils.GCS_FILE_PATH}")

# obs: 423216
# unique id-shape_id-day-name: 441
# unique id-route-id-day-name: 441
check specific ids: [182, 4, 279]
Unique shape_id-day_name


Unnamed: 0,calitp_itp_id,day_name,shape_id
0,4,Saturday,330
1,4,Sunday,330
2,4,Thursday,330
207,182,Saturday,734
208,182,Sunday,734
209,182,Thursday,734
327,279,Saturday,28
328,279,Sunday,28
329,279,Thursday,28


Unique route_id-day_name


Unnamed: 0,calitp_itp_id,day_name,route_id
0,4,Saturday,128
1,4,Sunday,128
2,4,Thursday,128
207,182,Saturday,118
208,182,Sunday,118
209,182,Thursday,118
327,279,Saturday,10
328,279,Sunday,10
329,279,Thursday,10


In [27]:
shape2, route2 = compare_shape_frequency_funding("./data/test/")

# obs: 95933
# unique id-shape_id-day-name: 363
# unique id-route-id-day-name: 363
check specific ids: [182, 4, 279]
Unique shape_id-day_name


Unnamed: 0,calitp_itp_id,day_name,shape_id
0,4,Saturday,150
1,4,Sunday,150
2,4,Thursday,323
171,182,Saturday,590
172,182,Sunday,590
173,182,Thursday,683
258,279,Saturday,21
259,279,Sunday,13
260,279,Thursday,24


Unique route_id-day_name


Unnamed: 0,calitp_itp_id,day_name,route_id
0,4,Saturday,59
1,4,Sunday,59
2,4,Thursday,128
171,182,Saturday,113
172,182,Sunday,111
173,182,Thursday,112
258,279,Saturday,10
259,279,Sunday,6
260,279,Thursday,10


In [32]:
# Line up the unique shape_id counts from the two datasets by operator-day;
# the indicator column shows which operator-days exist in only one of them.
shape_m1 = shape1.merge(
    shape2,
    on=["calitp_itp_id", "day_name"],
    how="outer",
    validate="1:1",
    indicator=True,
)

shape_m1["_merge"].value_counts()

both          329
left_only     112
right_only     34
Name: _merge, dtype: int64

In [43]:
shape_m1[shape_m1._merge=="left_only"].calitp_itp_id.unique()

array([ 10,  11,  18,  21,  23,  34,  35,  36,  37,  42,  45,  49,  50,
        71,  77,  79,  81,  82,  83,  91,  93,  95,  98, 108, 121, 135,
       137, 142, 148, 154, 159, 162, 167, 168, 169, 187, 188, 192, 199,
       201, 204, 210, 213, 217, 220, 221, 235, 238, 239, 243, 251, 257,
       259, 261, 263, 264, 265, 294, 308, 315, 329, 334, 337, 339, 356,
       361, 366, 376, 386, 473])

In [44]:
shape_m1[shape_m1._merge=="right_only"].calitp_itp_id.unique()

array([ 48,  61, 106, 110, 127, 170, 208, 278, 280, 290, 346, 350])

In [53]:
# Compare only operator-days present in both datasets. Rows that are
# left_only / right_only have NaN on one side, and any comparison against
# NaN is False, which would fail this check regardless of the real counts.
assert (
    shape_m1.loc[shape_m1._merge == "both", "shape_id_x"]
    <= shape_m1.loc[shape_m1._merge == "both", "shape_id_y"]
).all()

AssertionError: 

In [34]:
# Line up the unique route_id counts from the two datasets by operator-day;
# the indicator column shows which operator-days exist in only one of them.
route_m1 = route1.merge(
    route2,
    on=["calitp_itp_id", "day_name"],
    how="outer",
    validate="1:1",
    indicator=True,
)

route_m1["_merge"].value_counts()

both          329
left_only     112
right_only     34
Name: _merge, dtype: int64

In [46]:
route_m1[route_m1._merge=="left_only"].calitp_itp_id.unique()

array([ 10,  11,  18,  21,  23,  34,  35,  36,  37,  42,  45,  49,  50,
        71,  77,  79,  81,  82,  83,  91,  93,  95,  98, 108, 121, 135,
       137, 142, 148, 154, 159, 162, 167, 168, 169, 187, 188, 192, 199,
       201, 204, 210, 213, 217, 220, 221, 235, 238, 239, 243, 251, 257,
       259, 261, 263, 264, 265, 294, 308, 315, 329, 334, 337, 339, 356,
       361, 366, 376, 386, 473])

In [45]:
route_m1[route_m1._merge=="right_only"].calitp_itp_id.unique()

array([ 48,  61, 106, 110, 127, 170, 208, 278, 280, 290, 346, 350])

In [58]:
route_m1

Unnamed: 0,calitp_itp_id,day_name,route_id_x,route_id_y,_merge
0,4,Saturday,128.0,59.0,both
1,4,Sunday,128.0,59.0,both
2,4,Thursday,128.0,128.0,both
3,10,Saturday,1.0,,left_only
4,10,Sunday,1.0,,left_only
...,...,...,...,...,...
470,346,Sunday,,4.0,right_only
471,346,Thursday,,8.0,right_only
472,350,Saturday,,5.0,right_only
473,350,Sunday,,5.0,right_only


In [57]:
# Compare only operator-days present in both datasets. Rows that are
# left_only / right_only have NaN on one side, and any comparison against
# NaN is False, which would fail this check regardless of the real counts.
assert (
    route_m1.loc[route_m1._merge == "both", "route_id_x"]
    <= route_m1.loc[route_m1._merge == "both", "route_id_y"]
).all()

AssertionError: 