In [86]:

import geopandas as gpd
import numpy as np
import pandas as pd
from calitp.sql import to_snakecase

In [129]:
from shared_utils import calitp_color_palette as cp
from shared_utils import geography_utils, styleguide, utils
import altair as alt

In [87]:
import intake
catalog = intake.open_catalog("./catalog_threshold.yml")

In [88]:
# Notebook-wide pandas display settings, set in one call as (option, value) pairs:
# up to 100 columns, floats rendered with 2 decimals, and no cap on the number
# of rows or on column width.
pd.set_option(
    "display.max_columns", 100,
    "display.float_format", "{:.2f}".format,
    "display.max_rows", None,
    "display.max_colwidth", None,
)

In [89]:
def clean_trips():
    """Read the `trips` catalog entry and return its unique key combinations.

    Returns:
        The trips table restricted to `feed_key`, `route_id`, `direction_id`
        and `shape_id`, with duplicate rows removed and the index reset.
    """
    key_columns = ["feed_key", "route_id", "direction_id", "shape_id"]

    trips_raw = catalog.trips.read()

    return trips_raw[key_columns].drop_duplicates().reset_index(drop=True)

In [90]:
trips = clean_trips()

In [91]:
def clean_routelines():
    """Read the `route_lines` catalog entry, de-duplicate it and measure each shape.

    Returns:
        The route lines table without `shape_array_key`, with duplicate rows
        removed and the index reset, plus an `actual_route_length` column
        holding `geometry.length` for every row.
    """
    route_lines = catalog.route_lines.read()

    # Open question from the original author: drop shape array key? Drop n_trips?
    deduped = (
        route_lines.drop(columns=["shape_array_key"])
        .drop_duplicates()
        .reset_index(drop=True)
    )

    # Length is computed on the de-duplicated frame so it lines up with the reset index.
    return deduped.assign(actual_route_length=deduped.geometry.length)

In [92]:
routelines = clean_routelines()

In [93]:
routelines.drop(columns = ['geometry']).sample(5)

Unnamed: 0,feed_key,shape_id,n_trips,actual_route_length
3440,b1d9c45c22397c2873483e50344107a8,p_786778,8,4860.11
3429,57d7a160e4588225238b330da8453912,353,22,5373.68
106,6adf6cd9b6d24ab4ee8ee220e3697a73,1620007_DEC22,33,31546.59
1449,c2678c0ae3a6821a47f3cab62911c33d,7701,18,14324.37
3975,57d7a160e4588225238b330da8453912,3150,6,5831.51


In [94]:
# df = catalog.route_lines.read()

In [95]:
# df.shape

In [96]:
def clean_longest_shape():
    """Read the `longest_shape` catalog entry with its length column renamed.

    `route_length` becomes `longest_route_length`, which keeps it distinct from
    the `actual_route_length` column added by `clean_routelines`.

    Returns:
        The longest-shape table with the renamed column.
    """
    return catalog.longest_shape.read().rename(
        columns={"route_length": "longest_route_length"}
    )

In [97]:
longest_shape = clean_longest_shape()

In [98]:
# Drop name?

crosswalk = catalog.crosswalk.read()

In [99]:
trips.sample()

Unnamed: 0,feed_key,route_id,direction_id,shape_id
3536,801241dd580ee0f9bddc59b7abe52929,9,0.0,9_0_122


In [100]:
# Attach the crosswalk columns to each trip key; the inner join drops trip keys
# that have no crosswalk match.
m1 = trips.merge(
    right=crosswalk,
    on=["feed_key", "route_id", "direction_id"],
    how="inner",
)

In [101]:
m1.sample()

Unnamed: 0,feed_key,route_id,direction_id,shape_id,name,gtfs_dataset_key,route_dir_identifier
32,f0b28ce2e70bba09a56e0b6a25268666,51,1.0,37,Pasadena Schedule,3a508d597e0f71941bd18367262a9cbc,3353335113


In [102]:
# routelines.sample()

In [103]:
# Add each shape's route-line columns (n_trips, geometry, actual_route_length);
# the inner join keeps only feed/shape pairs present on both sides.
m2 = m1.merge(
    right=routelines,
    on=["feed_key", "shape_id"],
    how="inner",
)

In [104]:
longest_shape.columns

Index(['geometry', 'feed_key', 'name', 'route_id', 'direction_id',
       'longest_shape_id', 'route_dir_identifier', 'longest_route_length',
       'segment_sequence', 'geometry_arrowized', 'gtfs_dataset_key'],
      dtype='object')

In [105]:
m2.columns

Index(['feed_key', 'route_id', 'direction_id', 'shape_id', 'name',
       'gtfs_dataset_key', 'route_dir_identifier', 'n_trips', 'geometry',
       'actual_route_length'],
      dtype='object')

In [106]:
# Bring in the longest-shape columns for every route/direction. The longest
# shape's `geometry` is dropped first so it does not collide with the `geometry`
# column already in m2.
m3 = m2.merge(
    right=longest_shape.drop(columns=["geometry"]),
    on=[
        "feed_key",
        "gtfs_dataset_key",
        "direction_id",
        "route_id",
        "route_dir_identifier",
        "name",
    ],
    how="inner",
)

In [107]:
# Each shape's length as a whole-number percentage of `longest_route_length`.
# astype(int) truncates the fractional part rather than rounding.
m3["route_length_percentage"] = (
    m3["actual_route_length"]
    .div(m3["longest_route_length"])
    .mul(100)
    .astype(int)
)

In [109]:
m3.sample(3).drop(columns = ['geometry','geometry_arrowized'])

Unnamed: 0,feed_key,route_id,direction_id,shape_id,name,gtfs_dataset_key,route_dir_identifier,n_trips,actual_route_length,longest_shape_id,longest_route_length,segment_sequence,route_length_percentage
47945,cde080d02841ea62ccfa158c22aa814e,4867,0.0,19752,LA DOT Schedule,02af2d11f5bd44434c581540e9e857d8,3106299841,40,7567.34,19752,7567.34,7,100
123018,b9a0bc47bcf6590c67677f3af358f490,51H,0.0,107292,Bay Area 511 Santa Clara Transit Schedule,03cadbffef6f3cd4ce839b218bc65e82,725824661,2,16251.01,107293,17854.84,1,91
90951,6adf6cd9b6d24ab4ee8ee220e3697a73,128-13167,0.0,1280029_DEC22,LA Metro Bus Schedule,7c48cc5675c9266fb6851070489c53e9,1757433343,16,22751.43,1280029_DEC22,22751.43,19,100


In [110]:
m3.gtfs_dataset_key.nunique(), m3.name.nunique()

(84, 83)

In [116]:
# m3.name.unique()

In [114]:
# One row per route / shape / percentage combination. `total_segments` is the
# number of non-null `segment_sequence` values that fell into that group.
m4 = (
    m3.groupby(
        [
            "route_id",
            "name",
            "gtfs_dataset_key",
            "route_dir_identifier",
            "shape_id",
            "longest_shape_id",
            "route_length_percentage",
        ]
    )
    .agg(total_segments=("segment_sequence", "count"))
    .reset_index()
)

In [117]:
m3.shape, m4.shape

((130196, 15), (4982, 8))

In [125]:
# Keep only the rows whose feed name is the AC Transit schedule.
ac = m4[m4["name"] == "Bay Area 511 AC Transit Schedule"].reset_index(drop=True)

In [126]:
ac.shape

(323, 8)

In [128]:
# Collapse to one row per name / route / shape, keeping the largest
# route_length_percentage seen for that combination.
ac = (
    ac.groupby(["name", "route_id", "shape_id"])["route_length_percentage"]
    .max()
    .reset_index()
)

In [131]:
# Box plot of route_length_percentage for each AC Transit route_id, with
# whiskers spanning the full min-max range.
(
    alt.Chart(ac)
    .mark_boxplot(extent="min-max")
    .encode(
        x="route_id:N",
        y="route_length_percentage:Q",
        color=alt.Color(
            # Declared nominal, matching the x channel, so the categorical
            # palette is applied regardless of how route_id's dtype is inferred.
            "route_id:N",
            scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
            legend=None,
        ),
    )
    # Plain string: the original f-string had no placeholders.
    .properties(title="AC Test")
)

