In [1]:
import dask.dataframe as dd
import dask_geopandas as dg
import gcsfs
import geopandas as gpd
import numpy as np
import pandas as pd

from dask import delayed, compute
from shared_utils import rt_utils

# Root GCS location; the other paths in this cell are built from it.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/"
# Folder read by the trip diagnostics / segments cells in this notebook.
DASK_TEST = f"{GCS_FILE_PATH}dask_test/"
# NOTE(review): not referenced by any cell visible in this notebook.
COMPILED_CACHED_VIEWS = f"{GCS_FILE_PATH}rt_delay/compiled_cached_views/"

# Date string interpolated into the trip diagnostics parquet filename.
analysis_date = "2022-10-12"
# GCS filesystem handle; in the visible cells it is only used by the
# disabled file-listing snippet.
fs = gcsfs.GCSFileSystem()



In [None]:
# Disabled snippet: the code below is wrapped in a string literal, so it
# does not run. It lists the files under `trip_diagnostics/`, concatenates
# them into one DataFrame, and writes
# `trip_diagnostics_{analysis_date}.parquet`, which is the file a later
# cell reads.
# NOTE(review): if this is re-enabled, collect the frames in a list and
# call pd.concat once instead of concatenating inside the loop.
'''
trip_stats_files = fs.ls(f"{DASK_TEST}trip_diagnostics/")

full_df = pd.DataFrame()

for f in trip_stats_files:
    df = pd.read_parquet(f"gs://{f}")
    full_df = pd.concat([full_df, df], axis=0)

full_df = full_df.reset_index(drop=True)

full_df.to_parquet(f"{DASK_TEST}trip_diagnostics_{analysis_date}.parquet")
'''

In [2]:
def merge_trip_diagnostics_with_total_segments(itp_id: int = 300) -> pd.DataFrame:
    """
    Attach per-shape segment totals to one operator's trip diagnostics.

    Reads `trip_diagnostics_{analysis_date}.parquet` and
    `longest_shape_segments.parquet` from `DASK_TEST`, counts the unique
    `segment_sequence` values per (`calitp_itp_id`, `route_dir_identifier`),
    and inner-merges that count onto the trip rows.

    Parameters
    ----------
    itp_id : int, default 300
        Operator ID used as the `calitp_itp_id` row filter when reading the
        trip diagnostics. The default keeps the previously hard-coded value.

    Returns
    -------
    pd.DataFrame
        The merged trip rows plus:
        - `total_segments`: unique segments for the trip's shape
        - `pct_vp_segments`: `num_segments_with_vp / total_segments`
        - `trip_time`: `trip_end - trip_start` expressed in seconds
        - `total_trips`: unique `trip_id` count per operator, computed on
          the merged rows (trips dropped by the inner merge are not counted)

    Raises
    ------
    pandas.errors.MergeError
        If the segment totals are not unique per merge key (`validate="m:1"`).
    """
    merge_cols = ["calitp_itp_id", "route_dir_identifier"]

    # Filter at read time so only the requested operator's rows are loaded.
    trip_diagnostics = pd.read_parquet(
        f"{DASK_TEST}trip_diagnostics_{analysis_date}.parquet",
        filters = [[("calitp_itp_id", "==", itp_id)]]
    )

    segments = gpd.read_parquet(
        f"{DASK_TEST}longest_shape_segments.parquet")

    total_segments_by_shape = (segments.groupby(merge_cols)
            .segment_sequence.nunique()
            .reset_index()
            .rename(columns = {"segment_sequence": "total_segments"})
           )

    df = pd.merge(
        trip_diagnostics,
        total_segments_by_shape,
        on = merge_cols,
        how = "inner",
        validate = "m:1",
    )

    df = df.assign(
        pct_vp_segments = df.num_segments_with_vp.divide(df.total_segments),
        # Dividing a timedelta by one second yields a float number of seconds.
        trip_time = (df.trip_end - df.trip_start) / np.timedelta64(1, 's'),
        total_trips = df.groupby("calitp_itp_id").trip_id.transform("nunique"),
    )

    return df

In [3]:
# Trip-level diagnostics with segment totals; input to the cutoff summary.
df = merge_trip_diagnostics_with_total_segments()

In [10]:
def summary_valid_trips_by_cutoff(
    df, time_cutoffs: list, segment_cutoffs: list): 
    """
    Count the trips that meet every (time, segment) cutoff combination.

    Parameters
    ----------
    df : pd.DataFrame
        Trip-level rows with `calitp_itp_id`, `total_trips`, `trip_id`,
        `trip_time` (seconds) and `pct_vp_segments` (fraction, 0-1).
    time_cutoffs : list
        Minimum trip durations, in minutes.
    segment_cutoffs : list
        Minimum share of segments with vehicle positions, as fractions.

    Returns
    -------
    pd.DataFrame
        One row per operator and cutoff combination that has at least one
        qualifying trip, with `n_trips`, `trip_cutoff`, `segment_cutoff`,
        a `cutoff` label, and `pct_usable_trips = n_trips / total_trips`.
        Empty (but with those columns) if nothing qualifies or either cutoff
        list is empty.
    """
    seconds_per_minute = 60
    group_cols = ["calitp_itp_id", "total_trips"]
    output_cols = group_cols + [
        "n_trips", "trip_cutoff", "segment_cutoff", "cutoff",
        "pct_usable_trips",
    ]

    # Collect the pieces and concatenate once, instead of growing a
    # DataFrame inside the loop.
    pieces = []

    for t in time_cutoffs:
        # `trip_time` is stored in seconds while the cutoffs (and the label
        # below) are minutes, so convert before comparing.
        long_enough = df.trip_time >= t * seconds_per_minute

        for s in segment_cutoffs:
            valid = (df[long_enough & (df.pct_vp_segments >= s)]
                     .groupby(group_cols)
                     .trip_id.nunique()
                     .reset_index()
                     .rename(columns = {"trip_id": "n_trips"})
                    )

            if valid.empty:
                continue

            pieces.append(valid.assign(
                trip_cutoff = t,
                segment_cutoff = s,
                # `:g` avoids float noise such as 30.000000000000004%.
                cutoff = f"{t}+ min & {s * 100:g}%+ segments"
            ))

    if not pieces:
        return pd.DataFrame(columns = output_cols)

    final = pd.concat(pieces, axis=0, ignore_index=True)

    final = final.assign(
        pct_usable_trips = final.n_trips.divide(final.total_trips)
    )

    return final

In [11]:
# Thresholds to sweep. The summary's `cutoff` label renders the time values
# as "min" and the segment values as percentages.
TIME_CUTOFFS = [5, 10, 15]
SEGMENT_CUTOFFS = [0.1, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.75]

# One row per operator and cutoff combination.
valid_stats = summary_valid_trips_by_cutoff(
    df,
    time_cutoffs=TIME_CUTOFFS,
    segment_cutoffs=SEGMENT_CUTOFFS,
)

In [20]:
import altair as alt
from shared_utils import styleguide
from shared_utils import calitp_color_palette as cp

In [23]:
# Horizontal bars: one bar per cutoff combination, ordered by the share of
# usable trips (largest first).
usable_trips_x = alt.X("pct_usable_trips:Q", title="% usable trips")
cutoff_y = alt.Y(
    "cutoff:N",
    title="cutoffs used",
    sort=alt.SortField("pct_usable_trips", order="descending"),
)

chart = alt.Chart(valid_stats).mark_bar().encode(
    x=usable_trips_x,
    y=cutoff_y,
)

# Last expression of the cell, so the styled chart is what gets displayed.
styleguide.preset_chart_config(chart)

In [None]:
#results_ddfs = [compute(i)[0] for i in results]

In [None]:
# Concatenate the per-file dask DataFrames row-wise and reduce to 3 partitions.
# NOTE(review): `results_ddfs` is only assigned in a commented-out line in the
# previous cell, and `results` is not defined in the visible notebook, so this
# cell raises NameError on a fresh kernel.
ddf = (dd.multi.concat(results_ddfs, axis=0)
       .repartition(npartitions = 3)
      )

In [None]:
# Write the dask DataFrame as a parquet dataset under ./data/trip_stats/.
# NOTE(review): the next cell reads "./data/trip_stats.parquet", a different
# path from the one written here -- confirm which is intended.
ddf.to_parquet("./data/trip_stats/")

In [None]:
# NOTE(review): this path differs from the "./data/trip_stats/" directory
# written in the previous cell -- confirm the file exists. This also rebinds
# `df`, replacing the merged trip diagnostics assigned earlier.
df = pd.read_parquet("./data/trip_stats.parquet")

In [None]:
#trip_stats_df = get_trip_stats(delayed_df)
# Load each file and call `.persist()` on the result.
# NOTE(review): `import_data` and `vp_seg_files` are not defined in the visible
# notebook -- presumably they come from a removed cell or another module.
delayed_dfs = [import_data(f).persist() for f in vp_seg_files]

In [None]:
# Apply `get_trip_stats` to each loaded frame.
# NOTE(review): `get_trip_stats` is not defined in the visible notebook. The
# loop variable `df` is comprehension-scoped, so it does not rebind the
# notebook-level `df`.
trip_stats_dfs = [get_trip_stats(df) for df in delayed_dfs]

In [None]:
# `compute` returns a tuple; `[0]` takes the single computed result.
# Each item is computed in its own call, one after another.
trip_stats_computes = [compute(i)[0] for i in trip_stats_dfs]

In [None]:
# Row-wise concatenation of the computed results.
# NOTE(review): the inputs here have already been through `compute`; confirm
# they are still dask collections, otherwise pd.concat is the matching call.
result_ddf = dd.multi.concat(trip_stats_computes, axis=0)

In [None]:
# Reduce to 3 partitions, then write to the same ./data/trip_stats/ directory
# that the earlier `ddf.to_parquet` cell targets.
result_ddf = result_ddf.repartition(npartitions=3)
result_ddf.to_parquet("./data/trip_stats/")

In [None]:
# Rebinds `ddf` (also assigned by the concat cell above).
ddf = dd.from_delayed(delayed_dfs)
# Open question from the author: this needs to take pandas dfs?
# NOTE(review): `delayed_dfs` holds the `.persist()` results from an earlier
# cell; confirm those are Delayed objects, as `from_delayed` expects.

In [None]:
# dask_geopandas variant of the segment count done with geopandas inside
# `merge_trip_diagnostics_with_total_segments`: unique `segment_sequence`
# values per (`calitp_itp_id`, `route_dir_identifier`), renamed to
# `total_segments`.
segments = dg.read_parquet(f"{DASK_TEST}longest_shape_segments.parquet")
total_segments_by_shape = (segments.groupby(["calitp_itp_id", "route_dir_identifier"])
            .segment_sequence.nunique()
            .reset_index()
            .rename(columns = {"segment_sequence": "total_segments"})
           )


In [None]:
# NOTE(review): `trip_stats` is not defined in the visible notebook, so this
# raises NameError on a fresh kernel.
trip_stats.visualize()

In [None]:
# NOTE(review): `trip_stats` is not defined in the visible notebook. The
# result is not assigned; it is only displayed as the cell output.
trip_stats.compute()