## Update `time_series_utils` and `dask_utils` to use `dask.dataframe.from_map` instead of `dask.delayed`

Concatenating many months in `time_series_utils` with `dask.delayed` is taking a long time. For segment geometries, the concatenated gdf gets really big and uses a lot of memory. In reality, we don't need the full gdf; we want to look across many dates and then dedupe.

The Dask delayed docs mention `from_map` as a way to read in parquets and do something with each of them.

Dask docs: https://docs.dask.org/en/latest/generated/dask.dataframe.from_map.html

Tutorial: https://blog.dask.org/2023/04/12/from-map

In [None]:
import dask.dataframe as dd
import geopandas as gpd
import pandas as pd

from typing import Literal

from segment_speed_utils.project_vars import GTFS_DATA_DICT, SEGMENT_GCS
from shared_utils import rt_dates

# Dates whose per-date parquet files are combined in the cells below.
analysis_date_list = rt_dates.y2024_dates

# Key into GTFS_DATA_DICT used to look up the file names below.
segment_type = "stop_segments"

In [None]:
def func(
    path: str,
    one_date: str,
    data_type: Literal["df", "gdf"] = "df",
    **kwargs,
):
    """
    Read the parquet for a single date and drop fully duplicated rows.

    The file opened is `{path}_{one_date}.parquet`. With data_type "gdf"
    it is read with geopandas; any other value is read with pandas.
    Remaining keyword arguments (e.g. `columns`) go to the reader.
    """
    # Choose the reader once so the read + dedupe is written a single time.
    reader = gpd.read_parquet if data_type == "gdf" else pd.read_parquet
    one_file = f"{path}_{one_date}.parquet"

    return reader(one_file, **kwargs).drop_duplicates()

def get_ddf(paths, analysis_date_list, data_type, **kwargs):
    """
    Combine the per-date parquets into one deduplicated dask dataframe.

    `paths` and `analysis_date_list` are handed to `dd.from_map` together,
    so `func` is mapped over them pairwise. `data_type` and any remaining
    keyword arguments are passed through to `dd.from_map` as keywords
    (per the dask docs these are broadcast to every `func` call).
    Duplicates are dropped again on the combined result, on top of the
    per-file dedupe done inside `func`.
    """
    per_date_ddf = dd.from_map(
        func,
        paths,
        analysis_date_list,
        data_type=data_type,
        **kwargs,
    )

    return per_date_ddf.drop_duplicates()

In [None]:
segment_cols = ["schedule_gtfs_dataset_key", "route_id", "geometry"]
segment_file = GTFS_DATA_DICT[segment_type]["segments_file"]

# The same path stem is repeated once per date; the loop value itself
# is not used when building the stem.
segment_paths = [
    f"{SEGMENT_GCS}{segment_file}"
    for _ in analysis_date_list
]

segment_paths

In [None]:
# Build the combined segment collection with the geopandas reader,
# requesting only the columns in segment_cols.
segment_gddf = get_ddf(
    segment_paths, 
    analysis_date_list, 
    data_type = "gdf",
    columns = segment_cols
)  

In [None]:
# Inspect the dask collection before materializing it.
segment_gddf

In [None]:
# Materialize the combined, deduplicated segments.
segment_gddf.compute()

In [None]:
speed_cols = ["schedule_gtfs_dataset_key", "route_id"]
speed_file = GTFS_DATA_DICT[segment_type]["route_dir_single_segment"]

# The same path stem is repeated once per date; the loop value itself
# is not used when building the stem.
speed_paths = [
    f"{SEGMENT_GCS}{speed_file}"
    for _ in analysis_date_list
]
speed_paths

In [None]:
# Same pattern as the segments, but read with pandas ("df") and
# requesting only the columns in speed_cols.
speed_ddf = get_ddf(
    speed_paths, 
    analysis_date_list, 
    data_type = "df",
    columns = speed_cols
)  

In [None]:
# Materialize the combined speeds and report the resulting shape.
speed_df = speed_ddf.compute()
print(speed_df.shape)

In [None]:
def concatenate_datasets_across_dates(
    gcs_bucket: str,
    dataset_name: str,
    date_list: list,
    data_type: Literal["df", "gdf"],
    get_pandas: bool = True,
    **kwargs
) -> pd.DataFrame:
    """
    Combine one dataset's parquets across every date in date_list.

    gcs_bucket and dataset_name are joined into a single path stem that
    is paired with each date and handed to `get_ddf`, along with
    data_type and any extra keyword arguments (e.g. `columns`).

    When get_pandas is True the combined result is computed before it is
    returned; when False the un-computed result of `get_ddf` is returned
    as-is.
    """
    # Every date shares the same stem, so repeat it once per date.
    file_stem = f"{gcs_bucket}{dataset_name}"
    paths = [file_stem for _ in date_list]

    combined = get_ddf(paths, date_list, data_type=data_type, **kwargs)

    return combined.compute() if get_pandas else combined

In [None]:
# get_pandas=False: the function returns the result without computing it.
speed_ddf2 = concatenate_datasets_across_dates(
    SEGMENT_GCS,
    speed_file,
    analysis_date_list,
    data_type = "df",
    columns = speed_cols,
    get_pandas = False,
)

In [None]:
# Compute explicitly here, since get_pandas=False was used above.
speed_df2 = speed_ddf2.compute()
print(speed_df2.shape)

In [None]:
# get_pandas=True: the function computes before returning.
speed_df3 = concatenate_datasets_across_dates(
    SEGMENT_GCS,
    speed_file,
    analysis_date_list,
    data_type = "df",
    get_pandas = True,
    columns = speed_cols,
)

speed_df3.shape

In [None]:
from segment_speed_utils import time_series_utils

# Same arguments as the notebook-local call above, but through the
# packaged time_series_utils function (presumably to compare the two
# results; confirm).
speed_df4 = time_series_utils.concatenate_datasets_across_dates(
    SEGMENT_GCS,
    speed_file,
    analysis_date_list,
    data_type = "df",
    get_pandas = True,
    columns = speed_cols,
)

speed_df4.shape

In [None]:
# Display the result returned by the packaged function.
speed_df4