# Dask Migration Guide

In [1]:
# Import Dask and display the installed version so results can be reproduced.
import dask.dataframe as dd
import dask
dask.__version__

In a future release, Dask DataFrame will use a new implementation that
contains several improvements including a logical query planning.
The user-facing DataFrame API will remain unchanged.

The new implementation is already available and can be enabled by
installing the dask-expr library:

    $ pip install dask-expr

and turning the query planning option on:

    >>> import dask
    >>> dask.config.set({'dataframe.query-planning': True})
    >>> import dask.dataframe as dd

API documentation for the new implementation is available at
https://docs.dask.org/en/stable/dask-expr-api.html

Any feedback can be reported on the Dask issue tracker
https://github.com/dask/dask/issues 


    # via Python

    # via CLI


  import dask.dataframe as dd


'2024.2.1'

In [7]:
# Load the Parquet dataset with 783 million rows

ddf = dd.read_parquet(
    "s3://coiled-datasets/uber-lyft-tlc/",
    filesystem='pyarrow',
)
# Preview the first rows
ddf.head()

Unnamed: 0_level_0,hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,...,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag
__null_dask_index__,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,HV0003,B02867,B02867,2019-02-01 00:01:26,2019-02-01 00:02:55,2019-02-01 00:05:18,2019-02-01 00:14:57,245,251,2.45,...,0.83,0.0,,0.0,7.48,Y,N,N,N,
1,HV0003,B02879,B02879,2019-02-01 00:26:08,2019-02-01 00:41:29,2019-02-01 00:41:29,2019-02-01 00:49:39,216,197,1.71,...,0.7,0.0,,2.0,7.93,N,N,N,N,
2,HV0005,B02510,,2019-02-01 00:48:58,NaT,2019-02-01 00:51:34,2019-02-01 01:28:29,261,234,5.01,...,3.99,0.0,,0.0,35.970001,N,Y,N,N,
3,HV0005,B02510,,2019-02-01 00:02:15,NaT,2019-02-01 00:03:51,2019-02-01 00:07:16,87,87,0.34,...,0.64,0.0,,3.0,5.39,N,Y,N,N,
4,HV0005,B02510,,2019-02-01 00:06:17,NaT,2019-02-01 00:09:44,2019-02-01 00:39:56,87,198,6.84,...,2.16,0.0,,4.0,17.07,N,Y,N,N,


In [8]:
# Total number of rows in the dataset
len(ddf)

783431901

In [6]:
# Daft must be imported before it is used: without these imports this cell
# raises NameError on a fresh kernel (Restart & Run All), because no earlier
# cell brings `daft`, `IOConfig` or `S3Config` into scope.
import daft
from daft.io import IOConfig, S3Config

# Configure anonymous S3 access (anonymous=True)
io_config = IOConfig(s3=S3Config(anonymous=True))

# Read the Uber/Lyft TLC Parquet dataset with Daft
df = daft.read_parquet(
    "s3://coiled-datasets/uber-lyft-tlc/",
    io_config=io_config,
)
# Display a preview of the DataFrame
df.show()

hvfhs_license_num Utf8,dispatching_base_num Utf8,originating_base_num Utf8,"request_datetime Timestamp(Microseconds, None)","on_scene_datetime Timestamp(Microseconds, None)","pickup_datetime Timestamp(Microseconds, None)","dropoff_datetime Timestamp(Microseconds, None)",PULocationID Int32,DOLocationID Int32,trip_miles Float32,trip_time Int32,base_passenger_fare Float32,tolls Float32,bcf Float32,sales_tax Float32,congestion_surcharge Float32,airport_fee Float32,tips Float32,driver_pay Float32,shared_request_flag Utf8,shared_match_flag Utf8,access_a_ride_flag Utf8,wav_request_flag Utf8,wav_match_flag Utf8,__null_dask_index__ Int64
HV0003,B02867,B02867,2019-02-01T00:01:26.000000,2019-02-01T00:02:55.000000,2019-02-01T00:05:18.000000,2019-02-01T00:14:57.000000,245,251,2.45,579,9.35,0,0.23,0.83,0,,0,7.48,Y,N,N,N,,0
HV0003,B02879,B02879,2019-02-01T00:26:08.000000,2019-02-01T00:41:29.000000,2019-02-01T00:41:29.000000,2019-02-01T00:49:39.000000,216,197,1.71,490,7.91,0,0.2,0.7,0,,2,7.93,N,N,N,N,,1
HV0005,B02510,,2019-02-01T00:48:58.000000,,2019-02-01T00:51:34.000000,2019-02-01T01:28:29.000000,261,234,5.01,2159,44.96,0,1.12,3.99,0,,0,35.97,N,Y,N,N,,2
HV0005,B02510,,2019-02-01T00:02:15.000000,,2019-02-01T00:03:51.000000,2019-02-01T00:07:16.000000,87,87,0.34,179,7.19,0,0.18,0.64,0,,3,5.39,N,Y,N,N,,3
HV0005,B02510,,2019-02-01T00:06:17.000000,,2019-02-01T00:09:44.000000,2019-02-01T00:39:56.000000,87,198,6.84,1799,24.25,0.11,0.61,2.16,0,,4,17.07,N,Y,N,N,,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HV0003,B02764,B02764,2019-07-02T21:32:02.000000,2019-07-02T21:36:23.000000,2019-07-02T21:37:58.000000,2019-07-02T21:51:23.000000,13,50,4.25,806,19.61,0,0,1.7,2.75,,4.81,16.12,N,N,,N,,1068598
HV0003,B02764,B02764,2019-07-02T21:54:41.000000,2019-07-02T21:55:12.000000,2019-07-02T21:55:12.000000,2019-07-02T22:04:46.000000,230,246,1.83,574,11.44,0,0,0.99,2.75,,0,6.73,N,N,,N,,1068599
HV0005,B02510,,2019-07-02T21:17:17.000000,,2019-07-02T21:22:18.000000,2019-07-02T21:26:21.000000,149,149,0.806,243,6.92,0,0.15,0.55,0,,0,5.39,N,N,N,N,N,1068600
HV0005,B02510,,2019-07-02T21:47:01.000000,,2019-07-02T21:49:01.000000,2019-07-02T22:02:14.000000,150,21,3.352,793,14.36,0,0.26,0.93,0,,2.33,10.21,N,N,N,N,N,1068601


In [9]:
# Read the NYC taxi 2015 Parquet part files from S3 with anonymous access
ddf = dd.read_parquet(
    "s3://dask-data/nyc-taxi/nyc-2015.parquet/part.*.parquet",
    storage_options={"anon": True},
)
# Preview the first rows
ddf.head()

Unnamed: 0,tpep_pickup_datetime,VendorID,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2015-01-01 00:00:00,1,2015-01-01 00:11:26,5,4.0,-73.971436,40.760201,1,N,-73.921181,40.768269,2,13.5,0.5,0.5,0.0,0.0,0.0,14.5
1,2015-01-01 00:00:00,2,2015-01-01 00:00:00,3,1.56,-74.00132,40.729057,1,N,-74.010208,40.719662,1,7.5,0.5,0.5,0.0,0.0,0.3,8.8
2,2015-01-01 00:00:00,2,2015-01-01 00:00:00,1,1.68,-73.991547,40.750069,1,N,0.0,0.0,2,10.0,0.0,0.5,0.0,0.0,0.3,10.8
3,2015-01-01 00:00:01,1,2015-01-01 00:03:49,1,0.8,-73.860847,40.757294,1,N,-73.868111,40.752285,2,5.0,0.5,0.5,0.0,0.0,0.0,6.3
4,2015-01-01 00:00:03,2,2015-01-01 00:21:48,2,2.57,-73.969017,40.754269,1,N,-73.994133,40.7616,2,14.5,0.5,0.5,0.0,0.0,0.3,15.8


In [3]:
# Import Daft and its S3 I/O configuration classes, then display the installed version.
import daft
from daft.io import IOConfig, S3Config
daft.__version__

'0.2.21'

In [None]:
# Configure anonymous S3 access (anonymous=True)
io_config = IOConfig(s3=S3Config(anonymous=True))

# Read the NYC taxi 2015 Parquet part files with Daft
# NOTE(review): the recorded run has no execution count and its progress bar
# is at 0% -- this cell appears not to have completed; confirm it runs.
df = daft.read_parquet(
    "s3://dask-data/nyc-taxi/nyc-2015.parquet/part.*.parquet",
    io_config=io_config,
)
df.show()

ScanWithTask-LocalLimit [Stage:6]:   0%|                                                          | 0/1 [00:00<?, ?it/s]

## Reading Data

In [2]:
# Read the 783-million-row Parquet dataset with Dask
ddf = dd.read_parquet(
    "s3://coiled-datasets/uber-lyft-tlc/",
    filesystem='pyarrow',
)
# Preview the first rows
ddf.head()

Unnamed: 0,hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,...,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag
0,HV0003,B02867,B02867,2019-02-01 00:01:26,2019-02-01 00:02:55,2019-02-01 00:05:18,2019-02-01 00:14:57,245,251,2.45,...,0.83,0.0,,0.0,7.48,Y,N,N,N,
1,HV0003,B02879,B02879,2019-02-01 00:26:08,2019-02-01 00:41:29,2019-02-01 00:41:29,2019-02-01 00:49:39,216,197,1.71,...,0.7,0.0,,2.0,7.93,N,N,N,N,
2,HV0005,B02510,,2019-02-01 00:48:58,NaT,2019-02-01 00:51:34,2019-02-01 01:28:29,261,234,5.01,...,3.99,0.0,,0.0,35.970001,N,Y,N,N,
3,HV0005,B02510,,2019-02-01 00:02:15,NaT,2019-02-01 00:03:51,2019-02-01 00:07:16,87,87,0.34,...,0.64,0.0,,3.0,5.39,N,Y,N,N,
4,HV0005,B02510,,2019-02-01 00:06:17,NaT,2019-02-01 00:09:44,2019-02-01 00:39:56,87,198,6.84,...,2.16,0.0,,4.0,17.07,N,Y,N,N,


In [3]:
# Inspect the Dask DataFrame structure: the data is split into 720 partitions
ddf

Unnamed: 0_level_0,hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,trip_time,base_passenger_fare,tolls,bcf,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag
npartitions=720,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
,string,string,string,datetime64[us],datetime64[us],datetime64[us],datetime64[us],int32,int32,float32,int32,float32,float32,float32,float32,float32,float32,float32,float32,category[unknown],category[unknown],category[unknown],category[unknown],category[unknown]
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [4]:
# Dask DataFrames carry an index (here int64, spanning the 720 partitions)
ddf.index

Dask Index Structure:
npartitions=720
    int64
      ...
    ...  
      ...
      ...
dtype: int64
Dask Name: read-parquet-index, 2 graph layers

In [5]:
# Dask predicate filtering
# Note the pandas-like boolean-mask selection
ddf[ddf.trip_miles < 3].head()

Unnamed: 0,hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,...,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag
0,HV0003,B02867,B02867,2019-02-01 00:01:26,2019-02-01 00:02:55,2019-02-01 00:05:18,2019-02-01 00:14:57,245,251,2.45,...,0.83,0.0,,0.0,7.48,Y,N,N,N,
1,HV0003,B02879,B02879,2019-02-01 00:26:08,2019-02-01 00:41:29,2019-02-01 00:41:29,2019-02-01 00:49:39,216,197,1.71,...,0.7,0.0,,2.0,7.93,N,N,N,N,
3,HV0005,B02510,,2019-02-01 00:02:15,NaT,2019-02-01 00:03:51,2019-02-01 00:07:16,87,87,0.34,...,0.64,0.0,,3.0,5.39,N,Y,N,N,
5,HV0005,B02510,,2019-02-01 00:56:01,NaT,2019-02-01 00:59:55,2019-02-01 01:06:28,198,198,1.11,...,0.51,0.0,,0.0,0.0,Y,Y,N,N,
8,HV0003,B02867,B02867,2019-02-01 00:00:35,2019-02-01 00:09:33,2019-02-01 00:10:48,2019-02-01 00:20:23,226,260,1.59,...,0.62,0.0,,0.0,6.51,Y,N,N,N,


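For this computation, Dask loads all of the data and applies the filter only afterwards, because it does not yet have query optimization. (NB: Dask is working on query planning via `dask-expr`, but it is unstable at the moment.) In addition, Dask is written in pure Python, so its query engine will not match the performance of an engine implemented in a compiled language such as Rust or C/C++.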

Daft's query optimization pushes the filter down into the `read_parquet` call, so Daft knows it only needs to look at the `trip_miles` column to evaluate the filter, instead of reading all 24 columns.

In [6]:
import daft
from daft.io import IOConfig, S3Config

In [7]:
# Alternative (disabled): read a local Parquet file instead of the S3 dataset
# df = daft.read_parquet("yellow_tripdata_2023-12.parquet")
# df.show()

# Configure anonymous S3 access (anonymous=True)
io_config = IOConfig(s3=S3Config(anonymous=True))

# Read the Uber/Lyft TLC Parquet dataset with Daft
df = daft.read_parquet(
    "s3://coiled-datasets/uber-lyft-tlc/",
    io_config=io_config,
)
# Display a preview of the DataFrame
df.show()

hvfhs_license_num Utf8,dispatching_base_num Utf8,originating_base_num Utf8,"request_datetime Timestamp(Microseconds, None)","on_scene_datetime Timestamp(Microseconds, None)","pickup_datetime Timestamp(Microseconds, None)","dropoff_datetime Timestamp(Microseconds, None)",PULocationID Int32,DOLocationID Int32,trip_miles Float32,trip_time Int32,base_passenger_fare Float32,tolls Float32,bcf Float32,sales_tax Float32,congestion_surcharge Float32,airport_fee Float32,tips Float32,driver_pay Float32,shared_request_flag Utf8,shared_match_flag Utf8,access_a_ride_flag Utf8,wav_request_flag Utf8,wav_match_flag Utf8,__null_dask_index__ Int64
HV0003,B02867,B02867,2019-02-01T00:01:26.000000,2019-02-01T00:02:55.000000,2019-02-01T00:05:18.000000,2019-02-01T00:14:57.000000,245,251,2.45,579,9.35,0,0.23,0.83,0,,0,7.48,Y,N,N,N,,0
HV0003,B02879,B02879,2019-02-01T00:26:08.000000,2019-02-01T00:41:29.000000,2019-02-01T00:41:29.000000,2019-02-01T00:49:39.000000,216,197,1.71,490,7.91,0,0.2,0.7,0,,2,7.93,N,N,N,N,,1
HV0005,B02510,,2019-02-01T00:48:58.000000,,2019-02-01T00:51:34.000000,2019-02-01T01:28:29.000000,261,234,5.01,2159,44.96,0,1.12,3.99,0,,0,35.97,N,Y,N,N,,2
HV0005,B02510,,2019-02-01T00:02:15.000000,,2019-02-01T00:03:51.000000,2019-02-01T00:07:16.000000,87,87,0.34,179,7.19,0,0.18,0.64,0,,3,5.39,N,Y,N,N,,3
HV0005,B02510,,2019-02-01T00:06:17.000000,,2019-02-01T00:09:44.000000,2019-02-01T00:39:56.000000,87,198,6.84,1799,24.25,0.11,0.61,2.16,0,,4,17.07,N,Y,N,N,,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HV0003,B02764,B02764,2019-07-02T21:32:02.000000,2019-07-02T21:36:23.000000,2019-07-02T21:37:58.000000,2019-07-02T21:51:23.000000,13,50,4.25,806,19.61,0,0,1.7,2.75,,4.81,16.12,N,N,,N,,1068598
HV0003,B02764,B02764,2019-07-02T21:54:41.000000,2019-07-02T21:55:12.000000,2019-07-02T21:55:12.000000,2019-07-02T22:04:46.000000,230,246,1.83,574,11.44,0,0,0.99,2.75,,0,6.73,N,N,,N,,1068599
HV0005,B02510,,2019-07-02T21:17:17.000000,,2019-07-02T21:22:18.000000,2019-07-02T21:26:21.000000,149,149,0.806,243,6.92,0,0.15,0.55,0,,0,5.39,N,N,N,N,N,1068600
HV0005,B02510,,2019-07-02T21:47:01.000000,,2019-07-02T21:49:01.000000,2019-07-02T22:02:14.000000,150,21,3.352,793,14.36,0,0.26,0.93,0,,2.33,10.21,N,N,N,N,N,1068601


In [8]:
# Daft predicate filtering: keep rows where trip_miles < 3
df.where(daft.col("trip_miles") < 3).show()

hvfhs_license_num Utf8,dispatching_base_num Utf8,originating_base_num Utf8,"request_datetime Timestamp(Microseconds, None)","on_scene_datetime Timestamp(Microseconds, None)","pickup_datetime Timestamp(Microseconds, None)","dropoff_datetime Timestamp(Microseconds, None)",PULocationID Int32,DOLocationID Int32,trip_miles Float32,trip_time Int32,base_passenger_fare Float32,tolls Float32,bcf Float32,sales_tax Float32,congestion_surcharge Float32,airport_fee Float32,tips Float32,driver_pay Float32,shared_request_flag Utf8,shared_match_flag Utf8,access_a_ride_flag Utf8,wav_request_flag Utf8,wav_match_flag Utf8,__null_dask_index__ Int64
HV0003,B02867,B02867,2019-02-01T00:01:26.000000,2019-02-01T00:02:55.000000,2019-02-01T00:05:18.000000,2019-02-01T00:14:57.000000,245,251,2.45,579,9.35,0,0.23,0.83,0,,0,7.48,Y,N,N,N,,0
HV0003,B02879,B02879,2019-02-01T00:26:08.000000,2019-02-01T00:41:29.000000,2019-02-01T00:41:29.000000,2019-02-01T00:49:39.000000,216,197,1.71,490,7.91,0,0.2,0.7,0,,2,7.93,N,N,N,N,,1
HV0005,B02510,,2019-02-01T00:02:15.000000,,2019-02-01T00:03:51.000000,2019-02-01T00:07:16.000000,87,87,0.34,179,7.19,0,0.18,0.64,0,,3,5.39,N,Y,N,N,,3
HV0005,B02510,,2019-02-01T00:56:01.000000,,2019-02-01T00:59:55.000000,2019-02-01T01:06:28.000000,198,198,1.11,359,5.75,0,0.14,0.51,0,,0,0,Y,Y,N,N,,5
HV0003,B02867,B02867,2019-02-01T00:00:35.000000,2019-02-01T00:09:33.000000,2019-02-01T00:10:48.000000,2019-02-01T00:20:23.000000,226,260,1.59,574,6.99,0,0.17,0.62,0,,0,6.51,Y,N,N,N,,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HV0003,B02872,B02872,2019-07-02T21:11:09.000000,2019-07-02T21:12:51.000000,2019-07-02T21:13:42.000000,2019-07-02T21:18:00.000000,123,178,0.88,258,5.73,0,0,0.5,0,,0,5.39,N,N,,N,,1068602
HV0003,B02872,B02872,2019-07-02T21:37:48.000000,2019-07-02T21:39:47.000000,2019-07-02T21:41:34.000000,2019-07-02T21:43:07.000000,29,29,0.36,92,7.37,0,0,0.64,0,,0,5.39,N,N,,N,,1068603
HV0003,B02879,B02879,2019-07-02T21:06:42.000000,2019-07-02T21:08:25.000000,2019-07-02T21:10:11.000000,2019-07-02T21:19:35.000000,74,42,1.56,565,8.8,0,0,0.76,0,,0,6.37,N,N,,N,,1068605
HV0003,B02617,B02617,2019-07-02T21:19:37.000000,2019-07-02T21:19:56.000000,2019-07-02T21:22:32.000000,2019-07-02T21:36:08.000000,181,188,2.5,817,11.39,0,0,0.99,0,,0,9.48,N,N,,N,,1068606


## Basic Operations

In [20]:
# Sort by trip_miles with Dask and preview the result
# NOTE(review): the recorded run ended in KeyboardInterrupt -- the sort did not complete
ddf.sort_values("trip_miles").head()

KeyboardInterrupt: 

This operation takes more than 20 minutes with Dask; the run above was interrupted manually.

In [None]:
# Daft equivalent: ascending sort on trip_miles (desc=False), showing 5 rows
df.sort(daft.col("trip_miles"), desc=False).show(5)

This operation kills the kernel immediately with Daft.

In [None]:
# TODO: groupby comparison (Dask vs. Daft)

In [None]:
# TODO: join comparison (Dask vs. Daft)

In [None]:
# TODO: expressions comparison (Dask vs. Daft)

## ML Workloads

## Going Distributed