# Processing NYC taxi data using Ray Data

The NYC Taxi dataset is a popular tabular dataset. In this example, we demonstrate some basic data processing on this dataset using Ray Data.

Source: https://docs.ray.io/en/latest/data/examples/nyc_taxi_basic_processing.html

In [1]:
import logging, os, random, warnings
import ray

warnings.filterwarnings("ignore")
os.environ["PYTHONWARNINGS"] = "ignore"

if ray.is_initialized:
    ray.shutdown()
ray.init(logging_level=logging.ERROR)
ray.data.DataContext.get_current().execution_options.verbose_progress = True

## Read datasets

In [2]:
ds = ray.data.read_parquet("data/yellow_tripdata_2023-02.parquet")

(pid=181352) Parquet Files Sample 0:   0%|          | 0/1 [00:00<?, ?it/s]

In [3]:
ds.schema()

Column                 Type
------                 ----
VendorID               int32
tpep_pickup_datetime   timestamp[us]
tpep_dropoff_datetime  timestamp[us]
passenger_count        int64
trip_distance          double
RatecodeID             int64
store_and_fwd_flag     large_string
PULocationID           int32
DOLocationID           int32
payment_type           int64
fare_amount            double
extra                  double
mta_tax                double
tip_amount             double
tolls_amount           double
improvement_surcharge  double
total_amount           double
congestion_surcharge   double
Airport_fee            double

In [4]:
ds.count()

2913955

In [5]:
ds.take(1)

2023-09-06 20:56:31,875	INFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet->SplitBlocks(200)]
2023-09-06 20:56:31,876	INFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=True)


- ReadParquet->SplitBlocks(200) 1:   0%|          | 0/200 [00:00<?, ?it/s]

Running 0:   0%|          | 0/200 [00:00<?, ?it/s]

[{'VendorID': 1,
  'tpep_pickup_datetime': datetime.datetime(2023, 2, 1, 0, 32, 53),
  'tpep_dropoff_datetime': datetime.datetime(2023, 2, 1, 0, 34, 34),
  'passenger_count': 2,
  'trip_distance': 0.3,
  'RatecodeID': 1,
  'store_and_fwd_flag': 'N',
  'PULocationID': 142,
  'DOLocationID': 163,
  'payment_type': 2,
  'fare_amount': 4.4,
  'extra': 3.5,
  'mta_tax': 0.5,
  'tip_amount': 0.0,
  'tolls_amount': 0.0,
  'improvement_surcharge': 1.0,
  'total_amount': 9.4,
  'congestion_surcharge': 2.5,
  'Airport_fee': 0.0}]

In [6]:
print(f"memory in disk: {ds.size_bytes()/1024**2:.2f} MB")
print(f"memory in memory: {ds.materialize().size_bytes()/1024**2:.2f} MB")

memory in disk: 398.43 MB


Read progress 0:   0%|          | 0/1 [00:00<?, ?it/s]

memory in memory: 391.81 MB


## Data Exploration and Cleaning

In [7]:
# What's the longets trip distance, largest tip amount, and most number of passengers?
ds.max(["trip_distance", "tip_amount", "passenger_count"])

2023-09-06 20:56:33,669	INFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet->SplitBlocks(200)] -> AllToAllOperator[Aggregate]
2023-09-06 20:56:33,670	INFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=True)


- ReadParquet->SplitBlocks(200) 1:   0%|          | 0/200 [00:00<?, ?it/s]

- Aggregate 2:   0%|          | 0/200 [00:00<?, ?it/s]

Shuffle Map 3:   0%|          | 0/200 [00:00<?, ?it/s]

Shuffle Reduce 4:   0%|          | 0/200 [00:00<?, ?it/s]

Running 0:   0%|          | 0/200 [00:00<?, ?it/s]

{'max(trip_distance)': 335004.33,
 'max(tip_amount)': 482.9,
 'max(passenger_count)': 9}

In [8]:
# Drop some columns.
ds = ds.drop_columns(["store_and_fwd_flag", "mta_tax"])

In [9]:
# ds.groupby("passenger_count").count().take()

In [10]:
# Filter our records with negative passenger counts.
ds = ds.map_batches(lambda df: df[df["passenger_count"] > 0])

In [12]:
# Mean trip distance grouped by passenger count.
ds.groupby("passenger_count").mean("trip_distance").take()

2023-09-06 20:59:45,374	INFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet->SplitBlocks(200)] -> TaskPoolMapOperator[MapBatches(<lambda>)->MapBatches(<lambda>)] -> LimitOperator[limit=1]
2023-09-06 20:59:45,375	INFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=True)


- ReadParquet->SplitBlocks(200) 1:   0%|          | 0/200 [00:00<?, ?it/s]

- MapBatches(<lambda>)->MapBatches(<lambda>) 2:   0%|          | 0/200 [00:00<?, ?it/s]

- limit=1 3:   0%|          | 0/1 [00:00<?, ?it/s]

Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[2m[36m(MapBatches(<lambda>)->MapBatches(<lambda>) pid=181356)[0m Task failed with retryable exception: TaskID(f3bc455c6d4ed1faffffffffffffffffffffffff01000000).
[2m[36m(MapBatches(<lambda>)->MapBatches(<lambda>) pid=181356)[0m Traceback (most recent call last):
[2m[36m(MapBatches(<lambda>)->MapBatches(<lambda>) pid=181356)[0m   File "python/ray/_raylet.pyx", line 1191, in ray._raylet.execute_dynamic_generator_and_store_task_outputs
[2m[36m(MapBatches(<lambda>)->MapBatches(<lambda>) pid=181356)[0m   File "python/ray/_raylet.pyx", line 3684, in ray._raylet.CoreWorker.store_task_outputs
[2m[36m(MapBatches(<lambda>)->MapBatches(<lambda>) pid=181356)[0m   File "/home/dino/anaconda3/envs/ray/lib/python3.9/site-packages/ray/data/_internal/execution/operators/map_operator.py", line 415, in _map_task
[2m[36m(MapBatches(<lambda>)->MapBatches(<lambda>) pid=181356)[0m     for b_out in fn(iter(blocks), ctx):
[2m[36m(MapBatches(<lambda>)->MapBatches(<lambda>) pid=181356)[0m   F

RayTaskError(TypeError): [36mray::MapBatches(<lambda>)->MapBatches(<lambda>)()[39m (pid=181368, ip=192.168.1.147)
  File "/home/dino/anaconda3/envs/ray/lib/python3.9/site-packages/ray/data/_internal/execution/operators/map_operator.py", line 415, in _map_task
    for b_out in fn(iter(blocks), ctx):
  File "/home/dino/anaconda3/envs/ray/lib/python3.9/site-packages/ray/data/_internal/planner/plan_udf_map_op.py", line 76, in do_map
    yield from transform_fn(blocks, ctx, *fn_args, **fn_kwargs)
  File "/home/dino/anaconda3/envs/ray/lib/python3.9/site-packages/ray/data/_internal/planner/map_batches.py", line 118, in fn
    yield from process_next_batch(batch)
  File "/home/dino/anaconda3/envs/ray/lib/python3.9/site-packages/ray/data/_internal/planner/map_batches.py", line 79, in process_next_batch
    batch = batch_fn(batch, *fn_args, **fn_kwargs)
  File "/tmp/ipykernel_180920/2032168200.py", line 2, in <lambda>
TypeError: unhashable type: 'numpy.ndarray'

In [13]:
ray.shutdown()