# Training a Torch Image Classifier

In [None]:
# Requirements: uncomment the lines below to install them into this kernel's environment.
# %pip install 'ray[air]'
# %pip install requests torch torchvision

## Load and normalize CIFAR-10

In [1]:
import ray
import torchvision
import torchvision.transforms as transforms

# Fetch the CIFAR-10 train and test splits into the local "data" directory
# (download=True asks torchvision to download them if needed).
# The torchvision datasets get their own names so that `train_dataset` and
# `test_dataset` only ever refer to Ray Datasets, never to two different
# kinds of object within the same cell.
train_torch_dataset = torchvision.datasets.CIFAR10("data", download=True, train=True)
test_torch_dataset = torchvision.datasets.CIFAR10("data", download=True, train=False)

# Wrap the Torch datasets as Ray Datasets so they can be processed with Ray Data.
train_dataset: ray.data.Dataset = ray.data.from_torch(train_torch_dataset)
test_dataset: ray.data.Dataset = ray.data.from_torch(test_torch_dataset)

Files already downloaded and verified
Files already downloaded and verified


2023-09-05 17:06:24,781	INFO worker.py:1612 -- Started a local Ray instance. View the dashboard at 127.0.0.1:8265


Next, let’s convert each batch from a collection of `(image, label)` tuples into a dictionary of NumPy ndarrays, with one entry for the images and one for the labels.

In [2]:
from typing import Dict, Tuple
import numpy as np
from PIL.Image import Image
import torch


def convert_batch_to_numpy(batch) -> Dict[str, np.ndarray]:
    """Turn a batch of (image, label) pairs into a dict of stacked arrays.

    Args:
        batch: A mapping whose ``"item"`` entry is an iterable of
            ``(image, label)`` pairs. Each image must be convertible with
            ``np.array`` and all images must share one shape so that they
            can be stacked.

    Returns:
        A dict with two keys:
            ``"image"``: every image converted to an ndarray and stacked
            along a new leading axis.
            ``"label"``: an ndarray holding the labels in the same order.

    Raises:
        ValueError: Propagated from ``np.stack`` if the batch is empty or
            the images do not all have the same shape.
    """
    pixel_arrays = []
    class_ids = []
    # Single pass over the pairs, splitting images from labels as we go.
    for picture, class_id in batch["item"]:
        pixel_arrays.append(np.array(picture))
        class_ids.append(class_id)
    return {"image": np.stack(pixel_arrays), "label": np.array(class_ids)}


# Apply the tuple -> dict-of-ndarrays conversion to every batch of each split,
# then materialize the result so later cells work with the converted data.
# NOTE: this cell rebinds `train_dataset` / `test_dataset`. Running it a second
# time without re-running the loading cell will fail, because the converted
# datasets no longer have the "item" column that convert_batch_to_numpy reads.
train_mapped = train_dataset.map_batches(convert_batch_to_numpy)
train_dataset = train_mapped.materialize()

test_mapped = test_dataset.map_batches(convert_batch_to_numpy)
test_dataset = test_mapped.materialize()

2023-09-05 17:06:49,828	INFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(convert_batch_to_numpy)]
2023-09-05 17:06:49,828	INFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-09-05 17:06:49,829	INFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


Running 0:   0%|          | 0/200 [00:00<?, ?it/s]

2023-09-05 17:06:50,857	INFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(convert_batch_to_numpy)]
2023-09-05 17:06:50,857	INFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-09-05 17:06:50,858	INFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


Running 0:   0%|          | 0/200 [00:00<?, ?it/s]

In [3]:
# Display the dataset's summary repr (block count, row count and schema).
train_dataset

MaterializedDataset(
   num_blocks=200,
   num_rows=50000,
   schema={image: numpy.ndarray(shape=(32, 32, 3), dtype=uint8), label: int64}
)