In [None]:
# Enable IPython's autoreload extension so edits to imported project modules
# are picked up live without restarting the kernel.
%load_ext autoreload
# Mode 2 of the autoreload extension.
%autoreload 2

In [None]:
import os
import ray
import warnings
warnings.filterwarnings("ignore")

In [None]:
from doggos.utils import add_class

In [None]:
# Load the training images (keeping each file's path, shuffling file order)
# and attach a class to every row in one chained expression.
ds = ray.data.read_images(
    "s3://doggos-dataset/train",
    include_paths=True,
    shuffle="files",
).map(add_class)

2025-02-19 21:46:16,045	INFO worker.py:1636 -- Connecting to existing Ray cluster at address: 10.0.55.44:6379...
2025-02-19 21:46:16,054	INFO worker.py:1812 -- Connected to Ray cluster. View the dashboard at [1m[32mhttps://session-3apyclh1wkjd883fht6vsy8xjh.i.anyscaleuserdata.com [39m[22m
2025-02-19 21:46:16,061	INFO packaging.py:393 -- Pushing file package 'gcs://_ray_pkg_38952b6614b975cd0034889cea16170ba2a3be71.zip' (2.86MiB) to Ray cluster...
2025-02-19 21:46:16,088	INFO packaging.py:406 -- Successfully pushed file package 'gcs://_ray_pkg_38952b6614b975cd0034889cea16170ba2a3be71.zip'.


### Data Preprocessing

<p style="font-weight: bold; background-color: yellow; padding: 10px; display: inline;">TODO</p>

- overall diagram for this section (more detailed than overall e2e)

In [None]:
from doggos.embed import EmbeddingGenerator

In [None]:
class Preprocessor:
    """Encode class names as integer labels and embed images.

    The preprocessor holds two mirrored lookup tables:

    * ``class_to_label``: class name -> integer label
    * ``label_to_class``: integer label -> class name

    They are either supplied up front (e.g. a mapping previously written by
    ``save``) or learned from a dataset with ``fit``.
    """

    def __init__(self, class_to_label=None):
        """Create a preprocessor, optionally from an existing mapping.

        Args:
            class_to_label: optional dict mapping class name -> integer label.
                ``None`` (or an empty mapping) yields an unfitted
                preprocessor with empty lookup tables.
        """
        # Copy the mapping so that later mutation of the caller's dict cannot
        # silently desynchronise class_to_label and label_to_class.
        self.class_to_label = dict(class_to_label) if class_to_label else {}
        self.label_to_class = {v: k for k, v in self.class_to_label.items()}
        # Keep `classes` defined even before fit() so attribute access is safe.
        self.classes = list(self.class_to_label)

    def fit(self, ds, column):
        """Learn the label mapping from the unique values of ``column``.

        Args:
            ds: dataset exposing ``unique(column=...)``
                (presumably a ``ray.data.Dataset`` -- confirm against callers).
            column: name of the column holding the class names.

        Returns:
            ``self``, so calls can be chained.
        """
        self.classes = ds.unique(column=column)
        # Labels are assigned in the order in which `unique` returns the values.
        self.class_to_label = {tag: i for i, tag in enumerate(self.classes)}
        self.label_to_class = {v: k for k, v in self.class_to_label.items()}
        return self

    def convert_to_label(self, row, class_to_label):
        """Add a ``"label"`` entry to ``row`` based on its ``"class"`` entry.

        Rows without a ``"class"`` key are returned unchanged.

        Args:
            row: dict-like record; modified in place.
            class_to_label: mapping of class name -> integer label.

        Returns:
            The same ``row`` object.

        Raises:
            KeyError: if ``row["class"]`` is not present in ``class_to_label``.
        """
        if "class" in row:
            row["label"] = class_to_label[row["class"]]
        return row

    def transform(
        self,
        ds,
        concurrency=4,
        batch_size=64,
        num_gpus=1,
        model_id="openai/clip-vit-base-patch32",
        device="cuda",
    ):
        """Label every row, generate embeddings and drop the raw image column.

        Args:
            ds: dataset to transform; it must support ``map``, ``map_batches``
                and ``drop_columns`` and contain an ``"image"`` column.
            concurrency: forwarded to ``map_batches`` as ``concurrency``.
            batch_size: forwarded to ``map_batches`` as ``batch_size``.
            num_gpus: forwarded to ``map_batches`` as ``num_gpus``.
            model_id: model identifier passed to the ``EmbeddingGenerator``
                constructor.
            device: device string passed to each ``EmbeddingGenerator`` call.

        Returns:
            The transformed dataset, without the ``"image"`` column.
        """
        ds = ds.map(
            self.convert_to_label,
            fn_kwargs={"class_to_label": self.class_to_label},
        )
        ds = ds.map_batches(
            EmbeddingGenerator,
            fn_constructor_kwargs={"model_id": model_id},
            fn_kwargs={"device": device},
            concurrency=concurrency,
            batch_size=batch_size,
            num_gpus=num_gpus,
        )
        ds = ds.drop_columns(["image"])
        return ds

    def save(self, fp):
        """Write ``class_to_label`` to ``fp`` as JSON.

        Only ``class_to_label`` is persisted; passing the loaded mapping back
        to the constructor rebuilds ``label_to_class``.

        Args:
            fp: destination file path.
        """
        # Imported locally: the notebook never imports json at the top level,
        # so the original implementation raised NameError here.
        import json

        with open(fp, "w") as f:
            json.dump(self.class_to_label, f)

In [None]:
# Fit the label mapping on the "class" column (fit returns the preprocessor
# itself), then label and embed the dataset.
preprocessor = Preprocessor().fit(ds, column="class")
ds = preprocessor.transform(ds=ds)
ds.take(1)  # peek at a single processed row

2025-02-19 21:46:18,232	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2025-02-19_20-21-51_646153_2288/logs/ray-data
2025-02-19 21:46:18,232	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ListFiles] -> AllToAllOperator[RandomShuffle] -> TaskPoolMapOperator[PartitionFiles] -> TaskPoolMapOperator[ReadFiles] -> TaskPoolMapOperator[Map(add_class)->Project] -> AllToAllOperator[Aggregate]


Running 0: 0.00 row [00:00, ? row/s]

- ListFiles 1: 0.00 row [00:00, ? row/s]

- RandomShuffle 2: 0.00 row [00:00, ? row/s]

Shuffle Map 3:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

Shuffle Reduce 4:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

- PartitionFiles 5: 0.00 row [00:00, ? row/s]

- ReadFiles 6: 0.00 row [00:00, ? row/s]

- Map(add_class)->Project 7: 0.00 row [00:00, ? row/s]

- Aggregate 8: 0.00 row [00:00, ? row/s]

Sort Sample 9:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

Shuffle Map 10:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

Shuffle Reduce 11:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

2025-02-19 21:46:47,267	INFO dataset.py:2631 -- Tip: Use `take_batch()` instead of `take() / show()` to return records in pandas or numpy batch format.
2025-02-19 21:46:47,272	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2025-02-19_20-21-51_646153_2288/logs/ray-data
2025-02-19 21:46:47,272	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ListFiles] -> AllToAllOperator[RandomShuffle] -> TaskPoolMapOperator[PartitionFiles] -> TaskPoolMapOperator[ReadFiles] -> TaskPoolMapOperator[Map(add_class)->Map(Preprocessor.convert_to_label)] -> ActorPoolMapOperator[MapBatches(EmbeddingGenerator)] -> TaskPoolMapOperator[MapBatches(drop_columns)] -> LimitOperator[limit=1]


Running 0: 0.00 row [00:00, ? row/s]

- ListFiles 1: 0.00 row [00:00, ? row/s]

- RandomShuffle 2: 0.00 row [00:00, ? row/s]

Shuffle Map 3:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

Shuffle Reduce 4:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

- PartitionFiles 5: 0.00 row [00:00, ? row/s]

- ReadFiles 6: 0.00 row [00:00, ? row/s]

- Map(add_class)->Map(Preprocessor.convert_to_label) 7: 0.00 row [00:00, ? row/s]

- MapBatches(EmbeddingGenerator) 8: 0.00 row [00:00, ? row/s]

- MapBatches(drop_columns) 9: 0.00 row [00:00, ? row/s]

- limit=1 10: 0.00 row [00:00, ? row/s]

[{'path': 'doggos-dataset/train/chihuahua/chihuahua_9351.jpg',
  'class': 'chihuahua',
  'label': 7,
  'embedding': array([ 5.14690615e-02,  4.56984550e-01, -1.96141481e-01,  1.96072102e-01,
          3.63849178e-02, -9.10888519e-03,  8.13270450e-01, -4.60949033e-01,
         -1.20805547e-01,  4.77093220e-01, -2.13734910e-01,  7.84118474e-02,
         -2.26395577e-02,  4.01064634e-01,  2.08187252e-01, -1.83848500e-01,
          6.94388509e-01,  1.16164386e-01,  2.97221482e-01, -1.05287388e-01,
         -2.95525976e-02,  6.30802810e-02,  8.02717268e-01, -4.17372920e-02,
         -1.29790753e-01,  2.96382383e-02,  3.50249171e-01,  8.16543251e-02,
         -8.15885141e-02,  5.16429879e-02,  2.69967824e-01,  2.62760997e-01,
         -2.02631578e-01, -4.41727526e-02,  1.73084587e-02, -1.95053637e-01,
          3.69352162e-01,  8.56357589e-02, -5.04600704e-01,  1.63473415e+00,
         -2.70452470e-01, -5.48010245e-02,  3.38966310e-01, -1.84092671e-01,
          2.83674031e-01, -5.40114641e-

<p style="font-weight: bold; background-color: yellow; padding: 10px; display: inline;">TODO</p>

- explain what's happening in `preprocessor.transform()`
- mention the scale and fault tolerance
- mention how we can easily have heterogeneous workloads
- mention how CPU/GPU idle time is eliminated with [streaming execution](https://www.anyscale.com/blog/streaming-distributed-execution-across-cpus-and-gpus) (and [visual](https://www.anyscale.com/blog/offline-batch-inference-comparing-ray-apache-spark-and-sagemaker))
- mention optimizations
- mention RayTurbo utils for data
- mention Ray Data advantages over Spark (for unstructured and structured data)

In [None]:
# Restart the notebook kernel as the final step.
# NOTE(review): presumably done to release resources held by this session
# (e.g. the Ray connection and GPU actors) -- confirm the intent.
import IPython
IPython.get_ipython().kernel.do_shutdown(restart=True)

{'status': 'ok', 'restart': True}

: 