In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import ray
import warnings
warnings.filterwarnings("ignore")

In [3]:
ray.init(
    runtime_env={
        "env_vars": {
            "RAY_TRAIN_V2_ENABLED": "1",
        },
    },
)

2025-02-19 20:55:25,857	INFO worker.py:1636 -- Connecting to existing Ray cluster at address: 10.0.55.44:6379...
2025-02-19 20:55:25,867	INFO worker.py:1812 -- Connected to Ray cluster. View the dashboard at [1m[32mhttps://session-3apyclh1wkjd883fht6vsy8xjh.i.anyscaleuserdata.com [39m[22m
2025-02-19 20:55:25,880	INFO packaging.py:393 -- Pushing file package 'gcs://_ray_pkg_f885d3cf135bdb44f9f4e69d498ad1eeb618e0b3.zip' (2.63MiB) to Ray cluster...
2025-02-19 20:55:25,905	INFO packaging.py:406 -- Successfully pushed file package 'gcs://_ray_pkg_f885d3cf135bdb44f9f4e69d498ad1eeb618e0b3.zip'.


0,1
Python version:,3.12.2
Ray version:,2.40.0
Dashboard:,http://session-3apyclh1wkjd883fht6vsy8xjh.i.anyscaleuserdata.com


In [4]:
%%bash
# This will be removed once trainv2 is pushed
echo "RAY_TRAIN_V2_ENABLED=1" > .env

In [5]:
from dotenv import load_dotenv
load_dotenv()

True

### Preprocess

In [6]:
from doggos.data import Preprocessor
from doggos.utils import add_class

In [7]:
# Preprocess data splits
train_ds = ray.data.read_images("s3://doggos-dataset/train", include_paths=True, shuffle="files")
train_ds = train_ds.map(add_class)
val_ds = ray.data.read_images("s3://doggos-dataset/val", include_paths=True)
val_ds = val_ds.map(add_class)

In [8]:
# Preprocess
preprocessor = Preprocessor()
preprocessor = preprocessor.fit(train_ds, column="class")
train_ds = preprocessor.transform(ds=train_ds)
val_ds = preprocessor.transform(ds=val_ds)

2025-02-19 20:55:30,787	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2025-02-19_20-21-51_646153_2288/logs/ray-data
2025-02-19 20:55:30,788	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ListFiles] -> AllToAllOperator[RandomShuffle] -> TaskPoolMapOperator[PartitionFiles] -> TaskPoolMapOperator[ReadFiles] -> TaskPoolMapOperator[Map(add_class)->Project] -> AllToAllOperator[Aggregate]


Running 0: 0.00 row [00:00, ? row/s]

- ListFiles 1: 0.00 row [00:00, ? row/s]

- RandomShuffle 2: 0.00 row [00:00, ? row/s]

Shuffle Map 3:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

Shuffle Reduce 4:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

- PartitionFiles 5: 0.00 row [00:00, ? row/s]

- ReadFiles 6: 0.00 row [00:00, ? row/s]

- Map(add_class)->Project 7: 0.00 row [00:00, ? row/s]

- Aggregate 8: 0.00 row [00:00, ? row/s]

Sort Sample 9:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

Shuffle Map 10:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

Shuffle Reduce 11:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

### Model

In [9]:
from doggos.model import ClassificationModel

In [10]:
# Initialize model
num_classes = len(preprocessor.classes)
model = ClassificationModel(
    embedding_dim=512, 
    hidden_dim=256, 
    dropout_p=0.3, 
    num_classes=num_classes,
)
print (model.named_parameters)

<bound method Module.named_parameters of ClassificationModel(
  (fc1): Linear(in_features=512, out_features=256, bias=True)
  (batch_norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.3, inplace=False)
  (fc2): Linear(in_features=256, out_features=36, bias=True)
)>


<p style="font-weight: bold; background-color: yellow; padding: 10px; display: inline;">TODO</p>

- mention that this is all the same Pytorch model / code

### Batching

In [11]:
from doggos.model import collate_fn

In [12]:
# Sample batch
sample_batch = train_ds.take_batch(batch_size=3)
collate_fn(batch=sample_batch)

2025-02-19 20:55:59,341	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2025-02-19_20-21-51_646153_2288/logs/ray-data
2025-02-19 20:55:59,342	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ListFiles] -> AllToAllOperator[RandomShuffle] -> TaskPoolMapOperator[PartitionFiles] -> TaskPoolMapOperator[ReadFiles] -> TaskPoolMapOperator[Map(add_class)->Map(Preprocessor.convert_to_label)] -> ActorPoolMapOperator[MapBatches(EmbeddingGenerator)] -> TaskPoolMapOperator[MapBatches(drop_columns)] -> LimitOperator[limit=3]


Running 0: 0.00 row [00:00, ? row/s]

- ListFiles 1: 0.00 row [00:00, ? row/s]

- RandomShuffle 2: 0.00 row [00:00, ? row/s]

Shuffle Map 3:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

Shuffle Reduce 4:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

- PartitionFiles 5: 0.00 row [00:00, ? row/s]

- ReadFiles 6: 0.00 row [00:00, ? row/s]

- Map(add_class)->Map(Preprocessor.convert_to_label) 7: 0.00 row [00:00, ? row/s]

- MapBatches(EmbeddingGenerator) 8: 0.00 row [00:00, ? row/s]

- MapBatches(drop_columns) 9: 0.00 row [00:00, ? row/s]

- limit=3 10: 0.00 row [00:00, ? row/s]



{'embedding': tensor([[-0.0223,  0.4840, -0.2503,  ...,  0.4477,  0.8372,  0.2221],
         [-0.0402,  0.3278, -0.1676,  ...,  1.7848,  0.1375,  0.4853],
         [-0.0475,  0.0736,  0.1093,  ...,  0.1440, -0.2357,  0.1500]]),
 'label': tensor([ 6,  7, 13])}

### Train loop

In [13]:
import mlflow
from pathlib import Path
from ray.air.integrations.mlflow import MLflowLoggerCallback
import time

<p style="font-weight: bold; background-color: yellow; padding: 10px; display: inline;">TODO</p>

- made it clearn how little change was required
- remove Ray AIR dependency
- updates with Train V2 work

In [14]:
# Train loop config
model_registry = "/mnt/user_storage/mlflow"
os.makedirs(model_registry, exist_ok=True)
experiment_name = "doggos"
train_loop_config = {
    "model_registry": model_registry,
    "experiment_name": experiment_name,
    "embedding_dim": 512,
    "hidden_dim": 256,
    "dropout_p": 0.3,
    "lr": 1e-3,
    "lr_factor": 0.8,
    "lr_patience": 3,
    "num_epochs": 20,
    "batch_size": 256,
}

<p style="font-weight: bold; background-color: yellow; padding: 10px; display: inline;">TODO</p>

- add links to mlflow and W&B integration Ray docs pages

In [15]:
# Scaling config
num_workers = 4
scaling_config = ray.train.ScalingConfig(
    num_workers=num_workers,
    use_gpu=True,
    resources_per_worker={"CPU": 8, "GPU": 1})

### Trainer

```bash
mlflow server -h 0.0.0.0 -p 8080 --backend-store-uri /mnt/user_storage/mlflow
```

<p style="font-weight: bold; background-color: yellow; padding: 10px; display: inline;">TODO</p>

- add metrics snapshots from mlflow dashboard
- mention how to view other ports
- show Train dashboard

In [16]:
from ray.train.torch import TorchTrainer

In [17]:
from doggos.data import Preprocessor
from doggos.train import train_loop_per_worker
from doggos.utils import add_class, set_seeds

In [23]:
# Trainer
set_seeds()
train_loop_config["class_to_label"] = preprocessor.class_to_label
train_loop_config["num_classes"] = len(preprocessor.class_to_label)
trainer = TorchTrainer(
    train_loop_per_worker=train_loop_per_worker,
    train_loop_config=train_loop_config,
    scaling_config=scaling_config,
    datasets={"train": train_ds, "val": val_ds},
)

<p style="font-weight: bold; background-color: yellow; padding: 10px; display: inline;">TODO</p>

- do a delta of the train_loop and explain (clarify that it's very few changes)
- breakdown checkpointing and transparency
- mention advantages compared to ddp
- mention scale and fault tolerance
- mention ability to do this with more available commodity hardware instead of large, expensive and rarely available nodes

In [None]:
# Train
results = trainer.fit()

(autoscaler +30m18s) Tip: use `ray status` to view detailed cluster status. To disable these messages, set RAY_SCHEDULER_EVENTS=0.
(autoscaler +1h31m32s) [autoscaler] Downscaling node i-0cb624b652d6ef685 (node IP: 10.0.103.180) due to node idle termination.
(autoscaler +1h31m32s) [autoscaler] Downscaling node i-02fe9a53fda8a6d64 (node IP: 10.0.111.20) due to node idle termination.


In [25]:
# Sorted runs
mlflow.set_tracking_uri(f"file:{model_registry}")
sorted_runs = mlflow.search_runs(
    experiment_names=[experiment_name], 
    order_by=["metrics.val_loss ASC"])
best_run = sorted_runs.iloc[0]
best_run

run_id                                      b21fe4c9bf42432bbbe216f8c86f58f1
experiment_id                                             811160940632726903
status                                                              FINISHED
artifact_uri               file:///mnt/user_storage/mlflow/81116094063272...
start_time                                  2025-02-14 22:39:50.663000+00:00
end_time                                    2025-02-14 22:52:24.720000+00:00
metrics.lr                                                             0.001
metrics.train_loss                                                  0.371551
metrics.val_loss                                                    0.667554
params.num_epochs                                                         20
params.lr                                                              0.001
params.embedding_dim                                                     512
params.batch_size                                                        256

[36m(autoscaler +17m42s)[0m [autoscaler] Downscaling node i-09245fb85d606077d (node IP: 10.0.41.61) due to node idle termination.
[36m(autoscaler +17m42s)[0m [autoscaler] Downscaling node i-0a1015c5cb6f56103 (node IP: 10.0.29.99) due to node idle termination.


```bash
# Production offline job
anyscale job submit --name=train-model \
  --image-uri="anyscale/image/doggos:3" \
  --compute-config=doggos:1 \
  --working-dir=. \
  --exclude="" \
  --max-retries=0 \
  -- python doggos/train.py
```

<p style="font-weight: bold; background-color: yellow; padding: 10px; display: inline;">TODO</p>

- best practice to do just a few epochs first (and see train and val loss decrease)
- best practice to do data processing outside of the train loop so it just happens once (embedding and preprocessing in our case)
- easy to wrap all of this as just another job
- mention continue model training
- mention RayTurbo train utils (elastic training)
- mention Ray Tune
- add links to other relevant docs examples
- [rayturbo train](https://docs.anyscale.com/rayturbo/generated/rayturbo-train) features

In [26]:
import IPython
IPython.get_ipython().kernel.do_shutdown(restart=True)

{'status': 'ok', 'restart': True}

: 