In [1]:
import argparse
from typing import Dict
from ray.air import session

import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

import ray.train as train
from ray.train.torch import TorchTrainer
from ray.air.config import ScalingConfig

# Download training data from open datasets.
training_data = datasets.FashionMNIST(
    root="~/data",
    train=True,
    download=True,
    transform=ToTensor(),
)

# Download test data from open datasets.
test_data = datasets.FashionMNIST(
    root="~/data",
    train=False,
    download=True,
    transform=ToTensor(),
)


# Define model
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28 * 28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
            nn.ReLU(),
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits


def train_epoch(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset) // session.get_world_size()
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")


def validate_epoch(dataloader, model, loss_fn):
    size = len(dataloader.dataset) // session.get_world_size()
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(
        f"Test Error: \n "
        f"Accuracy: {(100 * correct):>0.1f}%, "
        f"Avg loss: {test_loss:>8f} \n"
    )
    return test_loss


def train_func(config: Dict):
    batch_size = config["batch_size"]
    lr = config["lr"]
    epochs = config["epochs"]

    worker_batch_size = batch_size // session.get_world_size()

    # Create data loaders.
    train_dataloader = DataLoader(training_data, batch_size=worker_batch_size)
    test_dataloader = DataLoader(test_data, batch_size=worker_batch_size)

    train_dataloader = train.torch.prepare_data_loader(train_dataloader)
    test_dataloader = train.torch.prepare_data_loader(test_dataloader)

    # Create model.
    model = NeuralNetwork()
    model = train.torch.prepare_model(model)

    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    for _ in range(epochs):
        train_epoch(train_dataloader, model, loss_fn, optimizer)
        loss = validate_epoch(test_dataloader, model, loss_fn)
        session.report(dict(loss=loss))


def train_fashion_mnist(num_workers=2, use_gpu=False):
    trainer = TorchTrainer(
        train_loop_per_worker=train_func,
        train_loop_config={"lr": 1e-3, "batch_size": 64, "epochs": 4},
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    result = trainer.fit()
    print(f"Last result: {result.metrics}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--address", required=False, type=str, help="the address to use for Ray"
    )
    parser.add_argument(
        "--num-workers",
        "-n",
        type=int,
        default=1,
        help="Sets number of workers for training.",
    )
    parser.add_argument(
        "--use-gpu", action="store_true", default=True, help="Enables GPU training"
    )
    parser.add_argument(
        "--smoke-test",
        action="store_true",
        default=False,
        help="Finish quickly for testing.",
    )

    args, _ = parser.parse_known_args()

    import ray

    if args.smoke_test:
        # 2 workers + 1 for trainer.
        ray.init(num_cpus=3)
        train_fashion_mnist()
    else:
        ray.init(address=args.address)
        train_fashion_mnist(num_workers=args.num_workers, use_gpu=args.use_gpu)

2023-06-15 21:59:48,336	INFO worker.py:1432 -- Connecting to existing Ray cluster at address: 192.168.3.51:6378...
2023-06-15 21:59:48,350	INFO worker.py:1616 -- Connected to Ray cluster. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


0,1
Current time:,2023-06-15 22:00:40
Running for:,00:00:51.58
Memory:,19.7/62.4 GiB

Trial name,status,loc,iter,total time (s),loss
TorchTrainer_e42ce_00000,TERMINATED,192.168.3.51:946823,4,44.1484,1.86265


[2m[36m(RayTrainWorker pid=946899)[0m 2023-06-15 21:59:57,292	INFO config.py:86 -- Setting up process group for: env:// [rank=0, world_size=1]
[2m[36m(RayTrainWorker pid=946899)[0m 2023-06-15 21:59:59,256	INFO train_loop_utils.py:286 -- Moving model to device: cuda:0


[2m[36m(RayTrainWorker pid=946899)[0m loss: 2.302072  [    0/60000]
[2m[36m(RayTrainWorker pid=946899)[0m loss: 2.298983  [ 6400/60000]
[2m[36m(RayTrainWorker pid=946899)[0m loss: 2.294350  [12800/60000]
[2m[36m(RayTrainWorker pid=946899)[0m loss: 2.289459  [19200/60000]
[2m[36m(RayTrainWorker pid=946899)[0m loss: 2.275516  [25600/60000]
[2m[36m(RayTrainWorker pid=946899)[0m loss: 2.276099  [32000/60000]
[2m[36m(RayTrainWorker pid=946899)[0m loss: 2.252974  [38400/60000]
[2m[36m(RayTrainWorker pid=946899)[0m loss: 2.252532  [44800/60000]
[2m[36m(RayTrainWorker pid=946899)[0m loss: 2.248592  [51200/60000]
[2m[36m(RayTrainWorker pid=946899)[0m loss: 2.236028  [57600/60000]


Trial name,date,done,experiment_tag,hostname,iterations_since_restore,loss,node_ip,pid,time_since_restore,time_this_iter_s,time_total_s,timestamp,training_iteration,trial_id
TorchTrainer_e42ce_00000,2023-06-15_22-00-37,True,0,mds-hp,4,1.86265,192.168.3.51,946823,44.1484,9.90029,44.1484,1686837637,4,e42ce_00000


[2m[36m(RayTrainWorker pid=946899)[0m Test Error: 
[2m[36m(RayTrainWorker pid=946899)[0m  Accuracy: 37.1%, Avg loss: 2.243820 
[2m[36m(RayTrainWorker pid=946899)[0m 
[2m[36m(RayTrainWorker pid=946899)[0m loss: 2.244646  [    0/60000]
[2m[36m(RayTrainWorker pid=946899)[0m loss: 2.255142  [ 6400/60000]
[2m[36m(RayTrainWorker pid=946899)[0m loss: 2.241847  [12800/60000]
[2m[36m(RayTrainWorker pid=946899)[0m loss: 2.236473  [19200/60000]
[2m[36m(RayTrainWorker pid=946899)[0m loss: 2.208714  [25600/60000]
[2m[36m(RayTrainWorker pid=946899)[0m loss: 2.206965  [32000/60000]
[2m[36m(RayTrainWorker pid=946899)[0m loss: 2.162524  [38400/60000]
[2m[36m(RayTrainWorker pid=946899)[0m loss: 2.153300  [44800/60000]
[2m[36m(RayTrainWorker pid=946899)[0m loss: 2.132694  [51200/60000]
[2m[36m(RayTrainWorker pid=946899)[0m loss: 2.150818  [57600/60000]
[2m[36m(RayTrainWorker pid=946899)[0m Test Error: 
[2m[36m(RayTrainWorker pid=946899)[0m  Accuracy: 37.4%, Av

2023-06-15 22:00:40,283	INFO tune.py:945 -- Total run time: 51.61 seconds (51.58 seconds for the tuning loop).


Last result: {'loss': 1.8626508948149954, 'timestamp': 1686837637, 'time_this_iter_s': 9.900285720825195, 'done': True, 'training_iteration': 4, 'trial_id': 'e42ce_00000', 'date': '2023-06-15_22-00-37', 'time_total_s': 44.148433208465576, 'pid': 946823, 'hostname': 'mds-hp', 'node_ip': '192.168.3.51', 'config': {'train_loop_config': {'lr': 0.001, 'batch_size': 64, 'epochs': 4}}, 'time_since_restore': 44.148433208465576, 'iterations_since_restore': 4, 'experiment_tag': '0'}
