# Getting Started with RayTune

In [34]:
import os 

import ray

from torchvision import datasets, transforms

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

from filelock import FileLock

In [35]:
# Sample-count threshold at which train() stops iterating over its loader.
EPOCH_SIZE = 512
# Sample-count threshold at which test() stops iterating over its loader.
TEST_SIZE = 256

## What you might have on a single node

For reference, see the PyTorch MNIST example: https://github.com/pytorch/examples/blob/master/mnist/main.py

In [50]:
class ConvNet(nn.Module):
    """Small convolutional classifier with 10 output classes.

    The network is a single 3x3 convolution (1 -> 3 channels), a 3x3
    max-pool, a ReLU, and one linear layer over 192 flattened features.
    192 equals 3 channels x 8 x 8, which is for example what a 1x28x28
    image yields after the convolution (26x26) and the pooling (8x8).
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 3, kernel_size=3)
        self.fc = nn.Linear(192, 10)

    def forward(self, x):
        """Return per-class log-probabilities, one row per input sample."""
        conv_out = self.conv1(x)
        pooled = F.max_pool2d(conv_out, 3)
        activated = F.relu(pooled)
        # Flatten to rows of 192 features to match the linear layer's input.
        flat = activated.view(-1, 192)
        logits = self.fc(flat)
        return F.log_softmax(logits, dim=1)

In [51]:
def get_data_loaders(batch_size=64):
    """Create shuffled MNIST training and test data loaders.

    Args:
        batch_size: Number of samples per batch for both loaders. Defaults
            to 64, the value this function always used before the
            parameter existed.

    Returns:
        A ``(train_loader, test_loader)`` tuple of
        ``torch.utils.data.DataLoader`` objects over the MNIST training and
        test splits stored under ``/tmp/data``.
    """
    # NOTE(review): 0.1307 / 0.3081 are presumably the MNIST pixel mean and
    # standard deviation -- confirm before reusing for another dataset.
    mnist_transforms = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.1307, ), (0.3081, ))])

    # When several worker processes run this function at once, each of them
    # may try to download MNIST into the same directory and overwrite files
    # another worker is still writing. The file lock serializes the download.
    # This only matters when trials run in parallel; a single process simply
    # acquires the lock without waiting.
    with FileLock(os.path.expanduser("~/data.lock")):
        train_set = datasets.MNIST(
            "/tmp/data",
            train=True,
            download=True,
            transform=mnist_transforms)

    # No download flag here: the test split is read from the files already
    # placed under /tmp/data by the download above.
    test_set = datasets.MNIST(
        "/tmp/data", train=False, transform=mnist_transforms)

    train_loader = torch.utils.data.DataLoader(
        train_set, batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(
        test_set, batch_size=batch_size, shuffle=True)
    return train_loader, test_loader

In [52]:
def train(model, optimizer, train_loader, device=torch.device("cpu")):
    """Run one truncated pass of gradient updates over ``train_loader``.

    Args:
        model: Module to optimize; it is switched to training mode.
        optimizer: Optimizer whose ``zero_grad``/``step`` drive the updates.
        train_loader: Iterable yielding ``(data, target)`` batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        None. The pass ends as soon as the batch index multiplied by the
        current batch length exceeds ``EPOCH_SIZE``.
    """
    model.train()
    for step, batch in enumerate(train_loader):
        inputs, labels = batch
        # Cap the work done per call instead of consuming the whole loader.
        if step * len(inputs) > EPOCH_SIZE:
            break
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        loss = F.nll_loss(model(inputs), labels)
        loss.backward()
        optimizer.step()

In [53]:
def test(model, data_loader, device=torch.device("cpu")):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(data_loader):
            if batch_idx * len(data) > TEST_SIZE:
                break
            data, target = data.to(device), target.to(device)
            outputs = model(data)
            _, predicted = torch.max(outputs.data, 1)
            total += target.size(0)
            correct += (predicted == target).sum().item()

    return correct / total

In [54]:
def train_mnist(config):
    """Train a fresh ConvNet on MNIST for 10 rounds with plain PyTorch.

    Args:
        config: Mapping whose ``"lr"`` entry is used as the SGD learning rate.

    Returns:
        None. The accuracy is computed after every round but is not
        reported or returned by this version.
    """
    loaders = get_data_loaders()
    train_loader, test_loader = loaders
    net = ConvNet()
    sgd = optim.SGD(net.parameters(), lr=config["lr"])
    for _ in range(10):
        train(net, sgd, train_loader)
        acc = test(net, test_loader)

In [55]:
# Run the single-node training function once with a fixed learning rate.
train_mnist({"lr": 0.01})

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to /tmp/data/MNIST/raw/train-images-idx3-ubyte.gz


100.1%

Extracting /tmp/data/MNIST/raw/train-images-idx3-ubyte.gz to /tmp/data/MNIST/raw


28.4%

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to /tmp/data/MNIST/raw/train-labels-idx1-ubyte.gz


0.5%5%

Extracting /tmp/data/MNIST/raw/train-labels-idx1-ubyte.gz to /tmp/data/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to /tmp/data/MNIST/raw/t10k-images-idx3-ubyte.gz


100.4%

Extracting /tmp/data/MNIST/raw/t10k-images-idx3-ubyte.gz to /tmp/data/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to /tmp/data/MNIST/raw/t10k-labels-idx1-ubyte.gz


180.4%

Extracting /tmp/data/MNIST/raw/t10k-labels-idx1-ubyte.gz to /tmp/data/MNIST/raw
Processing...
Done!


In [56]:
from ray import tune

In [57]:
def train_mnist(config):
    """Train a ConvNet on MNIST and log its accuracy to Tune every round.

    Replaces the earlier definition of the same name; the only addition is
    the per-round ``tune.track.log`` call.

    Args:
        config: Mapping whose ``"lr"`` entry is used as the SGD learning rate.
    """
    train_loader, test_loader = get_data_loaders()
    net = ConvNet()
    sgd = optim.SGD(net.parameters(), lr=config["lr"])
    for _ in range(10):
        train(net, sgd, train_loader)
        round_accuracy = test(net, test_loader)
        # Log this round's accuracy under the "mean_accuracy" key.
        # NOTE(review): tune.track appears to be deprecated in later Ray
        # releases -- confirm the reporting API against the installed version.
        tune.track.log(mean_accuracy=round_accuracy)

In [58]:
# Launch one Tune trial per learning rate in the grid; each trial calls train_mnist.
analysis = tune.run(train_mnist, config={"lr": tune.grid_search([0.001, 0.01, 0.1])})

Trial name,status,loc,lr
train_mnist_00000,RUNNING,,0.001
train_mnist_00001,PENDING,,0.01
train_mnist_00002,PENDING,,0.1


Result for train_mnist_00001:
  date: 2020-04-09_16-50-48
  done: false
  experiment_id: 99520af2a2de482b8503bc0db861e755
  experiment_tag: 1_lr=0.01
  hostname: billmp
  iterations_since_restore: 1
  mean_accuracy: 0.115625
  node_ip: 192.168.1.181
  pid: 20281
  time_since_restore: 0.2769598960876465
  time_this_iter_s: 0.2769598960876465
  time_total_s: 0.2769598960876465
  timestamp: 1586476248
  timesteps_since_restore: 0
  training_iteration: 0
  trial_id: '00001'
  
Result for train_mnist_00002:
  date: 2020-04-09_16-50-48
  done: false
  experiment_id: 6bfa9d80cfa24d3a85482791c115764d
  experiment_tag: 2_lr=0.1
  hostname: billmp
  iterations_since_restore: 1
  mean_accuracy: 0.359375
  node_ip: 192.168.1.181
  pid: 20287
  time_since_restore: 0.347430944442749
  time_this_iter_s: 0.347430944442749
  time_total_s: 0.347430944442749
  timestamp: 1586476248
  timesteps_since_restore: 0
  training_iteration: 0
  trial_id: '00002'
  
Result for train_mnist_00000:
  date: 2020-04-09

Trial name,status,loc,lr,acc,iter,total time (s)
train_mnist_00000,TERMINATED,,0.001,0.13125,9,2.33092
train_mnist_00001,TERMINATED,,0.01,0.740625,9,2.17168
train_mnist_00002,TERMINATED,,0.1,0.78125,9,2.25306


In [59]:
# Report the hyperparameters of the trial with the highest mean_accuracy.
# The direction is passed explicitly (higher accuracy is better) instead of
# relying on the library's default optimization mode.
print("Best config: ", analysis.get_best_config(metric="mean_accuracy", mode="max"))

# Get a dataframe for analyzing trial results.
df = analysis.dataframe()

Best config:  {'lr': 0.1}
