In [31]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [32]:
# Install idx2numpy package for extracting data
!pip install idx2numpy ray



In [33]:
# Import packages
import os
import gzip
import torch
import torchvision
import numpy as np 

import idx2numpy
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [34]:
from ray import tune
# Import packages
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
from functools import partial

In [35]:
def load_one_dataset(path):
    '''
    Convenience function to load a single dataset
    '''
    f = gzip.open(path, 'rb')
    data = torch.from_numpy(idx2numpy.convert_from_file(f))
    f.close()
    
    return(data)


def load_all_datasets(train_imgs, train_labs, test_imgs, test_labs, batch_size):
    '''
    Load training as well as test images here
    '''
    train_images = load_one_dataset(train_imgs).type(torch.float32)
    train_labels = load_one_dataset(train_labs).type(torch.long)
    train = list(zip(train_images, train_labels))

    test_images = load_one_dataset(test_imgs).type(torch.float32)
    test_labels = load_one_dataset(test_labs).type(torch.long)
    test = list(zip(test_images, test_labels))
    
    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True, num_workers=2)
    test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False, num_workers=2)
    
    return(train_loader, test_loader)

In [36]:
class Net(nn.Module):
  
  def __init__(self, nb_units, input_dim, output_dim):
    '''
    Declare the network architecture here
    '''
    super(Net, self).__init__()
    
    # Initialize a list to store layers
    fc = []

    # Add input and output dimensions to layer list
    self.nb_units = [input_dim] + nb_units + [output_dim]

    # Now compute the total no. of layers
    self.nb_layers = len(self.nb_units)

    # Now append the hidden layers
    for i in range(1, self.nb_layers):
      fc.append(nn.Linear(self.nb_units[i-1], self.nb_units[i]))
    
    # Wrap this in a module list 
    self.fc = nn.ModuleList(fc)
    
  
  def forward(self, x):
    '''
    Send input forward through 
    the network
    '''
    # Reshape 28X28 images to be 784 X 784
    x = x.view(-1, 28*28)

    # Send example through network
    for layer in self.fc: x = F.relu(layer(x))
    
    return x

In [37]:
def train(config, input_dim=784, output_dim = 10, epochs=2, 
          checkpoint_dir = '/content/drive/MyDrive/checkpoints/',
          data_dir = '/content/drive/MyDrive/data/'):
    '''
    This is the main training loop
    '''
    
    # Set paths to datasets
    paths = {
        'train_imgs': os.path.join(data_dir, 'train-images-idx3-ubyte.gz'),
        'train_labs': os.path.join(data_dir, 'train-labels-idx1-ubyte.gz'),
        'test_imgs': os.path.join(data_dir,'t10k-images-idx3-ubyte.gz'),
        'test_labs': os.path.join(data_dir,'t10k-labels-idx1-ubyte.gz')
    }
    
    # Load datasets
    train_loader, test_loader = load_all_datasets(**paths, batch_size = config['batch_size'])
    
    # Set parameters
    net = Net(config['nb_units'], input_dim, output_dim)

    # Set device
    if torch.cuda.is_available():
      device = torch.device("cuda")
    else:
      device = torch.device("cpu")
    
    # Send net object to device memory
    net.to(device)
    
    # We use the cross-entropy loss
    criterion = nn.CrossEntropyLoss()

    # We use mini-batch stochastic gradient descent with momentum
    optimizer = optim.Adam(net.parameters(), lr=config['lr'], weight_decay=config['weight_decay'])

    # Load from previously stored results if specified
    if checkpoint_dir:
      model_state, optimizer_state = torch.load(os.path.join(checkpoint_dir, "checkpoint"))
      net.load_state_dict(model_state)
      optimizer.load_state_dict(optimizer_state)
    
    # Loop over the dataset multiple times
    for epoch in range(epochs):  
        
        # Initialize running loss
        running_loss = 0.0
        running_accuracy = 0.0

        # Initialize the validation running loss
        val_running_loss = 0.0
        val_running_accuracy = 0.0
        
        # Iterate through data now
        for i, data in enumerate(train_loader):
            
            # Get the inputs: data is a list of [inputs, labels]
            inputs, labels = data
            
            # Send the inputs to the memory of the device
            inputs, labels = inputs.to(device), labels.to(device)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward
            outputs = net(inputs)
            
            # Calculate loss
            loss = criterion(outputs, labels)

            # Get predictions
            _, preds = torch.max(outputs.data, 1)
            
            # Backward
            loss.backward()
            
            # Optimize
            optimizer.step()

            # Add to running loss
            running_loss += loss.item()

            # Add to running accuracy
            running_accuracy += (preds == labels).float().sum()
        
        # Loop through the validation data
        for j, data in enumerate(test_loader):
          
          # No need to calculate gradients for validation set
          with torch.no_grad():

              # Get the data item 
              val_inputs, val_labels = data
              val_inputs, val_labels = val_inputs.to(device), val_labels.to(device)

              # Send the data item through the network to get output
              val_outputs = net(val_inputs)

              # Compute the loss
              val_loss = criterion(val_outputs, val_labels)

              # Get predictions
              _, val_preds = torch.max(val_outputs.data, 1)

              # Add to running loss
              val_running_loss += val_loss.item()

              # Add to running accuracy
              val_running_accuracy += (val_preds == val_labels).float().sum()
        
        # Rescale the training and validation perfomance metrics
        running_loss = running_loss/len(train_loader)
        running_accuracy = running_accuracy/(len(train_loader)*config['batch_size'])
        
        # Rescale the validation loss
        val_running_loss = val_running_loss/len(test_loader)
        val_running_accuracy = val_running_accuracy/(len(test_loader)*config['batch_size'])
        
        # Make print message format string
        msg = "{}, Epoch:{}, Loss:{}, Accuracy:{}," "\n"

        # Print performance
        print(msg.format("Training", epoch, running_loss, running_accuracy))
        print(msg.format("Validation", epoch, val_running_loss, val_running_accuracy))
        
        # Store the end of each epoch as a checkpoint
        with tune.checkpoint_dir(epoch) as checkpoint_dir:
          path = os.path.join(checkpoint_dir, "checkpoint")
          torch.save((net.state_dict(), optimizer.state_dict()), path)
        
        # Send results back to tune to display in report
        tune.report(loss= val_running_loss, accuracy= val_running_accuracy)

    # Print message
    print('Done training...')

In [38]:
def get_tuning_search_space():
  '''
  Return a grid of configurable parameters for tuning
  '''
  n_layers = np.random.choice([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

  config = {
      "nb_units" : [2**np.random.randint(2, 9) for i in range(n_layers)],
      "lr": tune.loguniform(1e-4, 1e-1),
      "momentum": tune.choice([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]),
      "batch_size": tune.choice([2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]),
      "weight_decay": tune.loguniform(1e-4, 1e-1),
  }

  return(config)

In [39]:
def main( gpus_per_trial = 1, num_samples = 5, max_num_epochs=50):

  config = get_tuning_search_space()
  data_dir = '/content/drive/MyDrive/data/'

  scheduler = ASHAScheduler(metric="loss", mode="min", max_t=max_num_epochs, grace_period=1, reduction_factor=2)
  reporter = CLIReporter(metric_columns=["loss", "accuracy", "training_iteration"])

  result = tune.run(
    partial(train, data_dir=data_dir),
    resources_per_trial={"cpu": 2, "gpu": gpus_per_trial},
    config=config,
    num_samples=num_samples,
    scheduler=scheduler,
    progress_reporter=reporter,
    checkpoint_at_end=False)
  
  return(result)

In [40]:
result = main()

2020-12-01 18:45:03,266	INFO registry.py:65 -- Detected unknown callable for trainable. Converting to class.


== Status ==
Memory usage on this node: 2.9/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 32.000: None | Iter 16.000: None | Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 2/2 CPUs, 1/1 GPUs, 0.0/7.23 GiB heap, 0.0/2.49 GiB objects (0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/DEFAULT_2020-12-01_18-45-03
Number of trials: 1/5 (1 RUNNING)
+---------------------+----------+-------+--------------+------------+------------+----------------+
| Trial name          | status   | loc   |   batch_size |         lr |   momentum |   weight_decay |
|---------------------+----------+-------+--------------+------------+------------+----------------|
| DEFAULT_52c1e_00000 | RUNNING  |       |           16 | 0.00402083 |        0.4 |     0.00539935 |
+---------------------+----------+-------+--------------+------------+------------+----------------+




[2m[36m(pid=2261)[0m   


Result for DEFAULT_52c1e_00000:
  accuracy: tensor(0.4886, device='cuda:0')
  date: 2020-12-01_18-45-48
  done: false
  experiment_id: 72b11e9f6f1140e58399295bce9e81be
  experiment_tag: 0_batch_size=16,lr=0.0040208,momentum=0.4,weight_decay=0.0053993
  hostname: f4686065c2f2
  iterations_since_restore: 1
  loss: 1.502837603187561
  node_ip: 172.28.0.2
  pid: 2261
  should_checkpoint: true
  time_since_restore: 43.58958196640015
  time_this_iter_s: 43.58958196640015
  time_total_s: 43.58958196640015
  timestamp: 1606848348
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 52c1e_00000
  
== Status ==
Memory usage on this node: 4.9/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 32.000: None | Iter 16.000: None | Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: -1.502837603187561
Resources requested: 2/2 CPUs, 1/1 GPUs, 0.0/7.23 GiB heap, 0.0/2.49 GiB objects (0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/DEFAULT_2020-12-01_18-

[2m[36m(pid=2334)[0m   


Result for DEFAULT_52c1e_00001:
  accuracy: tensor(0.0998, device='cuda:0')
  date: 2020-12-01_18-46-55
  done: true
  experiment_id: a7223dce15fd49e18790c859d364e493
  experiment_tag: 1_batch_size=32,lr=0.00057782,momentum=0.3,weight_decay=0.088358
  hostname: f4686065c2f2
  iterations_since_restore: 1
  loss: 2.3025859902841974
  node_ip: 172.28.0.2
  pid: 2334
  should_checkpoint: true
  time_since_restore: 25.69222927093506
  time_this_iter_s: 25.69222927093506
  time_total_s: 25.69222927093506
  timestamp: 1606848415
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 52c1e_00001
  
== Status ==
Memory usage on this node: 4.9/12.7 GiB
Using AsyncHyperBand: num_stopped=1
Bracket: Iter 32.000: None | Iter 16.000: None | Iter 8.000: None | Iter 4.000: None | Iter 2.000: -0.9654188490867615 | Iter 1.000: -1.902711796735879
Resources requested: 2/2 CPUs, 1/1 GPUs, 0.0/7.23 GiB heap, 0.0/2.49 GiB objects (0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/DEFAULT

[2m[36m(pid=2380)[0m   


Result for DEFAULT_52c1e_00002:
  accuracy: tensor(0.4758, device='cuda:0')
  date: 2020-12-01_18-47-13
  done: false
  experiment_id: 649514b4cb7b463d83a97adba80c0db7
  experiment_tag: 2_batch_size=64,lr=0.00017751,momentum=0.2,weight_decay=0.0020784
  hostname: f4686065c2f2
  iterations_since_restore: 1
  loss: 1.339751679806193
  node_ip: 172.28.0.2
  pid: 2380
  should_checkpoint: true
  time_since_restore: 15.146131038665771
  time_this_iter_s: 15.146131038665771
  time_total_s: 15.146131038665771
  timestamp: 1606848433
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 52c1e_00002
  
== Status ==
Memory usage on this node: 4.9/12.7 GiB
Using AsyncHyperBand: num_stopped=1
Bracket: Iter 32.000: None | Iter 16.000: None | Iter 8.000: None | Iter 4.000: None | Iter 2.000: -0.9654188490867615 | Iter 1.000: -1.502837603187561
Resources requested: 2/2 CPUs, 1/1 GPUs, 0.0/7.23 GiB heap, 0.0/2.49 GiB objects (0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/DEF

[2m[36m(pid=2442)[0m   


Result for DEFAULT_52c1e_00003:
  accuracy: tensor(0.1000, device='cuda:0')
  date: 2020-12-01_18-48-12
  done: true
  experiment_id: 49ea565943a04078b7913b28c4b91546
  experiment_tag: 3_batch_size=16,lr=0.015186,momentum=0.8,weight_decay=0.00042144
  hostname: f4686065c2f2
  iterations_since_restore: 1
  loss: 2.3025853633880615
  node_ip: 172.28.0.2
  pid: 2442
  should_checkpoint: true
  time_since_restore: 46.69238042831421
  time_this_iter_s: 46.69238042831421
  time_total_s: 46.69238042831421
  timestamp: 1606848492
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 52c1e_00003
  
== Status ==
Memory usage on this node: 4.9/12.7 GiB
Using AsyncHyperBand: num_stopped=2
Bracket: Iter 32.000: None | Iter 16.000: None | Iter 8.000: None | Iter 4.000: None | Iter 2.000: -0.841000564571247 | Iter 1.000: -1.9027114832878111
Resources requested: 2/2 CPUs, 1/1 GPUs, 0.0/7.23 GiB heap, 0.0/2.49 GiB objects (0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/DEFAULT

[2m[36m(pid=2494)[0m   
2020-12-01 18:48:40,569	INFO tune.py:439 -- Total run time: 217.31 seconds (217.25 seconds for the tuning loop).


Result for DEFAULT_52c1e_00004:
  accuracy: tensor(0.0998, device='cuda:0')
  date: 2020-12-01_18-48-40
  done: true
  experiment_id: 128b83f096cf4c7a9afb1f8bbe442596
  experiment_tag: 4_batch_size=32,lr=0.01526,momentum=0.2,weight_decay=0.028967
  hostname: f4686065c2f2
  iterations_since_restore: 1
  loss: 2.3025848880743447
  node_ip: 172.28.0.2
  pid: 2494
  should_checkpoint: true
  time_since_restore: 25.229898691177368
  time_this_iter_s: 25.229898691177368
  time_total_s: 25.229898691177368
  timestamp: 1606848520
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 52c1e_00004
  
== Status ==
Memory usage on this node: 4.9/12.7 GiB
Using AsyncHyperBand: num_stopped=3
Bracket: Iter 32.000: None | Iter 16.000: None | Iter 8.000: None | Iter 4.000: None | Iter 2.000: -0.841000564571247 | Iter 1.000: -2.3025848880743447
Resources requested: 2/2 CPUs, 1/1 GPUs, 0.0/7.23 GiB heap, 0.0/2.49 GiB objects (0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/DEFAULT