## Ammar's XCS229ii experiments




In [1]:
# Stable Baselines only supports tensorflow 1.x for now
#%tensorflow_version 1.x
# !pip uninstall -y stable-baselines3[mpi]
# !pip install stable-baselines3[mpi]==2.10.0
!pip install git+https://github.com/DLR-RM/stable-baselines3.git

import matplotlib.pyplot as plt
import numpy as np


# function to show an image
def imshow(img):
    img = img / 2 + 0.5     # unnormalize
    npimg = img.numpy()
    plt.imshow(np.transpose(npimg, (1, 2, 0)))
    plt.show()


UsageError: Line magic function `%tensorflow_version` not found.


In [2]:
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 201.4 gigabytes of available RAM

You are using a high-RAM runtime!


In [3]:
%matplotlib inline

import torch
import torchvision
import torchvision.transforms as transforms

transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

batch_size = 8

trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)

testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)

classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')


# separate out some training data to train the RL agent
half_data_size = int(len(trainset)/2)

rl_agent_trainset = torch.utils.data.Subset(trainset, range(0,int(0.8*half_data_size)))
rl_agent_testset = torch.utils.data.Subset(trainset, range(int(0.8*half_data_size), half_data_size))

hyp_opt_trainset = torch.utils.data.Subset(trainset, range(0,int(0.8*len(trainset))))
hyp_opt_testset = torch.utils.data.Subset(trainset, range(int(0.8*len(trainset)), len(trainset)))

print(f"Full dataset size:  train={len(trainset)} test={len(testset)}")
print(f"Use a subset of the training data to train the Hyp-RL agent : train={len(rl_agent_trainset)} val={len(rl_agent_testset)}")

print(f"Use a subset of the training data to compare RL agent against HypOpt baseline  : train={len(hyp_opt_trainset)} val={len(hyp_opt_testset)}")


Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified
Full dataset size:  train=50000 test=10000
Use a subset of the training data to train the Hyp-RL agent : train=20000 val=5000
Use a subset of the training data to compare RL agent against HypOpt baseline  : train=40000 val=10000


In [4]:
## function to train and evaluate the model given the hyperparameter setting

## define the neural network
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

def evaluateFullDataset(hp_learning_rate=0.001):
  full_train = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                          shuffle=True, num_workers=2)

  full_test = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                         shuffle=False, num_workers=2)
  net = Net()
  loss_criterion = nn.CrossEntropyLoss()
  optimizer = optim.SGD(net.parameters(), lr=hp_learning_rate, momentum=0.9)
  trainAndEvaluateModel(net, loss_criterion, optimizerm, rl_agent_train, rl_agent_test)

def trainAndEvaluateModel(net, loss_criterion, optimizer, train, test):
  ## Train the model
  for epoch in range(2):  # loop over the dataset multiple times

      running_loss = 0.0
      for i, data in enumerate(train, 0):
          # get the inputs; data is a list of [inputs, labels]
          inputs, labels = data

          # zero the parameter gradients
          optimizer.zero_grad()

          # forward + backward + optimize
          outputs = net(inputs)
          loss = loss_criterion(outputs, labels)
          loss.backward()
          optimizer.step()

          # print statistics
          running_loss += loss.item()
          if i % 2000 == 1999:    # print every 2000 mini-batches
              # print('[%d, %5d] loss: %.3f' %
              #       (epoch + 1, i + 1, running_loss / 2000))
              running_loss = 0.0
  #print('Finished Training')

  ## Test the model

  # # print images
  # dataiter = iter(test)
  # images, labels = dataiter.next()
  # imshow(torchvision.utils.make_grid(images))
  # print('GroundTruth: ', ' '.join('%5s' % classes[labels[j]] for j in range(4)))
  # outputs = net(images)

  correct = 0
  total = 0
  # since we're not training, we don't need to calculate the gradients for our outputs
  with torch.no_grad():
      for data in test:
          images, labels = data
          # calculate outputs by running images through the network 
          outputs = net(images)
          # the class with the highest energy is what we choose as prediction
          _, predicted = torch.max(outputs.data, 1)
          total += labels.size(0)
          correct += (predicted == labels).sum().item()

  print(f"Accuracy of the network on the {len(test)} test images: {(100 * correct / total)}%")
  return (100 * correct / total)
  





## Build the RL environment and agent

In [5]:
import numpy as np
import gym
from gym import spaces
import copy

   
class TunableHP:
  def __init__(self, train_set, eval_set):
    self.hyperparameters = {"learning_rate":[0.0001, 0.001, 0.01, 0.1, 1.0],
                            "batch_size": [2,4, 6,8]}

    #self.hyperparameters = {"learning_rate":[-5,-4,-3,-2,-1,0,-1,-2,-3,-4,-5]}
    self.hyperparameter_keys = list(self.hyperparameters)

    self.train_set = train_set
    self.eval_set = eval_set

  def mapStateToHP(self,state):
    hp_dict = {}
    for p,i in enumerate(state):
      param_key = self.hyperparameter_keys[p]
      hp_dict[param_key] = self.hyperparameters[param_key][i]
    return hp_dict
  
  def getGridSize(self):
    return [len(self.hyperparameters[k]) for k in self.hyperparameter_keys]

  def evaluateRLAgent(self, hp_dict):
    print(f"Running evaluation for : {hp_dict}")
    rl_agent_train = torch.utils.data.DataLoader(self.train_set, batch_size=hp_dict['batch_size'],
                                            shuffle=True, num_workers=2)
    rl_agent_test = torch.utils.data.DataLoader(self.eval_set, batch_size=hp_dict['batch_size'],
                                          shuffle=False, num_workers=2)
    net = Net()
    loss_criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=hp_dict['learning_rate'], momentum=0.9)
    return trainAndEvaluateModel(net, loss_criterion, optimizer, rl_agent_train, rl_agent_test)

class HypRLGridEnv(gym.Env):
  """
  Custom Environment that follows gym interface.
  This is a simple env where the agent must learn to go always left. 
  """
  # Because of google colab, we cannot implement the GUI ('human' render mode)
  metadata = {'render.modes': ['console']}
  MAX_ITER = 10

  def __init__(self, tunableParams=TunableHP(rl_agent_trainset, rl_agent_testset)):
    super(HypRLGridEnv, self).__init__()

    self.tunableParams = tunableParams

    # Size of the grid
    self.grid_size = tunableParams.getGridSize()
    
    # Define action and observation space
    # They must be gym.spaces objects
    # Example when using discrete actions, we have two: left and right
    n_actions = 3
    self.action_space = spaces.Box(low=-1, high=1, shape=(len(self.grid_size),), dtype=np.int32)
    # The observation will be the coordinate of the agent
    # this can be described both by Discrete and Box space
    self.observation_space = spaces.MultiDiscrete(self.grid_size)
    self.eval_cache = np.zeros(self.grid_size)

  def eval(self, state):
    state = tuple(state)
    if self.eval_cache[state] == [0.0]:
      # train & test the model for these hyperparameters
      self.eval_cache[state] = self.tunableParams.evaluateRLAgent(self.tunableParams.mapStateToHP(state))
    return self.eval_cache[state]

  def reset(self):
    """
    Important: the observation must be a numpy array
    :return: (np.array) 
    """
    # reset the number of iterations for this agent
    self.iter = 0
    # Initialize the agent at the right of the grid
    self.agent_state = np.random.randint(self.grid_size)
    self.reward = self.eval(self.agent_state)
    self.best = {'state':copy.deepcopy(self.agent_state), 'val':self.eval(self.agent_state)}
    self.visited = {}
    self.visited[tuple(self.agent_state)] = True
    return np.array(self.agent_state) 

  def step(self, action):
    self.iter += 1

    for i, _ in enumerate(action):
      self.agent_state[i] += action[i]
      # Account for the boundaries of the grid
      self.agent_state[i] = np.clip(self.agent_state[i], 0, self.grid_size[i]-1)

    # We are done when we visit the same state twice or have taken more iterations than MAX
    done = bool(self.iter >= self.MAX_ITER or tuple(self.agent_state) in self.visited)

    self.visited[tuple(self.agent_state)] = True

    # reward idea #1
    # Reward is minimum of whatever val loss we saw so far
    self.reward = max(self.reward, self.eval(self.agent_state))
    # Null reward everywhere except when the episode terminates
    reward = self.reward if done else 0

    # reward idea #2
    # set the reward to that observed in the final state
    #reward = self.eval(self.agent_state) if done else 0

    # reward idea #3
    # let the agent accumulate reward as it goes
    # self.reward += self.eval(self.agent_state)
    # reward = self.reward

    if self.eval(self.agent_state) > self.best['val']:
      self.best = {'state':copy.deepcopy(self.agent_state), 'val':self.eval(self.agent_state)}

    # Optionally we can pass additional info
    info = {}
    info['best'] = self.best
    info['visited'] = self.visited

    return np.array(self.agent_state), reward, done, info

  def render(self, mode='console'):
    if mode != 'console':
      raise NotImplementedError()
    # agent is represented as a cross, rest as a dot
    print("." * self.agent_state, end="")
    print("x", end="")
    print("." * (self.grid_size - self.agent_state))

  def close(self):
    pass

# check and make sure the environment is sane and working
#from stable_baselines.common.env_checker import check_env

# If the environment doesn't follow the interface, an error will be thrown
# env = HypRLGridEnv()
# check_env(env, warn=True)
#env.render()

### RL Agent

In [None]:
from stable_baselines3 import DQN, PPO, A2C
from stable_baselines3.common.env_util import make_vec_env
#from stable_baselines3.common.policies import MlpPolicy
import pdb
# # Instantiate the env
env = HypRLGridEnv()
# wrap it
env = make_vec_env(lambda: env, n_envs=1)

# Train the agent
##model = ACKTR('MlpPolicy', env, verbose=1).learn(5000)
model = A2C('MlpPolicy', env, verbose=0)
model.learn(total_timesteps=25000)


Running evaluation for : {'learning_rate': 0.001, 'batch_size': 4}
Accuracy of the network on the 1250 test images: 47.4%
Running evaluation for : {'learning_rate': 0.001, 'batch_size': 2}
Accuracy of the network on the 2500 test images: 43.06%
Running evaluation for : {'learning_rate': 0.0001, 'batch_size': 2}
Accuracy of the network on the 2500 test images: 35.06%
Running evaluation for : {'learning_rate': 0.1, 'batch_size': 2}
Accuracy of the network on the 2500 test images: 10.74%
Running evaluation for : {'learning_rate': 0.001, 'batch_size': 6}
Accuracy of the network on the 834 test images: 42.92%
Running evaluation for : {'learning_rate': 0.0001, 'batch_size': 6}
Accuracy of the network on the 834 test images: 15.78%
Running evaluation for : {'learning_rate': 0.0001, 'batch_size': 8}
Accuracy of the network on the 625 test images: 13.06%
Running evaluation for : {'learning_rate': 0.001, 'batch_size': 8}
Accuracy of the network on the 625 test images: 39.66%
Running evaluation f

<stable_baselines3.a2c.a2c.A2C at 0x7f2f18fe6a10>

In [None]:
# Test the trained agent for sanity checking on the same environment

obs = env.reset()
n_steps = 20
for step in range(n_steps):
  action, _ = model.predict(obs, deterministic=True)
  # print("Step {}".format(step + 1))
  # print("Action: ", action)
  #pdb.set_trace()
  obs, reward, done, info = env.step(action)
  if done:
    # Note that the VecEnv resets automatically
    # when a done signal is encountered
    print("Goal reached!", "reward=", reward, "final_state=", info[0]['terminal_observation'], "best=", info[0]['best'])
    print(f"info {info}")
    break
  print('obs=', obs, 'reward=', reward, 'done=', done, 'info', info)
  #env.render(mode='console')

np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})
print(f"{env.envs[0].eval_cache}")

obs= [[2 2]] reward= [0.] done= [False] info [{'best': {'state': array([2, 2]), 'val': 37.14}, 'visited': {(3, 1): True, (2, 2): True}}]
obs= [[1 1]] reward= [0.] done= [False] info [{'best': {'state': array([1, 1]), 'val': 47.4}, 'visited': {(3, 1): True, (2, 2): True, (1, 1): True}}]
Goal reached! reward= [47.4] final_state= [2 2] best= {'state': array([1, 1]), 'val': 47.4}
info [{'best': {'state': array([1, 1]), 'val': 47.4}, 'visited': {(3, 1): True, (2, 2): True, (1, 1): True}, 'episode': {'r': 47.4, 'l': 3, 't': 1173.046669}, 'terminal_observation': array([2, 2])}]
[[35.060 20.380 15.780 13.060]
 [43.060 47.400 42.920 39.660]
 [9.920 28.880 37.140 43.780]
 [10.740 9.980 10.040 10.220]
 [10.040 10.060 10.040 10.040]]


## Time to perform

In [None]:
# Instantiate a full environment
env_real = HypRLGridEnv(TunableHP(hyp_opt_trainset, hyp_opt_trainset))
# wrap it
env_real = make_vec_env(lambda: env_real, n_envs=1)



In [None]:
# Test the trained agent on a new and full environment of the same dataset
obs = env_real.reset()
print('obs=', obs)
n_steps = 20
for step in range(n_steps):
  action, _ = model.predict(obs, deterministic=True)
  print("Step {}".format(step + 1))
  print("Action: ", action)
  #pdb.set_trace()
  obs, reward, done, info = env_real.step(action)
  if done:
    # Note that the VecEnv resets automatically
    # when a done signal is encountered
    print("Goal reached!", "reward=", reward, "final_state=", info[0]['terminal_observation'])
    break
  print('obs=', obs, 'reward=', reward, 'done=', done)
print(env_real.envs[0].eval_cache)

Running evaluation for : {'learning_rate': 0.0001, 'batch_size': 8}
Accuracy of the network on the 5000 test images: 21.8875%
obs= [[0 3]]
Step 1
Action:  [[-1.000 -1.000]]
Running evaluation for : {'learning_rate': 0.0001, 'batch_size': 6}
Accuracy of the network on the 6667 test images: 26.0325%
obs= [[0 2]] reward= [0.000] done= [False]
Step 2
Action:  [[-1.000 -1.000]]
Running evaluation for : {'learning_rate': 0.0001, 'batch_size': 4}
Accuracy of the network on the 10000 test images: 35.5425%
obs= [[0 1]] reward= [0.000] done= [False]
Step 3
Action:  [[-1.000 -0.297]]
Running evaluation for : {'learning_rate': 0.0001, 'batch_size': 2}
Accuracy of the network on the 20000 test images: 43.1475%
obs= [[0 0]] reward= [0.000] done= [False]
Step 4
Action:  [[-1.000 -1.000]]
Goal reached! reward= [43.147] final_state= [0 0]
[[43.148 35.542 26.032 21.887]
 [0.000 0.000 0.000 0.000]
 [0.000 0.000 0.000 0.000]
 [0.000 0.000 0.000 0.000]
 [0.000 0.000 0.000 0.000]]


In [None]:
print(env_real.envs[0].eval_cache)
obs = env_real.reset()
print('obs=', obs)

[[43.148 35.542 26.032 21.887]
 [0.000 0.000 0.000 0.000]
 [0.000 0.000 0.000 0.000]
 [0.000 0.000 0.000 0.000]
 [0.000 0.000 0.000 0.000]]
obs= [[0 1]]


In [None]:
from sklearn.model_selection import KFold
kfold = KFold(n_splits=10, shuffle=True)
kfold.split(trainset)
for fold, (train_ids, test_ids) in enumerate(kfold.split(trainset)):
  print(f"fold {fold} ... train {len(train_ids)} ... test {len(test_ids)}")

kfold.get_n_splits()

fold 0 ... train 45000 ... test 5000
fold 1 ... train 45000 ... test 5000
fold 2 ... train 45000 ... test 5000
fold 3 ... train 45000 ... test 5000
fold 4 ... train 45000 ... test 5000
fold 5 ... train 45000 ... test 5000
fold 6 ... train 45000 ... test 5000
fold 7 ... train 45000 ... test 5000
fold 8 ... train 45000 ... test 5000
fold 9 ... train 45000 ... test 5000


10