<a href="https://colab.research.google.com/github/ammarhusain/XCS229ii-project/blob/master/XCS229ii-project/xcs229ii_sandbox.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Ammar's XCS229ii experiments




In [1]:
# Stable Baselines only supports tensorflow 1.x for now
%tensorflow_version 1.x
!pip install stable-baselines[mpi]==2.10.0

import matplotlib.pyplot as plt
import numpy as np

# function to show an image
def imshow(img):
    img = img / 2 + 0.5     # unnormalize
    npimg = img.numpy()
    plt.imshow(np.transpose(npimg, (1, 2, 0)))
    plt.show()


TensorFlow 1.x selected.
Collecting stable-baselines[mpi]==2.10.0
  Downloading stable_baselines-2.10.0-py3-none-any.whl (248 kB)
[K     |████████████████████████████████| 248 kB 4.1 MB/s 
Installing collected packages: stable-baselines
  Attempting uninstall: stable-baselines
    Found existing installation: stable-baselines 2.2.1
    Uninstalling stable-baselines-2.2.1:
      Successfully uninstalled stable-baselines-2.2.1
Successfully installed stable-baselines-2.10.0


In [3]:
%matplotlib inline

import torch
import torchvision
import torchvision.transforms as transforms

transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

batch_size = 8

trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)

testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)

classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')


# separate out some training data to train the RL agent
half_data_size = int(len(trainset)/2)

rl_agent_trainset = torch.utils.data.Subset(trainset, range(0,int(0.8*half_data_size)))
rl_agent_testset = torch.utils.data.Subset(trainset, range(int(0.8*half_data_size), half_data_size))

hyp_opt_trainset = torch.utils.data.Subset(trainset, range(0,int(0.8*len(trainset))))
hyp_opt_testset = torch.utils.data.Subset(trainset, range(int(0.8*len(trainset)), len(trainset)))

print(f"Full dataset size:  train={len(trainset)} test={len(testset)}")
print(f"Use a subset of the training data to train the Hyp-RL agent : train={len(rl_agent_trainset)} val={len(rl_agent_testset)}")

print(f"Use a subset of the training data to compare RL agent against HypOpt baseline  : train={len(hyp_opt_trainset)} val={len(hyp_opt_testset)}")


Files already downloaded and verified
Files already downloaded and verified
Full dataset size:  train=50000 test=10000
Use a subset of the training data to train the Hyp-RL agent : train=20000 val=5000
Use a subset of the training data to compare RL agent against HypOpt baseline  : train=40000 val=10000


In [4]:
## function to train and evaluate the model given the hyperparameter setting

## define the neural network
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


def evaluateFullDataset(hp_learning_rate=0.001):
  full_train = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                          shuffle=True, num_workers=2)

  full_test = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                         shuffle=False, num_workers=2)
  net = Net()
  loss_criterion = nn.CrossEntropyLoss()
  optimizer = optim.SGD(net.parameters(), lr=hp_learning_rate, momentum=0.9)
  trainAndEvaluateModel(net, loss_criterion, optimizerm, rl_agent_train, rl_agent_test)

def trainAndEvaluateModel(net, loss_criterion, optimizer, train, test):
  ## Train the model
  for epoch in range(2):  # loop over the dataset multiple times

      running_loss = 0.0
      for i, data in enumerate(train, 0):
          # get the inputs; data is a list of [inputs, labels]
          inputs, labels = data

          # zero the parameter gradients
          optimizer.zero_grad()

          # forward + backward + optimize
          outputs = net(inputs)
          loss = loss_criterion(outputs, labels)
          loss.backward()
          optimizer.step()

          # print statistics
          running_loss += loss.item()
          if i % 2000 == 1999:    # print every 2000 mini-batches
              # print('[%d, %5d] loss: %.3f' %
              #       (epoch + 1, i + 1, running_loss / 2000))
              running_loss = 0.0
  #print('Finished Training')

  ## Test the model

  # # print images
  # dataiter = iter(test)
  # images, labels = dataiter.next()
  # imshow(torchvision.utils.make_grid(images))
  # print('GroundTruth: ', ' '.join('%5s' % classes[labels[j]] for j in range(4)))
  # outputs = net(images)

  correct = 0
  total = 0
  # since we're not training, we don't need to calculate the gradients for our outputs
  with torch.no_grad():
      for data in test:
          images, labels = data
          # calculate outputs by running images through the network 
          outputs = net(images)
          # the class with the highest energy is what we choose as prediction
          _, predicted = torch.max(outputs.data, 1)
          total += labels.size(0)
          correct += (predicted == labels).sum().item()

  print(f"Accuracy of the network on the {len(test)} test images: {(100 * correct / total)}%")
  return (100 * correct / total)
  

#evaluateFullDataset()

## Build the RL environment and agent

In [11]:
import numpy as np
import gym
from gym import spaces


   
class TunableHP:
  def __init__(self, train_set, eval_set):
    self.hyperparameters = {"learning_rate":[0.0001, 0.001, 0.01, 0.1, 1.0]}

    #self.hyperparameters = {"learning_rate":[-5,-4,-3,-2,-1,0,-1,-2,-3,-4,-5]}
    self.hyperparameter_keys = list(self.hyperparameters)

    self.train_set = train_set
    self.eval_set = eval_set

  def mapStateToHP(self,state):
    return [self.hyperparameters[self.hyperparameter_keys[p]][i] for p,i in enumerate(state)]
  
  def getGridSize(self):
    return [len(self.hyperparameters[k]) for k in self.hyperparameter_keys]

  def evaluateRLAgent(self, hp_learning_rate):
    print(f"Running evaluation for : {hp_learning_rate}")
    #return hp_learning_rate
    rl_agent_train = torch.utils.data.DataLoader(self.train_set, batch_size=batch_size,
                                            shuffle=True, num_workers=2)
    rl_agent_test = torch.utils.data.DataLoader(self.eval_set, batch_size=batch_size,
                                          shuffle=False, num_workers=2)
    net = Net()
    loss_criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=hp_learning_rate, momentum=0.9)
    return trainAndEvaluateModel(net, loss_criterion, optimizer, rl_agent_train, rl_agent_test)


class HypRLGridEnv(gym.Env):
  """
  Custom Environment that follows gym interface.
  This is a simple env where the agent must learn to go always left. 
  """
  # Because of google colab, we cannot implement the GUI ('human' render mode)
  metadata = {'render.modes': ['console']}
  # Define constants for clearer code
  UP = 0
  DOWN = 1
  STAY = 2
  MAX_ITER = 10

  def __init__(self, tunableParams=TunableHP(rl_agent_trainset, rl_agent_testset)):
    super(HypRLGridEnv, self).__init__()

    self.tunableParams = tunableParams

    # Size of the grid
    self.grid_size = tunableParams.getGridSize()
    
    # Define action and observation space
    # They must be gym.spaces objects
    # Example when using discrete actions, we have two: left and right
    n_actions = 3
    self.action_space = spaces.Discrete(n_actions)
    # The observation will be the coordinate of the agent
    # this can be described both by Discrete and Box space
    self.observation_space = spaces.MultiDiscrete(self.grid_size)

    self.eval_cache = np.zeros(self.grid_size)

  def eval(self, state):
    state = tuple(state)
    if self.eval_cache[state] == [0.0]:
      # train & test the model for these hyperparameters
      self.eval_cache[state] = self.tunableParams.evaluateRLAgent(*self.tunableParams.mapStateToHP(state))
    return self.eval_cache[state]

  def reset(self):
    """
    Important: the observation must be a numpy array
    :return: (np.array) 
    """
    # reset the number of iterations for this agent
    self.iter = 0
    self.reward = -np.inf
    # Initialize the agent at the right of the grid
    self.agent_state = np.random.randint(self.grid_size)

    start_state = self.agent_state
    self.visited = {}
    return np.array(start_state) 

  def step(self, action):
    self.iter += 1

    if action == self.UP:
      self.agent_state -= 1
    elif action == self.DOWN:
      self.agent_state += 1
    elif action == self.STAY:
      self.agent_state = self.agent_state      
    else:
      raise ValueError("Received invalid action={} which is not part of the action space".format(action))
    # Account for the boundaries of the grid
    for i, _ in enumerate(self.agent_state):
      self.agent_state[i] = np.clip(self.agent_state[i], 0, self.grid_size[i]-1)

    # We are done when we visit the same state twice or have taken more iterations than MAX
    done = bool(self.iter >= self.MAX_ITER or tuple(self.agent_state) in self.visited)

    self.visited[tuple(self.agent_state)] = True

    # Reward is minimum of whatever val loss we saw so far
    self.reward = max(self.reward, self.eval(self.agent_state))

    # Null reward everywhere except when the episode terminates
    reward = self.reward if done else 0

    # Optionally we can pass additional info, we are not using that for now
    info = {}
    return np.array(self.agent_state), reward, done, info

  def render(self, mode='console'):
    if mode != 'console':
      raise NotImplementedError()
    # agent is represented as a cross, rest as a dot
    print("." * self.agent_state, end="")
    print("x", end="")
    print("." * (self.grid_size - self.agent_state))

  def close(self):
    pass

# check and make sure the environment is sane and working
from stable_baselines.common.env_checker import check_env

#env = HypRLGridEnv()
# If the environment doesn't follow the interface, an error will be thrown
#check_env(env, warn=True)
#env.render()

### RL Agent

In [12]:
from stable_baselines import DQN, PPO2, A2C, ACKTR
from stable_baselines.common.cmd_util import make_vec_env
from stable_baselines.common.policies import MlpPolicy
import pdb
# Instantiate the env
env = HypRLGridEnv()
# wrap it
env = make_vec_env(lambda: env, n_envs=1)

# Train the agent
##model = ACKTR('MlpPolicy', env, verbose=1).learn(5000)
model = A2C(MlpPolicy, env, verbose=0)
model.learn(total_timesteps=25000)


Running evaluation for : 1.0
Accuracy of the network on the 625 test images: 9.72%
Running evaluation for : 0.0001
Accuracy of the network on the 625 test images: 10.38%
Running evaluation for : 0.001
Accuracy of the network on the 625 test images: 41.58%
Running evaluation for : 0.01
Accuracy of the network on the 625 test images: 36.12%
Running evaluation for : 0.1
Accuracy of the network on the 625 test images: 9.62%


<stable_baselines.a2c.a2c.A2C at 0x7f06f291c190>

In [18]:
# Test the trained agent for sanity checking on the same environment
obs = env.reset()
n_steps = 20
for step in range(n_steps):
  action, _ = model.predict(obs, deterministic=True)
  print("Step {}".format(step + 1))
  print("Action: ", action)
  #pdb.set_trace()
  obs, reward, done, info = env.step(action)
  if done:
    # Note that the VecEnv resets automatically
    # when a done signal is encountered
    print("Goal reached!", "reward=", reward, "final_state=", info[0]['terminal_observation'])
    break
  print('obs=', obs, 'reward=', reward, 'done=', done)
  #env.render(mode='console')
print(env.envs[0].eval_cache)

Step 1
Action:  [2]
obs= [[1]] reward= [0.] done= [False]
Step 2
Action:  [2]
Goal reached! reward= [41.58] final_state= [1]
[10.38 41.58 36.12  9.62  9.72]


## Time to perform

In [19]:
# Instantiate a full environment
env_real = HypRLGridEnv(TunableHP(hyp_opt_trainset, hyp_opt_trainset))
# wrap it
env_real = make_vec_env(lambda: env_real, n_envs=1)



In [20]:
# Test the trained agent on a new and full environment of the same dataset
obs = env_real.reset()
n_steps = 20
for step in range(n_steps):
  action, _ = model.predict(obs, deterministic=True)
  print("Step {}".format(step + 1))
  print("Action: ", action)
  #pdb.set_trace()
  obs, reward, done, info = env_real.step(action)
  if done:
    # Note that the VecEnv resets automatically
    # when a done signal is encountered
    print("Goal reached!", "reward=", reward, "final_state=", info[0]['terminal_observation'])
    break
  print('obs=', obs, 'reward=', reward, 'done=', done)
print(env_real.envs[0].eval_cache)

Step 1
Action:  [0]
Running evaluation for : 0.01
Accuracy of the network on the 5000 test images: 43.26%
obs= [[2]] reward= [0.] done= [False]
Step 2
Action:  [0]
Running evaluation for : 0.001
Accuracy of the network on the 5000 test images: 50.0275%
obs= [[1]] reward= [0.] done= [False]
Step 3
Action:  [2]
Goal reached! reward= [50.0275] final_state= [1]
[ 0.     50.0275 43.26    0.      0.    ]


In [32]:
print(env.envs[0].eval_cache)

['__abstractmethods__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_get_indices', '_get_target_envs', '_obs_from_buf', '_save_obs', 'action_space', 'actions', 'buf_dones', 'buf_infos', 'buf_obs', 'buf_rews', 'close', 'env_method', 'envs', 'get_attr', 'get_images', 'getattr_depth_check', 'keys', 'metadata', 'num_envs', 'observation_space', 'render', 'reset', 'seed', 'set_attr', 'step', 'step_async', 'step_wait', 'unwrapped']
[13.68 41.24  0.    0.    0.  ]
