## Action Space

- {'steering_angle': -30, 'speed': 0.6},
- {'steering_angle': -15, 'speed': 0.6},
- {'steering_angle': 0, 'speed': 0.6},
- {'steering_angle': 15, 'speed': 0.6},
- {'steering_angle': 30, 'speed': 0.6}

Can be found in "/opt/ml/code/custom_files/agent/model_metadata.json"

### Imports

In [2]:
from stable_baselines3 import A2C

import utils
import deepracer_gym
import matplotlib.pyplot as plt
import cv2
import numpy as np
import gym

import sys
sys.path.append('..')

from agents.deepracer_base_agent import DeepracerAgent

from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage
from stable_baselines3.common.vec_env import VecExtractDictObs
from stable_baselines3.common.monitor import Monitor

%matplotlib notebook

# A2C

## Extract Features from dict

In [60]:
# class BasicExtractor(BaseFeaturesExtractor):
#     """
#     :param observation_space: (gym.Space)
#     :param features_dim: (int) Number of features extracted.
#         This corresponds to the number of unit for the last layer.
#     """

#     def __init__(self, observation_space, features_dim = 256, **kwargs):
#         super(BasicExtractor, self).__init__(observation_space, features_dim)

#     def forward(self, observations,  **kwargs):
#         tens = observations['STEREO_CAMERAS']
        
#         return torch.tensor(segment_resize(tens[0,0].cpu().numpy())).to('cuda:0')
    
# policy_kwargs = dict(
#     features_extractor_class=BasicExtractor,
#     features_extractor_kwargs=dict(features_dim=128),
# )


## Create Environment

In [44]:
env = DummyVecEnv([lambda: Monitor(gym.make('deepracer_gym:deepracer-v0', port=8889))])
env = VecTransposeImage(env)
env = VecExtractDictObs(env, key="STEREO_CAMERAS")

model = A2C('CnnPolicy', env ,tensorboard_log="./a2c_env/")
model.learn(total_timesteps=10_000)

<stable_baselines3.a2c.a2c.A2C at 0x7fd2c2eb2908>

### Render some episodes

In [47]:
saved_obs  = []
saved_acts = []

obs = env.reset()

for i in range(5000):
    action, _state = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    
    if i%5 == 0:
        saved_obs.append(obs)
        saved_acts.append(action)

    if done:
      obs = env.reset()

In [48]:
saved_acts

[array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),

### Add action selected on top of images

In [49]:
font                   = cv2.FONT_HERSHEY_SIMPLEX
bottomLeftCornerOfText = (10,25)
fontScale              = 1
fontColor              = (255)
lineType               = 2


add_text = lambda img,txt: cv2.putText(img.astype('float'),txt, 
    bottomLeftCornerOfText, 
    font, 
    fontScale,
    fontColor,
    lineType)


real_obs = [add_text(img[0,0], str(txt[0])) for img,txt in zip(saved_obs, saved_acts)]

## Animate

In [50]:
%%capture
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
plt.rcParams["animation.html"] = "jshtml"

fig, ax = plt.subplots()

ims = []
for i, obs in enumerate(real_obs[:100]):
    im = ax.imshow(obs, animated=True)
    if i == 0:
        ax.imshow(obs)  # show an initial one first
    ims.append([im])

ani = animation.ArtistAnimation(fig, ims, interval=100, blit=True,
                                repeat_delay=1000)

In [58]:
import torch

In [59]:
torch.tensor(saved_obs[0])

tensor([[[[ 74,  74,  74,  ...,  74,  74,  74],
          [ 74,  74,  74,  ...,  74,  74,  74],
          [ 74,  74,  74,  ...,  74,  74,  74],
          ...,
          [ 79,  78,  73,  ..., 255, 255, 255],
          [ 75,  74,  72,  ..., 255, 255, 255],
          [ 68,  77,  72,  ..., 255, 255, 255]],

         [[ 74,  74,  74,  ...,  74,  74,  74],
          [ 74,  74,  74,  ...,  74,  74,  74],
          [ 74,  74,  74,  ...,  74,  74,  74],
          ...,
          [ 74,  70,  72,  ..., 147, 142, 145],
          [ 70,  65,  73,  ..., 163, 152, 149],
          [ 67,  66,  70,  ..., 180, 153, 137]]]], dtype=torch.uint8)

In [51]:
ani

## Unused for now

In [60]:
env = gym.make('deepracer_gym:deepracer-v0',  port=8889)

obs = env.reset()

print("Deepracer Environment Connected succesfully")

steps_completed = 0
episodes_completed = 0
total_reward = 0
rewards = []
saved_obs = []

while episodes_completed < 1:

    # init
    if '_next_state' in obs:
        observation = obs['_next_state']
        info = {}

    print(observation)
    # let agent select next state
    #action = agent.decide(observation)


    # step in action
    observation, reward, done, info = env.step(0)
    steps_completed += 1
    total_reward += reward

    # store information for training
    #agent.observe(observation, action, reward)

    # episode completion
    if done:
        episodes_completed += 1
        print("Episodes Completed:", episodes_completed,
              "Steps:", steps_completed, "Reward", total_reward)
        steps_completed = 0
        rewards.append(total_reward)
        total_reward = 0
        #agent.train()

Deepracer Environment Connected succesfully
{'STEREO_CAMERAS': array([[[ 74,  74],
        [ 74,  74],
        [ 74,  74],
        ...,
        [ 74,  74],
        [ 74,  74],
        [ 74,  74]],

       [[ 74,  74],
        [ 74,  74],
        [ 74,  74],
        ...,
        [ 74,  74],
        [ 74,  74],
        [ 74,  74]],

       [[ 74,  74],
        [ 74,  74],
        [ 74,  74],
        ...,
        [ 74,  74],
        [ 74,  74],
        [ 74,  74]],

       ...,

       [[166, 146],
        [168, 162],
        [165, 160],
        ...,
        [129, 164],
        [133, 159],
        [142, 157]],

       [[171, 158],
        [180, 172],
        [178, 163],
        ...,
        [148, 149],
        [142, 148],
        [133, 144]],

       [[175, 167],
        [178, 164],
        [167, 150],
        ...,
        [142, 149],
        [137, 141],
        [136, 146]]], dtype=uint8)}
{'STEREO_CAMERAS': array([[[ 74,  74],
        [ 74,  74],
        [ 74,  74],
        ...,
        

{'STEREO_CAMERAS': array([[[ 74,  74],
        [ 74,  74],
        [ 74,  74],
        ...,
        [ 74,  74],
        [ 74,  74],
        [ 74,  74]],

       [[ 74,  74],
        [ 74,  74],
        [ 74,  74],
        ...,
        [ 74,  74],
        [ 74,  74],
        [ 74,  74]],

       [[ 74,  74],
        [ 74,  74],
        [ 74,  74],
        ...,
        [ 74,  74],
        [ 74,  74],
        [ 74,  74]],

       ...,

       [[166, 146],
        [168, 162],
        [165, 160],
        ...,
        [129, 164],
        [133, 159],
        [142, 157]],

       [[171, 158],
        [180, 172],
        [178, 163],
        ...,
        [148, 149],
        [142, 148],
        [133, 144]],

       [[175, 167],
        [178, 164],
        [167, 150],
        ...,
        [142, 149],
        [137, 141],
        [136, 146]]], dtype=uint8)}
{'STEREO_CAMERAS': array([[[ 74,  74],
        [ 74,  74],
        [ 74,  74],
        ...,
        [ 74,  74],
        [ 74,  74],
        [ 74

{'STEREO_CAMERAS': array([[[ 74,  74],
        [ 74,  74],
        [ 74,  74],
        ...,
        [ 74,  74],
        [ 74,  74],
        [ 74,  74]],

       [[ 74,  74],
        [ 74,  74],
        [ 74,  74],
        ...,
        [ 74,  74],
        [ 74,  74],
        [ 74,  74]],

       [[ 74,  74],
        [ 74,  74],
        [ 74,  74],
        ...,
        [ 74,  74],
        [ 74,  74],
        [ 74,  74]],

       ...,

       [[166, 146],
        [168, 162],
        [165, 160],
        ...,
        [129, 164],
        [133, 159],
        [142, 157]],

       [[171, 158],
        [180, 172],
        [178, 163],
        ...,
        [148, 149],
        [142, 148],
        [133, 144]],

       [[175, 167],
        [178, 164],
        [167, 150],
        ...,
        [142, 149],
        [137, 141],
        [136, 146]]], dtype=uint8)}
{'STEREO_CAMERAS': array([[[ 74,  74],
        [ 74,  74],
        [ 74,  74],
        ...,
        [ 74,  74],
        [ 74,  74],
        [ 74

In [51]:
from stable_baselines3.common.policies import ActorCriticCnnPolicy
from typing import Callable

def linear_schedule(initial_value: float) -> Callable[[float], float]:
    """
    Linear learning rate schedule.

    :param initial_value: Initial learning rate.
    :return: schedule that computes
      current learning rate depending on remaining progress
    """
    def func(progress_remaining: float) -> float:
        """
        Progress will decrease from 1 (beginning) to 0.

        :param progress_remaining:
        :return: current learning rate
        """
        return progress_remaining * initial_value

    return func

act_space = gym.spaces.Discrete(5)
obs_space = gym.spaces.Box(low=0, high=255, shape=(1,16,16), dtype=np.uint8)
schedule = linear_schedule(0.001)

#policy = ActorCriticCnnPolicy(obs_space, act_space, schedule, use_sde=False,**policy_kwargs)

In [66]:
observation['STEREO_CAMERAS'].shape

(120, 160, 2)

In [None]:
{'STEREO_CAMERAS': (120,160,2), dtype='uint8'}

In [2]:
class BasicAgent(DeepracerAgent):
    def __init__(self, gamma=0.99, policy_learning_rate=0.002, value_learning_rate=0.002):

        self.plr = policy_learning_rate
        self.vlr = value_learning_rate
        self.gamma = gamma

        self.actions_prob = []
        self.saved_rewards = []
        self.model = None
        self.value_model = None
        self.n_moving_avg = 5

        # These lists stores the cumulative observations for this episode
        self.episode_observations, self.episode_actions, self.episode_rewards = [], [], []

        # Build the keras network
        self._build_network()

    def _convert_obs(self, observation):
        return segment_resize(observation['STEREO_CAMERAS'][:, :, 0])

    def observe(self, state, action, reward):
        """ This function takes the observations the agent received from the environment and stores them
            in the lists above."""
        self.episode_observations.append(
            self._convert_obs(state)[:, :, np.newaxis])
        self.episode_actions.append(action)
        self.episode_rewards.append(reward)

    def decide(self, state):
        """ This function feeds the observed state to the network, which returns a distribution
            over possible actions. Sample an action from the distribution and return it."""

        probs = np.ravel(self.model(self._convert_obs(state)
                         [np.newaxis, :, :, np.newaxis]))
        return np.random.choice(len(probs), p=probs)

    def register_reset(self, observations):
        action = np.random.randint(5)
        return action

    def compute_action(self, observations, info):
        action = np.random.randint(5)
        return action

    def train(self):
        """ When this function is called, the accumulated episode observations, actions and discounted rewards
            should be fed into the network and used for training. Use the _get_returns function to first turn 
            the episode rewards into discounted returns. 
            Apply simple or adaptive baselines if needed, depending on parameters."""

        states = np.stack(self.episode_observations)
        discounted = self._get_returns().reshape((-1, 1))

        # compute baseline and train
        baseline = self.value_model.predict_on_batch(states)
        self.value_model.train_on_batch(states, discounted.reshape((-1, 1)))

        # compute discounted rewards and remove baseline
        discounted_rewards = discounted - baseline

        # train model
        self.model.train_on_batch(
            states,
            tf.keras.utils.to_categorical(self.episode_actions, num_classes=5),
            sample_weight=discounted_rewards
        )
        # reset observations for next episode
        self.episode_observations, self.episode_actions, self.episode_rewards = [], [], []

    def moving_average(self, discounted):
        # save the mean of the rewards for the moving average
        self.saved_rewards.append(np.mean(discounted))
        # only keep last n averages
        if len(self.saved_rewards) > self.n_moving_avg:
            self.saved_rewards = self.saved_rewards[1:]
        # compute baseline from the moving average
        return np.mean(self.saved_rewards)

    def _get_returns(self):
        """ This function should process self.episode_rewards and return the discounted episode returns
            at each step in the episode. Hint: work backwards."""

        discounted_rewards = []
        cumulative_total_return = 0
        # iterate the rewards backwards and and calc the total return
        for reward in self.episode_rewards[::-1]:
            cumulative_total_return = (
                cumulative_total_return*self.gamma)+reward
            discounted_rewards.append(cumulative_total_return)
        # reverse sequence
        discounted_rewards.reverse()
        # convert to numpy array
        return np.array(discounted_rewards)

    def _build_network(self):
        """ This function should build the network that can then be called by decide and train. 
            The network takes observations as inputs and has a policy distribution as output."""

        model = Sequential()
        model.add(Conv2D(32, (3, 3), activation='relu',
                  kernel_initializer='he_uniform', input_shape=(16, 16, 1)))
        model.add(MaxPooling2D((2, 2)))
        model.add(Flatten())
        model.add(Dense(20, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(5, activation='softmax'))
        model.compile(loss="categorical_crossentropy",
                      optimizer=Adam(learning_rate=self.plr))

        value_model = Sequential()
        value_model.add(Conv2D(32, (3, 3), activation='relu',
                        kernel_initializer='he_uniform', input_shape=(16, 16, 1)))
        value_model.add(MaxPooling2D((2, 2)))
        value_model.add(Flatten())
        value_model.add(Dense(20, activation='relu',
                        kernel_initializer='he_uniform'))
        value_model.add(Dense(1, activation='softmax'))
        value_model.compile(loss='mean_squared_error',
                            optimizer=Adam(learning_rate=self.vlr))

        self.model = model
        self.value_model = value_model

ModuleNotFoundError: No module named 'agents'