**Dependencies and setup**

This can take a minute or so...

In [None]:
# overall training structure based on http://bicmr.pku.edu.cn/~wenzw/bigdata/lect-dyna3w.pdf
# policy and agents based on https://towardsdatascience.com/td3-learning-to-run-with-ai-40dfc512f93, under MIT license

%%capture
!apt update
!pip install 'gym[box2d]'
!apt install xvfb -y
!pip install pyvirtualdisplay

import gym
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
import sys
from pyvirtualdisplay import Display
from IPython import display as disp
import copy
from collections import deque

%matplotlib inline

# Start a hidden virtual display so the environment can render frames.
display = Display(visible=0, size=(600, 600))
display.start()

# Prefer the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Number of episodes between plot refreshes.
plot_interval = 10
# Number of episodes between recorded videos (rendering is slow, so keep it sparse).
video_every = 25


In [None]:
# Optional Google Drive integration: mounting Drive lets the checkpoints, log
# file and videos that later cells write under drive/MyDrive/TD3-Outputs/
# persist between sessions, so training can be saved and resumed.
from google.colab import drive
drive.mount('/content/drive')
# Uncomment on the first run to create the output folder used by later cells:
# import os
# os.mkdir('drive/MyDrive/TD3-Outputs/')

In [None]:
# Hyperparameters for the TD3 agent.
BATCH_SIZE = 100  # number of transitions per training batch
DISCOUNT_FACTOR = 0.99  # discount factor (gamma) for future rewards
EXPLORE_POLICY = 0.1  # exploration noise scale
LEARN_RATE = .001  # learning rate of the Adam optimisers for actor and critic
POLICY_DELAY = 2  # actor/target updates run when the training iteration is a multiple of this
TAU = 0.005  # interpolation factor of the soft target-network updates
NOISE_POLICY = 0.2  # std-dev of the Gaussian noise added to target actions
NOISE_CLIP = 0.5  # target-action noise is clamped to [-NOISE_CLIP, NOISE_CLIP]

**Reinforcement learning agent**

Replace this with your own agent - I recommend starting with TD3 (lecture 8)

In [None]:
class Actor(nn.Module):
    """Deterministic policy network: maps a state to a bounded action.

    Args:
        state_dim: width of the state vector fed into the first layer.
        action_dim: width of the action vector produced by the last layer.
        max_actions: scale applied to the tanh output, so every action
            component lies in [-max_actions, max_actions].
    """

    def __init__(self, state_dim, action_dim, max_actions):
        super().__init__()

        # Three fully connected layers: state -> 400 -> 300 -> action.
        self.l1 = nn.Linear(state_dim, 400)
        self.l2 = nn.Linear(400, 300)
        self.l3 = nn.Linear(300, action_dim)
        self.max_act = max_actions

    def forward(self, state):
        """Return the action for `state` (ReLU hidden layers, scaled tanh output)."""
        hidden = F.relu(self.l1(state))
        hidden = F.relu(self.l2(hidden))
        squashed = torch.tanh(self.l3(hidden))
        return self.max_act * squashed

class Critic(nn.Module):
    """Twin Q-networks that each score a (state, action) pair.

    Both heads share the same input (state and action concatenated) but have
    independent weights; callers receive both estimates so they can take the
    smaller one.

    Args:
        state_dim: width of the state vector.
        action_dim: width of the action vector.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        joint_dim = state_dim + action_dim

        # First Q-network: (state, action) -> 400 -> 300 -> 1.
        self.l1 = nn.Linear(joint_dim, 400)
        self.l2 = nn.Linear(400, 300)
        self.l3 = nn.Linear(300, 1)

        # Second Q-network with the same layout but separate parameters.
        self.l4 = nn.Linear(joint_dim, 400)
        self.l5 = nn.Linear(400, 300)
        self.l6 = nn.Linear(300, 1)

    def forward(self, s, a):
        """Return the tuple (Q1, Q2) for states `s` and actions `a`, joined along dim 1."""
        pair = torch.cat([s, a], 1)

        q1 = self.l3(F.relu(self.l2(F.relu(self.l1(pair)))))
        q2 = self.l6(F.relu(self.l5(F.relu(self.l4(pair)))))

        return (q1, q2)

class TD3():
    """Twin Delayed DDPG (TD3) agent.

    Holds an actor, a twin critic, a target copy of each, and one Adam
    optimiser per online network.

    Args:
        state_dim: width of the state vector.
        action_dim: width of the action vector.
        max_action: bound on each action component; also used to clamp the
            smoothed target action.
        env: the environment the agent acts in (stored on the instance).
        device: torch device the networks are placed on.
    """

    def __init__(self, state_dim, action_dim, max_action, env, device):
        super(TD3, self).__init__()

        self.device = device
        self.max_action = max_action
        self.env = env

        # Actor and its target. deepcopy already clones the weights, so no
        # extra load_state_dict is needed to synchronise them.
        self.actor = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=LEARN_RATE)

        # Twin critic and its target (input is state + action).
        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=LEARN_RATE)

    def select_action(self, state, noise=0.1):
        """Return the actor's deterministic action for one state.

        Args:
            state: numpy array holding a single state; it is reshaped to (1, -1).
            noise: kept for backward compatibility and NOT applied. The
                previous implementation built a noisy action and then
                discarded it, so callers have always received the
                deterministic action and add exploration noise themselves.

        Returns:
            Flat numpy array containing the action.
        """
        state = torch.FloatTensor(state.reshape(1, -1)).to(self.device)
        # Single forward pass, no autograd graph needed for acting.
        with torch.no_grad():
            return self.actor(state).cpu().numpy().flatten()

    def save(self):
        """Write the actor and critic weights to the Drive output folder."""
        torch.save(self.actor.state_dict(), 'drive/MyDrive/TD3-Outputs/td3_actor_finalized.pth')
        torch.save(self.critic.state_dict(), 'drive/MyDrive/TD3-Outputs/td3_critic_finalized.pth')
        return

    def load(self):
        """Restore actor and critic weights and re-synchronise the targets.

        NOTE(review): save() writes under drive/MyDrive/TD3-Outputs/ while this
        reads from the working directory — confirm that is intended.
        """
        self.actor.load_state_dict(torch.load('./td3_actor_finalized.pth',  map_location=torch.device('cpu')))
        self.critic.load_state_dict(torch.load('./td3_critic_finalized.pth',  map_location=torch.device('cpu')))
        # Target networks are not saved; without this they would keep their
        # freshly initialised weights and produce meaningless TD targets.
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())
        return

    def train(self, replay_buffer, current_iteration):
        """Run one TD3 update step on a batch sampled from `replay_buffer`.

        The critic is updated on every call. The actor and both target
        networks are updated only when `current_iteration` is a multiple of
        POLICY_DELAY.

        Args:
            replay_buffer: object whose sample() returns tensors
                (state, action, reward, next_state, done).
            current_iteration: step counter used for the delayed policy update.
        """
        state, action, reward, next_state, done = replay_buffer.sample()  # random transitions
        # to improve: implement a prioritised replay buffer so not entirely stochastic

        # The target is a constant for the regression below, so build it
        # without tracking gradients through the target networks.
        with torch.no_grad():
            # Target policy smoothing: clipped Gaussian noise on the target action.
            noise = action.clone().normal_(0, NOISE_POLICY).clamp(-NOISE_CLIP, NOISE_CLIP)
            next_action = (self.actor_target(next_state) + noise).clamp(-self.max_action, self.max_action)

            # Clipped double-Q: bootstrap from the smaller target estimate,
            # discounted by gamma and masked out on terminal transitions.
            target_q1, target_q2 = self.critic_target(next_state, next_action)
            target_q = reward + (1 - done) * DISCOUNT_FACTOR * torch.min(target_q1, target_q2)

        curr_q1, curr_q2 = self.critic(state, action)

        # Regress both critics towards the shared target with MSE loss.
        critic_loss = F.mse_loss(curr_q1, target_q) + F.mse_loss(curr_q2, target_q)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        if (current_iteration % POLICY_DELAY == 0):  # delayed policy update

            # Gradient ascent on Q1 of the actor's own actions.
            actor_loss = -self.critic(state, self.actor(state))[0].mean()

            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Soft (Polyak) update of both target networks.
            for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                target_param.data.copy_(TAU * param.data + (1 - TAU) * target_param.data)

            for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                target_param.data.copy_(TAU * param.data + (1 - TAU) * target_param.data)



In [None]:
class ExperienceReplay:
    """Fixed-capacity FIFO memory of transitions with uniform random sampling.

    Args:
        buffer_size: maximum number of transitions kept; once full, the oldest
            transition is dropped for every new one stored.
        batch_size: number of transitions returned by sample().
        device: torch device the sampled tensors are moved to.
    """

    def __init__(self, buffer_size, batch_size, device):
        self.buffer = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.device = device
        # Kept only for backward compatibility: it was never advanced, because
        # the bounded deque already evicts the oldest entry on append.
        self.ptr = 0

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

    def store_transition(self, state, action, reward, new_state, done):
        """Append one transition; the deque discards the oldest one when full."""
        # The former manual-overwrite branch was unreachable (ptr stayed 0),
        # so a plain append is the complete behaviour.
        self.buffer.append((state, action, reward, new_state, done))

    def sample(self):
        """Draw `batch_size` distinct transitions uniformly at random.

        Returns:
            Tuple (states, actions, rewards, next_states, dones) of float32
            tensors on `self.device`; rewards and dones are column vectors.

        Raises:
            ValueError: from random.sample when fewer than `batch_size`
                transitions are stored.
        """
        # NOTE: indexing into the middle of a deque is not constant-time, so
        # sampling gets slower as the buffer grows.
        sample = random.sample(self.buffer, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*sample)
        states = torch.from_numpy(np.array(states, dtype=np.float32)).to(self.device)
        actions = torch.from_numpy(np.array(actions, dtype=np.float32)).to(self.device)
        rewards = torch.from_numpy(np.array(rewards, dtype=np.float32).reshape(-1, 1)).to(self.device)
        next_states = torch.from_numpy(np.array(next_states, dtype=np.float32)).to(self.device)
        dones = torch.from_numpy(np.array(dones, dtype=np.uint8).reshape(-1, 1)).float().to(self.device)
        return (states, actions, rewards, next_states, dones)

**Prepare the environment and wrap it to capture videos**

In [None]:
%%capture
# Create the training environment.
env = gym.make("BipedalWalker-v3")
# env = gym.make("Pendulum-v0") # useful continuous environment for quick experiments
# env = gym.make("BipedalWalkerHardcore-v3") # only attempt this if your agent consistently aces BipedalWalker-v3
# Wrap the environment so a video is recorded for every episode whose id is a
# multiple of video_every; output goes to the Drive folder.
# NOTE(review): force=True presumably lets the monitor overwrite earlier recordings — confirm.
env = gym.wrappers.Monitor(env, "drive/MyDrive/TD3-Outputs/video", video_callable=lambda ep_id: ep_id%video_every == 0, force=True)

# Sizes of the observation and action vectors, read from the environment's spaces.
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]

In [None]:
# Report the problem dimensions and the selected compute device.
print(
    'The environment has {} observations and the agent can take {} actions'.format(obs_dim, act_dim)
)
print('The device is: {}'.format(device))

# Warn whenever anything other than the CPU was selected.
if not device.type == 'cpu':
    print('It\'s recommended to train on the cpu for this')

In [None]:
from numpy.core.fromnumeric import take
# in the submission please use seed 42 for verification
seed = 42
torch.manual_seed(seed)
env.seed(seed)
random.seed(seed)
np.random.seed(seed)
env.action_space.seed(seed)

# logging variables
ep_reward = 0
reward_list = []
plot_data = []

# additional variables
buffer_size = 1000000
batch_size = BATCH_SIZE  # single source of truth for the batch size
noise = 0.1  # exploration noise, as a fraction of max_action
max_action = float(env.action_space.high[0])

# initialise agent
agent = TD3(obs_dim, act_dim, max_action, env, device)

# resume from previously saved weights (the checkpoint files read by TD3.load must exist)
agent.load()
buffer = ExperienceReplay(buffer_size, batch_size, device)

save_score = 400          # stop once the mean of the last 50 episode scores exceeds this
max_episodes = 10000
max_timesteps = 2000      # step budget per episode from episode 100 onwards
explore_timesteps_1 = 1000  # step budget for episodes 1-49
explore_timesteps_2 = 1500  # step budget for episodes 50-99

best_reward = -999999999999999999
scores_over_episodes = []

state = env.reset()

# Opened right before the loop and closed in `finally`, so the handle is
# released even when training is interrupted or fails.
log_f = open("drive/MyDrive/TD3-Outputs/agent-log.txt","w+")

# training procedure:
try:
    for episode in range(1, max_episodes+1):
        if episode < 50:
            # fast exploration phase, builds up replay buffer
            T = explore_timesteps_1
        elif episode < 100:
            T = explore_timesteps_2
        else:
            T = max_timesteps
        ep_reward = 0

        for t in range(T):

            # select the agent action and add Gaussian exploration noise
            action = agent.select_action(state) + np.random.normal(0, max_action * noise, size=act_dim)
            action = action.clip(env.action_space.low, env.action_space.high)

            # take action in environment and get r and s'
            next_state, reward, done, _ = env.step(action)
            buffer.store_transition(state, action, reward, next_state, done)
            state = next_state
            ep_reward += reward

            if len(buffer) > batch_size:
                agent.train(buffer, t)

            # stop iterating when the episode finished or the step budget is spent
            if done or t>=(T-1):
                scores_over_episodes.append(ep_reward)
                try:
                    state = env.reset()
                except gym.error.Error:
                    # The monitor wrapper refuses to reset an episode that is
                    # not over (budget spent without `done`); in that case keep
                    # going from the current state. Any other error propagates.
                    pass
                break

        if(np.mean(scores_over_episodes[-50:]) > save_score):
            print('Saving agent- past 50 scores gave better avg than ', save_score)
            best_reward = np.mean(scores_over_episodes[-50:])
            save_score = best_reward
            agent.save()
            break # Saved agent. Break out of episodes and end, 400 is pretty good.

        if ep_reward > best_reward:
            # best single-episode score so far: checkpoint the current weights
            best_reward = ep_reward
            agent.save()

        # append the episode reward to the reward list
        reward_list.append(ep_reward)

        # do NOT change this logging code - it is used for automated marking!
        log_f.write('episode: {}, reward: {}\n'.format(episode, ep_reward))
        log_f.flush()
        ep_reward = 0

        # print reward data every so often - add a graph like this in your report
        if episode % plot_interval == 0:
            plot_data.append([episode, np.array(reward_list).mean(), np.array(reward_list).std()])
            reward_list = []
            # plt.rcParams['figure.dpi'] = 100
            plt.plot([x[0] for x in plot_data], [x[1] for x in plot_data], '-', color='tab:grey')
            plt.fill_between([x[0] for x in plot_data], [x[1]-x[2] for x in plot_data], [x[1]+x[2] for x in plot_data], alpha=0.2, color='tab:grey')
            plt.xlabel('Episode number')
            plt.ylabel('Episode reward')
            plt.show()
            disp.clear_output(wait=True)
finally:
    log_f.close()
 


# Unsuccessful attempt at PER...
#### Retained for reference / proof of experimentation attempts


In [None]:
# below code based on https://github.com/djmax008/GEIRINA_baseline/blob/master/prioritized_memory.py, under MIT license

class SumTree():
    """Array-backed binary sum tree used for priority-proportional sampling.

    The tree has `capacity` leaves stored at indices capacity-1 .. 2*capacity-2
    of `self.tree`; every internal node holds the sum of its two children, so
    the root (`self.tree[0]`) is the total priority. `self.data` stores the
    payload belonging to each leaf. Writes wrap around once `capacity` items
    have been added, overwriting the oldest slot.

    Args:
        capacity: number of leaves; cast to int.

    Raises:
        ValueError: if `capacity` is smaller than 1.
    """

    # Class-level defaults kept for backward compatibility; every instance
    # gets its own counters in __init__.
    data_pointer = 0
    data_length = 0

    def __init__(self, capacity):
        self.capacity = int(capacity)
        if self.capacity < 1:
            raise ValueError("capacity must be a positive integer")
        # Size the arrays from the int-cast capacity so a float such as 1e6 works.
        self.tree = np.zeros(2 * self.capacity - 1)
        self.data = np.zeros(self.capacity, dtype=object)
        self.data_pointer = 0  # next data slot to write
        self.data_length = 0   # number of filled slots, capped at capacity

    def __len__(self):
        """Number of stored items (at most `capacity`)."""
        return self.data_length

    def add(self, data, priority):
        """Store `data` with `priority` in the next slot, wrapping when full."""
        tree_index = self.data_pointer + self.capacity - 1
        self.data[self.data_pointer] = data
        self.update(tree_index, priority)
        self.data_pointer += 1
        if self.data_pointer >= self.capacity:
            self.data_pointer = 0
        if self.data_length < self.capacity:
            self.data_length += 1

    def update(self, tree_index, priority):
        """Set the priority at `tree_index` and propagate the change to the root."""
        change = priority - self.tree[tree_index]
        self.tree[tree_index] = priority

        while tree_index != 0:
            tree_index = (tree_index - 1) // 2  # parent node
            self.tree[tree_index] += change

    def get_leaf(self, v):
        """Walk down from the root to the leaf whose cumulative range contains `v`.

        Returns:
            Tuple (leaf_index, priority, data) for the selected leaf.
        """
        parent_index = 0

        while True:
            left_child_index = 2 * parent_index + 1
            right_child_index = left_child_index + 1

            if left_child_index >= len(self.tree):
                # No children inside the array: this node is a leaf.
                leaf_index = parent_index
                break

            if v <= self.tree[left_child_index]:
                parent_index = left_child_index
            else:
                # Skip the left subtree's mass and continue on the right.
                v -= self.tree[left_child_index]
                parent_index = right_child_index

        data_index = leaf_index - self.capacity + 1

        return leaf_index, self.tree[leaf_index], self.data[data_index]

    @property
    def total_priority(self):
        """Sum of all leaf priorities (the value stored at the root)."""
        return self.tree[0]

class PER():
    """Prioritised experience replay memory backed by a SumTree.

    Args:
        capacity: maximum number of samples held by the underlying tree.
    """

    epsilon = 0.01  # added to errors so no priority is exactly zero
    alpha = 0.6  # exponent applied to the error when computing a priority
    beta = 0.4  # importance-sampling exponent, increased towards 1 on every sample()
    beta_increment_per_sampling = 1e-4  # amount added to beta per sample() call
    absolute_error_upper = 1.  # upper bound used when clipping priorities

    def __init__(self, capacity):
        self.tree = SumTree(capacity)

    def __len__(self):
        """Number of samples currently stored."""
        return len(self.tree)

    def is_full(self):
        """True once the tree holds `capacity` samples."""
        return len(self.tree) >= self.tree.capacity

    def add(self, sample, error = None):
        """Store `sample`.

        Without `error` the sample gets the largest priority currently in the
        tree (or `absolute_error_upper` when all priorities are zero);
        otherwise the priority is derived from |error|.
        """
        if error is None:
            priority = np.amax(self.tree.tree[-self.tree.capacity:])
            if priority == 0: priority = self.absolute_error_upper
        else:
            priority = min((abs(error) + self.epsilon) ** self.alpha, self.absolute_error_upper)
        self.tree.add(sample, priority)

    def sample(self, n):
        """Draw `n` samples, one from each of `n` equal slices of the total priority.

        Returns:
            Tuple (idxs, minibatch, is_weights): tree indices of the chosen
            leaves, their stored data, and importance-sampling weights
            normalised by the largest possible weight.

        Raises:
            ValueError: if nothing has been stored yet.
        """
        filled = len(self.tree)
        total_priority = self.tree.total_priority
        if filled == 0 or total_priority <= 0:
            raise ValueError("cannot sample from an empty prioritised replay buffer")

        minibatch = []

        idxs = np.empty((n,), dtype=np.int32)
        is_weights = np.empty((n,), dtype=np.float32)

        priority_segment = total_priority / n

        self.beta = np.amin([1., self.beta + self.beta_increment_per_sampling])  # max = 1

        # Only filled leaves count towards the minimum. Unfilled leaves hold
        # priority 0, which would make max_weight infinite and every
        # importance weight 0 until the buffer is completely full.
        first_leaf = self.tree.capacity - 1
        stored = self.tree.tree[first_leaf:first_leaf + filled]
        stored = stored[stored > 0]
        p_min = np.amin(stored) / total_priority
        max_weight = (p_min * n) ** (-self.beta)

        for i in range(n):

            # Sample uniformly inside the i-th priority slice.
            a, b = priority_segment * i, priority_segment * (i + 1)
            value = np.random.uniform(a, b)

            index, priority, data = self.tree.get_leaf(value)

            sampling_probabilities = priority / total_priority
            is_weights[i] = np.power(n * sampling_probabilities, -self.beta)/ max_weight

            idxs[i]= index
            minibatch.append(data)

        return idxs, minibatch, is_weights

    def batch_update(self, idxs, errors):
        """Recompute the priorities of the leaves `idxs` from new `errors`."""
        # Use the magnitude, as add() does; a negative error raised to a
        # fractional power would otherwise produce NaN priorities.
        errors = np.abs(np.asarray(errors, dtype=float)) + self.epsilon
        clipped_errors = np.minimum(errors, self.absolute_error_upper)
        ps = np.power(clipped_errors, self.alpha)

        for idx, p in zip(idxs, ps):
            self.tree.update(idx, p)