**Initialise**

In [None]:
# This is a Deep Q-Network (DQN) agent with a prioritized experience replay memory and a target network.
# The DQN uses a stack of 4 greyscale 84x84 frames as its state, so that it can infer movement and learn efficiently.
# The architecture and parameters are based on the original DQN paper, https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf
# A proportional prioritized experience replay is used (sampling probability p_i^alpha / sum_k p_k^alpha), as introduced in https://arxiv.org/abs/1511.05952
# Due to RAM restrictions on Colab we use a buffer limit of 50000. Preferably this would be larger, and preferably we would train for much longer (>10 million frames).
# Code inspired by https://github.com/higgsfield/RL-Adventure/blob/master/4.prioritized%20dqn.ipynb

# imports
import gym
import math 
import collections
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# hyperparameters
learning_rate = 0.0001  # step size handed to the Adam optimizer
gamma         = 0.99    # discount factor applied to the bootstrapped target in train()
buffer_limit  = 50000   # maximum number of transitions kept in the replay buffer
batch_size    = 32      # number of transitions sampled per gradient update
video_every   = 25      # the Monitor wrapper records a video every this many episodes
print_every   = 5       # a progress line is printed every this many episodes
save_model    = True  # Disable this if you don't want to save models to drive


# Google Drive is only mounted when checkpoints are going to be written to it.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime,
# so set save_model = False when running elsewhere.
if save_model: 
    from google.colab import drive 
    drive.mount('/content/drive') 

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Gym Wrappers**
Taken from https://github.com/higgsfield/RL-Adventure/blob/master/common/wrappers.py

In [None]:
class FireResetEnv(gym.Wrapper):
    """Wrapper for environments where FIRE must be pressed to start the game.

    After every reset the wrapper takes action 1 (asserted to be 'FIRE')
    followed by action 2, and returns the observation the agent should act on.
    """

    def __init__(self, env=None):
        """Wrap ``env``; it must expose at least 3 actions with action 1 == 'FIRE'."""
        super(FireResetEnv, self).__init__(env)
        assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
        assert len(env.unwrapped.get_action_meanings()) >= 3

    def step(self, action):
        """Forward ``action`` to the wrapped environment unchanged."""
        return self.env.step(action)

    def reset(self, **kwargs):
        """Reset the wrapped environment and press FIRE to start the game.

        Any keyword arguments are forwarded to the wrapped ``reset``.
        Returns the first observation of the started episode.
        """
        self.env.reset(**kwargs)
        obs, _, done, _ = self.env.step(1)
        if done:
            # The observation is overwritten by the next step, so it is not kept.
            self.env.reset(**kwargs)
        obs, _, done, _ = self.env.step(2)
        if done:
            # Keep the fresh observation: returning the one from the step above
            # would hand the caller the last frame of an already finished episode.
            obs = self.env.reset(**kwargs)
        return obs


class NoopResetEnv(gym.Wrapper):
    """Randomise the initial state by playing a number of no-op actions on reset.

    Action 0 is asserted to be 'NOOP' in the wrapped environment.
    """

    def __init__(self, env, noop_max=30):
        """Wrap ``env``; at most ``noop_max`` no-ops are played after each reset."""
        super().__init__(env)
        self.noop_max = noop_max
        # When set, this fixed count replaces the random draw in reset().
        self.override_num_noops = None
        self.noop_action = 0
        assert env.unwrapped.get_action_meanings()[0] == 'NOOP'

    def reset(self, **kwargs):
        """Reset, then play between 1 and ``noop_max`` no-ops; return the last observation."""
        self.env.reset(**kwargs)

        noops = self.override_num_noops
        if noops is None:
            noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) #pylint: disable=E1101
        assert noops > 0

        obs = None
        remaining = noops
        while remaining > 0:
            obs, _, done, _ = self.env.step(self.noop_action)
            if done:
                # The episode ended during the no-ops: start over and keep going.
                obs = self.env.reset(**kwargs)
            remaining -= 1
        return obs

    def step(self, ac):
        """Forward ``ac`` to the wrapped environment unchanged."""
        return self.env.step(ac)

**Model architecture + Prioritized experience**

In [None]:
class PrioritizedReplayBuffer():
    """Fixed-capacity replay memory with proportional prioritized sampling.

    Transitions are stored in a circular list and their priorities in a
    parallel array, so slot ``i`` of ``buffer`` always belongs to
    ``priorities[i]`` -- also after the buffer has wrapped around.

    Args:
        prob_alpha: exponent applied to priorities when forming sampling
            probabilities (0 gives uniform sampling).
        capacity: maximum number of stored transitions. Defaults to the
            notebook-level ``buffer_limit``.
    """

    def __init__(self, prob_alpha=0.6, capacity=None):
        self.capacity = buffer_limit if capacity is None else int(capacity)
        # A plain list overwritten in place keeps indices stable; a deque with
        # maxlen shifts every element on overflow and desynchronises them from
        # the circular priority array.
        self.buffer = []
        self.priorities = np.zeros((self.capacity,), dtype=np.float32)
        self.pos = 0  # slot the next transition is written to
        self.prob_alpha = prob_alpha

    def put(self, transition):
        """Store ``transition`` with the current maximum priority (1.0 if empty)."""
        max_prio = self.priorities.max() if self.buffer else 1.0
        if len(self.buffer) < self.capacity:
            self.buffer.append(transition)
        else:
            # Buffer is full: overwrite the oldest slot.
            self.buffer[self.pos] = transition
        self.priorities[self.pos] = max_prio
        self.pos = (self.pos + 1) % self.capacity

    def sample(self, n, beta=0.4):
        """Draw ``n`` transitions with probability proportional to priority**alpha.

        Args:
            n: number of transitions to draw (with replacement).
            beta: exponent of the importance-sampling correction.

        Returns:
            ``(s, a, r, s_prime, done_mask, indices, weights)`` where the first
            five entries are tensors with one row per sampled transition,
            ``indices`` are the sampled buffer slots and ``weights`` are the
            float32 importance-sampling weights normalised so their max is 1.

        Raises:
            ValueError: if the buffer is empty.
        """
        total = len(self.buffer)
        if total == 0:
            raise ValueError("cannot sample from an empty replay buffer")

        # Only the first `total` slots hold valid priorities. float64 keeps
        # the normalised probabilities summing to 1 within numpy's tolerance.
        prios = self.priorities[:total].astype(np.float64)
        probs = prios ** self.prob_alpha  # p_i^alpha
        probs /= probs.sum()              # P(i) = p_i^alpha / sum_k p_k^alpha
        indices = np.random.choice(total, n, p=probs)
        samples = [self.buffer[idx] for idx in indices]

        # w_i = (N * P(i))^(-beta), normalised by the largest weight in the batch
        weights = (total * probs[indices]) ** (-beta)
        weights = weights / weights.max()
        weights = np.array(weights, dtype=np.float32)

        s_lst, a_lst, r_lst, s_prime_lst, done_mask_lst = [], [], [], [], []
        for s, a, r, s_prime, done_mask in samples:
            s_lst.append(s)
            a_lst.append([a])
            r_lst.append([r])
            s_prime_lst.append(s_prime)
            done_mask_lst.append([done_mask])

        return torch.stack(s_lst), torch.tensor(a_lst), torch.tensor(r_lst), \
               torch.stack(s_prime_lst), torch.tensor(done_mask_lst), indices, weights

    def update_priorities(self, batch_indices, batch_priorities):
        """Set the priority of each slot in ``batch_indices``.

        Each entry of ``batch_priorities`` may be a scalar or an array; for an
        array its maximum is used.
        """
        for idx, prio in zip(batch_indices, batch_priorities):
            self.priorities[idx] = float(np.max(prio))

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

class QNetwork(nn.Module):
    """Convolutional Q-network mapping a 4-channel 84x84 input to one Q-value per action.

    Args:
        n_actions: size of the output layer. Defaults to the action count of
            the notebook-level ``env`` (``env.action_space.n``).
    """

    def __init__(self, n_actions=None):
        super(QNetwork, self).__init__()
        # Fall back to the global environment so QNetwork() keeps working.
        self.n_actions = env.action_space.n if n_actions is None else int(n_actions)

        self.conv = nn.Sequential(
            nn.Conv2d(4, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
        )

        # An 84x84 input shrinks to 20x20, 9x9 and finally 7x7 through the
        # three convolutions, hence the 7*7*64 flattened features.
        self.linear = nn.Sequential(
            nn.Linear(7*7*64, 512),
            nn.ReLU(),
            nn.Linear(512, self.n_actions)
        )

    def forward(self, x):
        """Return Q-values of shape (batch, n_actions) for ``x`` of shape (batch, 4, 84, 84)."""
        x = self.conv(x)
        x = x.view(x.shape[0], -1)  # flatten everything but the batch dimension
        return self.linear(x)

    def sample_action(self, obs, epsilon):
        """Pick an action epsilon-greedily.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the index of the largest Q-value for ``obs``.
        """
        if random.random() < epsilon:
            return random.randint(0, self.n_actions - 1)
        # The network is only evaluated when its output is actually used, and
        # without autograd bookkeeping since no gradient is taken here.
        with torch.no_grad():
            return self.forward(obs).argmax().item()
            
def train(q, q_target, memory, optimizer, beta, n_updates=10):
    """Run ``n_updates`` prioritized DQN gradient steps on ``q``.

    Args:
        q: online Q-network being optimised.
        q_target: target network used for the bootstrapped value; it is only
            evaluated, never updated here.
        memory: replay buffer providing ``sample(n, beta)`` and
            ``update_priorities(indices, priorities)``.
        optimizer: optimizer over the parameters of ``q``.
        beta: importance-sampling exponent forwarded to ``memory.sample``.
        n_updates: number of minibatch updates to perform per call.
    """
    for _ in range(n_updates):
        s, a, r, s_prime, done_mask, indices, weights = memory.sample(batch_size, beta)
        # Drop the singleton axis each stored state carries.
        s, s_prime = torch.squeeze(s), torch.squeeze(s_prime)

        q_a = q(s).gather(1, a)  # Q(s, a) of the actions actually taken, shape (B, 1)

        # The target is a constant with respect to the optimised parameters,
        # so no graph is built through the target network.
        with torch.no_grad():
            max_q_prime = q_target(s_prime).max(1)[0].unsqueeze(1)
            target = r + gamma * max_q_prime * done_mask  # done_mask is 0 on terminal steps

        td_loss = F.smooth_l1_loss(q_a, target, reduction='none')  # shape (B, 1)

        # Weights come back with shape (B,). Without the extra axis they would
        # broadcast against (B, 1) into a (B, B) matrix, pairing every loss
        # with every weight instead of each sample with its own weight.
        is_weights = torch.from_numpy(weights).to(td_loss.dtype).unsqueeze(1)
        loss = (td_loss * is_weights).mean()

        # New priorities are the unweighted per-sample losses plus a small
        # constant so that no transition ends up with zero probability.
        new_prios = td_loss.detach() + 1e-5

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        memory.update_priorities(indices, new_prios.cpu().numpy())

In [None]:
# Annealing schedules for exploration (epsilon) and importance sampling (beta).
# Both are driven by frame_idx * 0.1, i.e. they progress ten times more slowly
# than the frame counter, to accommodate training every 10 frames rather than
# every frame.

# Epsilon
epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 30000


def epsilon_by_frame(frame_idx):
    """Exponentially decay epsilon from epsilon_start towards epsilon_final."""
    decay = math.exp(-1. * (frame_idx*0.1) / epsilon_decay)
    return epsilon_final + (epsilon_start - epsilon_final) * decay


# Beta
beta = 0.4
beta_start = 0.4
beta_frames = 100000


def beta_by_frame(frame_idx):
    """Linearly increase beta from beta_start, capped at 1.0."""
    scaled_frames = frame_idx*0.1
    annealed = beta_start + scaled_frames * (1.0 - beta_start) / beta_frames
    return min(1.0, annealed)

**Train**

In [None]:
from gym.wrappers import GrayScaleObservation, ResizeObservation, FrameStack

# Set up the Gravitar-v0 environment and record a video every `video_every` episodes.
env = gym.make('Gravitar-v0')
env = gym.wrappers.Monitor(env, "./video", video_callable=lambda episode_id: (episode_id%video_every)==0,force=True)


# reproducible environment and action spaces, do not change lines 6-11 here (tools > settings > editor > show line numbers)
seed = 742
torch.manual_seed(seed)
env.seed(seed)
random.seed(seed)
np.random.seed(seed)
env.action_space.seed(seed)

# Online network and a target network that starts as an exact copy of it.
q = QNetwork()
q_target = QNetwork()
q_target.load_state_dict(q.state_dict())

# Load state dict from memory 
#q.load_state_dict(torch.load(f"drive/MyDrive/q-models/q-300.pth"))
#q_target.load_state_dict(torch.load(f"drive/MyDrive/q-models/q_target-300.pth"))
####

memory = PrioritizedReplayBuffer()
optimizer = optim.Adam(q.parameters(), lr=learning_rate)

frame_idx = 0 # set to where colab crashes, else 0  
score = 0.0 
marking = [] 
replay_initial = 10000  # training only starts once the buffer holds more than this many transitions

# Apply wrappers
env = NoopResetEnv(env)
env = GrayScaleObservation(env) 
env = ResizeObservation(env, (84,84)) 
env = FrameStack(env, num_stack=4)
env = FireResetEnv(env)  

# Effectively an endless loop over episodes; stop it by interrupting the kernel.
for n_episode in range(int(1e32)):
    s = env.reset() 
    done = False 
    score = 0.0 

    while True: 
        frame_idx += 1 
        epsilon = epsilon_by_frame(frame_idx) # exponential decay from epsilon_start towards epsilon_final
        # Move the last axis to the front; presumably this turns a (4, 84, 84, 1)
        # frame stack into a (1, 4, 84, 84) batch of one -- TODO confirm.
        s = torch.from_numpy(np.moveaxis(s, [0,1,2,3], [1,2,3,0])).float()
        a = q.sample_action(s, epsilon)

        s_prime, r, done, info = env.step(a) 
        done_mask = 0.0 if done else 1.0  # zeroes the bootstrapped term in train() on terminal steps
        # The stored reward is scaled by 1/100; `score` below keeps the raw reward.
        memory.put((s,a,r/100.0,(torch.from_numpy(np.moveaxis(s_prime, [0,1,2,3], [1,2,3,0])).float()), done_mask))
        
        s = s_prime 
        score += r 

        # NOTE(review): breaking here skips the training and target-sync checks
        # below on the final frame of every episode.
        if done:
          break

        if memory.size() > replay_initial and frame_idx % 10 == 0: # Train/learn every 10 frames to speed up training. Ideally train on every frame
            beta = beta_by_frame(frame_idx)
            train(q, q_target, memory, optimizer, beta)
        
        # Copy the online weights into the target network every 1000 frames.
        if frame_idx % 1000 == 0:
            q_target.load_state_dict(q.state_dict())
        
    
    # do not change lines 44-48 here, they are for marking the submission log
    marking.append(score)
    if n_episode%100 == 0:
        print("marking, episode: {}, score: {:.1f}, mean_score: {:.2f}, std_score: {:.2f}".format(
            n_episode, score, np.array(marking).mean(), np.array(marking).std()))
        marking = []
    
    # Checkpoint both networks every 10 episodes.
    # NOTE(review): the path is relative and the q-models directory is not created here -- confirm it exists.
    if n_episode % 10 == 0 and save_model == True:
        torch.save(q.state_dict(), f"drive/MyDrive/q-models/q-{n_episode}.pth")
        torch.save(q_target.state_dict(), f"drive/MyDrive/q-models/q_target-{n_episode}.pth")

    if n_episode%print_every==0 and n_episode!=0:
        print("episode: {}, score: {:.1f}, epsilon: {:.2f}, beta: {:.2f}, frame_idx : {:.1f}".format(n_episode, score, epsilon, beta, frame_idx))
          

marking, episode: 0, score: 100.0, mean_score: 100.00, std_score: 0.00
episode: 5, score: 500.0, epsilon: 0.45, beta: 0.40, frame_idx : 241145.0
episode: 10, score: 0.0, epsilon: 0.45, beta: 0.55, frame_idx : 245680.0
episode: 15, score: 0.0, epsilon: 0.44, beta: 0.55, frame_idx : 249464.0
episode: 20, score: 100.0, epsilon: 0.44, beta: 0.55, frame_idx : 252913.0
episode: 25, score: 350.0, epsilon: 0.43, beta: 0.55, frame_idx : 256716.0
