### Introduction

In [1]:
'''
This is my Reinforcement Learning submission for playing the notoriously hard
Gravitar game, Atari. In all honesty, when familiarising myself with and playing the
game myself, I struggled to get a score higher than 200, so I suspected a simple
DQN approach would be quite pathetic, and it was time for the big guns,
as backed by research papers.  Although relatively old, a relevant paper I found
an interesting read is https://arxiv.org/pdf/1710.02298.pdf.

I was really determined to go down the policy gradient appraoch initially, and
got together an A2C Actor Critic algorithm. The majority of my time was spent trying
to implement a Self Imitation Learning (SIL) component to this, from the paper 
https://arxiv.org/abs/2012.11989. I unfortunately struggled to get it working, 
and am quite disappointed about that. Because my stubborn nature desperately 
wanted to get it working, by the time I decided to try a different approach, 
I was running out of time. So although I flowered up a regular DQN algorithm,
my final product isn't as complex as I would have liked, so I apologise.

My final implementation made use of https://arxiv.org/abs/1511.06581, the initial
RL assignment template, https://github.com/dxyang/DQN_pytorch and 
https://becominghuman.ai/lets-build-an-atari-ai-part-1-dqn-df57e8ff3b26.

Of course, not expecting marks from this, but if interested in scanning over 
my A2C SIL attempt, it is here (very messy admittedly): 
https://colab.research.google.com/drive/1aMrBq2mTJdHaCcUvmENte7HXzaPQ3OtO?usp=sharing

Thanks for reading over and marking! This has been super enjoyable and a real challenge.

'''

"\nThis is my Reinforcement Learning submission for playing the notoriously hard\nGravitar game, Atari. In all honesty, when familiarising myself with and playing the\ngame myself, I struggled to get a score higher than 200, so I suspected a simple\nDQN approach would be quite pathetic, and it was time for the big guns,\nas backed by research papers.  Although relatively old, a relevant paper I found\nan interesting read is https://arxiv.org/pdf/1710.02298.pdf.\n\nI was really determined to go down the policy gradient appraoch initially, and\ngot together an A2C Actor Critic algorithm. The majority of my time was spent trying\nto implement a Self Imitation Learning (SIL) component to this, from the paper \nhttps://arxiv.org/abs/2012.11989. I unfortunately struggled to get it working, \nand am quite disappointed about that. Because my stubborn nature desperately \nwanted to get it working, by the time I decided to try a different approach, \nI was running out of time. So although I flow

### Code

In [2]:
import gym
import cv2
import time
import json
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import collections
from collections import deque

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
video_every = 7
print_every = 1
env = gym.make("Gravitar-v0")
env = gym.wrappers.Monitor(env, "./video", video_callable=lambda episode_id: (episode_id%video_every)==0,force=True)
seed = 742
torch.manual_seed(seed)
env.seed(seed)
random.seed(seed)
np.random.seed(seed)
env.action_space.seed(seed)
gamma = 0.98
lr = 0.00025
epsilon = 1
eps_decay = 0.9998
eps_min = 0.025
batch_size = 32
buffer_limit = 1000
h = env.observation_space.shape[0]
w = env.observation_space.shape[1]

In [4]:
def resize_decolourise(image):
    image = image[::2, ::2]
    image = np.mean(image, axis = 2).astype(np.uint8) // 2
    return image
def get_tensor(pre_tensor, dtype):
    if dtype == "float":
        return torch.FloatTensor(pre_tensor).to(device)
    elif dtype == "long":
        return torch.LongTensor(pre_tensor).to(device)

In [5]:
class ReplayBuffer():
  def __init__(self):
    self.buffer = collections.deque(maxlen=buffer_limit)
  
  def size(self):
        return len(self.buffer)

  def put(self, s, action, r, s_prime, done):
    self.buffer.append([s[None, :], action, r, s_prime[None, :], done])

In [6]:
# simple block of convolution, batchnorm, and relu
class ConvBlock(nn.Module):
    def __init__(self, in_f, out_f, in_kernel, in_stride):
        super(ConvBlock, self).__init__()
        self.f = nn.Sequential(
            nn.Conv2d(in_f, out_f, kernel_size=in_kernel, stride = in_stride),
            nn.BatchNorm2d(out_f),
            nn.ReLU(inplace=True)
        )
    def forward(self,x):
        return self.f(x)
        
# simple block of convolution, batchnorm, and leakyrelu
class LinearBlock(nn.Module):
    def __init__(self, in_f, out_f):
        super(LinearBlock, self).__init__()
        self.f = nn.Sequential(
            nn.Linear(in_features=in_f, out_features=128),
            nn.LeakyReLU(inplace=True),
            nn.Linear( 128, out_f)
        )
    def forward(self,x):
        return self.f(x)

In [7]:
class ConvDuelingDQN(nn.Module):
    def __init__(self, h, w, output_size):
        super(ConvDuelingDQN, self).__init__()
        self.conv = nn.Sequential(
            ConvBlock(4, 32, 8, 4),
            ConvBlock(32, 64, 4, 2),
            ConvBlock(64, 64, 3, 1)
        )

        self.advantage = nn.Sequential(
            LinearBlock(3456, output_size) #3456 is a result of 6 * 9 *64
        )
        self.value = nn.Sequential(
            LinearBlock(3456, 1)
        )

    def forward(self, x):
        x = self.conv(x)
        x = x.view(x.size(0), -1)
        state_advantage = self.advantage(x)
        state_value = self.value(x)
        return state_value + (state_advantage - state_advantage.mean())

In [8]:
class ActModel():
    def __init__(self):
        self.primary_model = ConvDuelingDQN(h=105, w=80, output_size=env.action_space.n).to(device)
        self.target_model = ConvDuelingDQN(h=105, w=80, output_size=env.action_space.n).to(device)
        self.target_model.load_state_dict(self.primary_model.state_dict())
        self.target_model.eval()
        self.optimizer = optim.Adam(self.primary_model.parameters(), lr=lr)

    def sample_action(self, s):
        if random.uniform(0, 1) <= epsilon:
            return random.randrange(env.action_space.n)
        else:
            with torch.no_grad():
                s = get_tensor(s, "float").unsqueeze(0)
                q_values = self.primary_model.forward(s)
                return torch.argmax(q_values).item()

    def train(self, memory):
        s, action, r, s_prime, done = zip(*random.sample(memory.buffer, batch_size))
        s = np.concatenate(s)
        s_prime = np.concatenate(s_prime)

        s = get_tensor(s, "float")
        s_prime = get_tensor(s_prime, "float")
        action = get_tensor(action, "long")
        r = get_tensor(r, "float")
        done = get_tensor(done, "float")

        s_q = self.primary_model(s)
        s_prime_q = self.primary_model(s_prime)
        s_prime_target_q= self.target_model(s_prime)

        current_q = s_q.gather(1, action.unsqueeze(1)).squeeze(1)
        s_prime_target_q = s_prime_target_q.gather(1, s_prime_q.max(1)[1].unsqueeze(1)).squeeze(1)
        
        #Bellman equation
        expected_q = r + gamma * s_prime_target_q * (1 - done)

        loss = (current_q - expected_q.detach()).pow(2).mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        

In [None]:
actmodel = ActModel()
memory = ReplayBuffer()
marking = []
last_100_r = deque(maxlen=100)
cumulative_steps = 1 

for n_episode in range(int(1e32)):
    s = resize_decolourise(env.reset()) 
    s = np.stack((s, s, s, s))

    r_total = 0
    episode_steps = 1
    while True:

        action = actmodel.sample_action(s)  # Act
        s_prime, r, done, _ = env.step(action)  # Observe
        s_prime = resize_decolourise(s_prime)  # Process image
        s_prime = np.stack((s_prime, s[0], s[1], s[2]))

        # Store the transition in memory
        memory.put(s, action, r, s_prime, done)  # Store to mem

        # Move to the next state
        s = s_prime  # Update state

        # Perform one step of the optimization (on the target network)
        if memory.size() > 20000:
            actmodel.train(memory)

        r_total += r
        cumulative_steps += 1
        episode_steps+=1
        if cumulative_steps % 1000 == 0:
            epsilon = max(eps_min, epsilon*eps_decay )
        if done:
            break

    # Train model
    actmodel.target_model.load_state_dict(actmodel.primary_model.state_dict())  # Update target model
    last_100_r.append(r_total)
    score = r_total
    marking.append(score)

    if n_episode%100 == 0:
        print("marking, episode: {}, score: {:.1f}, mean_score: {:.2f}, std_score: {:.2f}".format(
            n_episode, score, np.array(marking).mean(), np.array(marking).std()))
        marking = []

    if n_episode%print_every==0:
        print("Episode:{} Score:{:.2f} Last_100_Avg_Rew:{:.3f} Epsilon:{:.2f} Steps:{}".format(
            n_episode, score, np.mean(last_100_r), epsilon, episode_steps))

marking, episode: 0, score: 200.0, mean_score: 200.00, std_score: 0.00
Episode:0 Score:200.00 Last_100_Avg_Rew:200.000 Epsilon:1.00 Steps:739
Episode:1 Score:0.00 Last_100_Avg_Rew:100.000 Epsilon:1.00 Steps:739
Episode:2 Score:250.00 Last_100_Avg_Rew:150.000 Epsilon:1.00 Steps:747
Episode:3 Score:250.00 Last_100_Avg_Rew:175.000 Epsilon:1.00 Steps:764
Episode:4 Score:350.00 Last_100_Avg_Rew:210.000 Epsilon:1.00 Steps:753
Episode:5 Score:0.00 Last_100_Avg_Rew:175.000 Epsilon:1.00 Steps:751
Episode:6 Score:250.00 Last_100_Avg_Rew:185.714 Epsilon:1.00 Steps:748
Episode:7 Score:0.00 Last_100_Avg_Rew:162.500 Epsilon:1.00 Steps:754
Episode:8 Score:350.00 Last_100_Avg_Rew:183.333 Epsilon:1.00 Steps:1170
Episode:9 Score:250.00 Last_100_Avg_Rew:190.000 Epsilon:1.00 Steps:1233
Episode:10 Score:0.00 Last_100_Avg_Rew:172.727 Epsilon:1.00 Steps:753
Episode:11 Score:350.00 Last_100_Avg_Rew:187.500 Epsilon:1.00 Steps:821
Episode:12 Score:100.00 Last_100_Avg_Rew:180.769 Epsilon:1.00 Steps:790
Episode:1