# Deep Reinforcement Learning
In this lab we will implement and train an agent that uses deep learning to balance the pole in `CartPole-v1`.

## Setup
----
We import useful packages: `gym`, `torch` stuff, etc..

Imports:

In [1]:
import gym

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from collections import deque  # for memory
from tqdm import tqdm          # for progress bar

How the game looks (without our agent):

In [2]:
# Baseline demo: watch a purely random policy play ten rendered episodes.
env = gym.make('CartPole-v1', render_mode='human')
try:
    for _ in tqdm(range(10)):
        state, _ = env.reset()
        terminated = truncated = False
        # env.step reports two separate end-of-episode flags; stop on either
        # one so we never keep stepping an episode that has already ended.
        while not (terminated or truncated):
            action = env.action_space.sample()
            state, reward, terminated, truncated, _ = env.step(action)
finally:
    # Close the render window even if an episode raises or is interrupted.
    env.close()

100%|██████████| 10/10 [00:04<00:00,  2.10it/s]


## DQN Algorithm
-------------
We train a policy that tries to maximize the discounted,
cumulative reward
$R_{t_0} = \sum_{t=t_0}^{\infty} \gamma^{t - t_0} r_t$, where
$R_{t_0}$ is known as the *return*. The discount factor $\gamma$ is a constant between $0$ and $1$ that makes rewards in the far future count less than immediate ones.


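Q-learning tries to find a function
$Q^*: State \times Action \rightarrow \mathbb{R}$ that gives the return for taking an action in a given state and acting optimally afterwards. With it, the policy that maximizes rewards is: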

\begin{align}\pi^*(s) = \arg\!\max_a \ Q^*(s, a)\end{align}

However, we don't know $Q^*$. Since neural networks are universal function approximators, we can simply create one and train it to resemble $Q^*$.

For our training update rule, we'll use a fact that every $Q$
function for some policy obeys the Bellman equation:

\begin{align}Q^{\pi}(s, a) = r + \gamma Q^{\pi}(s', \pi(s'))\end{align}

The difference between the two sides of the equality is known as the temporal difference error, $\delta$:

\begin{align}\delta = Q(s, a) - (r + \gamma \max_a Q(s', a))\end{align}

### Model
---
Make a deep learning based policy model, that takes in a state and outputs an action.
This model will be an attribute of the Agent we make next.

In [3]:
# class Model(nn.Module):
#     def __init__(self, observation_size, action_size):
#         super(Model, self).__init__()
#         # initialise layers here
#
#     def forward(self, x):
#         # send x through the network
#         return x


### DQN Agent
----
We will be using experience replay memory for training our model.
An Agent's memory is as important as its model, and will be another attribute of our agent.
Other appropriate attributes are the hyperparameters (gamma, lr, etc.).
Give the agent a replay method that trains on a batch from its memory.


In [4]:
# class Agent:
#     def __init__(self, observation_size, action_size):
#         self.observation_size=observation_size
#         self.action_size = action_size
#         self.criterion = nn.MSELoss()
#         self.model = Model(observation_size, action_size)
#         self.optimizer = optim.Adam(self.model.parameters(), lr=1e-3)  # needs self.model, so create the model first
#         self.memory = None # memory that stores N most new transitions
#         # good place to store hyperparameters as attributes
#
#     def remember(self, state, action, reward, next_state, done):
#         # add to memory
#
#     def act(self, state):
#         # return an action from the model
#
#     def replay(self, batch_size):
#         # update model based on replay memory
#         # you might want to make a self.train() helper method


### Main Training loop
---
Make a function that takes an environment and an agent, and runs through $n$ episodes.
Remember to call that agent's replay function to learn from its past (once it has a past).


In [5]:
# def train(env, agent, episodes=1000, batch_size=64):  # train for many games
#     for _ in tqdm(range(episodes)):
#         state, _ = env.reset()
#         done = False
#         while not done:
#             # 1. make a move in game.
#             # 2. have the agent remember stuff.
#             # 3. update state
#             # 4. if we have enough experiences in our memory, learn from a batch with replay.
#             if len(agent.memory) >= batch_size:
#                 agent.replay(batch_size)
#     env.close()

### Putting it together
---
We train an agent on the environment:


In [6]:
# env = gym.make('CartPole-v1')  # , render_mode='human')
# agent = Agent(env.observation_space.shape[0], env.action_space.n)
# train(env, agent)
# # torch.save(agent.model.state_dict(), 'model.pth')

### My own test

In [2]:
#Model
class Model(nn.Module):
    """Fully connected Q-network: maps a state vector to one value per action.

    Args:
        observation_space: Number of features in a single state vector.
        action_space: Number of discrete actions (width of the output layer).
    """

    def __init__(self, observation_space, action_space):
        super().__init__()
        self.fc1 = nn.Linear(observation_space, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, action_space)

    def forward(self, x):
        """Return the raw Q-value estimates for ``x`` (one per action)."""
        # Non-linearities between the layers are essential: without them the
        # three stacked linear layers collapse into a single affine map and
        # the network cannot represent a non-linear Q-function.
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # No activation on the output: Q-values are unbounded real numbers.
        return self.fc3(x)

In [3]:
#agent
from collections import deque
import numpy as np
import random

class Agent:
    """Epsilon-greedy DQN agent with an experience-replay memory.

    Args:
        epsilon: Initial probability of taking a random (exploratory) action.
        epsilon_decay: Factor epsilon is multiplied by after each replay update.
        epsilon_min: Floor below which the decay never pushes epsilon.
        gamma: Discount factor applied to the bootstrapped next-state value.
        learning_rate: Learning rate of the Adam optimizer.
        observation_size: Length of a state vector (default 4, as in CartPole).
        action_size: Number of discrete actions (default 2, as in CartPole).
        memory_size: Maximum number of transitions kept in the replay memory.
    """

    def __init__(self, epsilon=1.0, epsilon_decay=0.99, epsilon_min=0.1, gamma=0.9,
                 learning_rate=0.001, observation_size=4, action_size=2,
                 memory_size=2000):
        self.observation_size = observation_size
        self.action_size = action_size
        self.device = torch.device('cpu')
        self.model = Model(observation_space=observation_size,
                           action_space=action_size).to(self.device)
        # Bounded deque: the oldest transitions are dropped automatically.
        self.memory = deque(maxlen=memory_size)
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.gamma = gamma
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)
        self.criterion = nn.MSELoss()

    def choose_action(self, state):
        """Return an action index for ``state`` using an epsilon-greedy rule.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest predicted Q-value.
        """
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.action_size)
        # Force float32 so float64 observations do not clash with the
        # model's float32 weights.
        state_t = torch.as_tensor(state, dtype=torch.float32, device=self.device)
        # Pure inference: no autograd graph is needed to pick an action.
        with torch.no_grad():
            return int(torch.argmax(self.model(state_t)).item())

    def remember(self, state, action, reward, next_state, done):
        """Store one transition; ``done`` marks ``next_state`` as terminal."""
        self.memory.append((state, action, reward, next_state, done))

    def replay(self, batch_size):
        """Run one gradient step on a random batch drawn from memory.

        Does nothing while fewer than ``batch_size`` transitions are stored.
        After an update, epsilon is decayed but never below ``epsilon_min``.
        """
        if len(self.memory) < batch_size:
            return

        batch = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        states = torch.as_tensor(np.asarray(states, dtype=np.float32), device=self.device)
        next_states = torch.as_tensor(np.asarray(next_states, dtype=np.float32),
                                      device=self.device)
        actions = torch.as_tensor(np.asarray(actions, dtype=np.int64), device=self.device)
        rewards = torch.as_tensor(np.asarray(rewards, dtype=np.float32), device=self.device)
        dones = torch.as_tensor(np.asarray(dones, dtype=np.float32), device=self.device)

        # The bootstrapped target is treated as a constant: gradients must
        # flow only through the prediction Q(s, a), not through the target.
        with torch.no_grad():
            best_next_q = self.model(next_states).max(dim=1).values
            # (1 - done) zeroes the future value for terminal transitions.
            targets = rewards + self.gamma * best_next_q * (1.0 - dones)

        # Select Q(s, a) for the action that was actually taken in each row.
        predictions = self.model(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # One batched update instead of one optimizer step per sample.
        self.optimizer.zero_grad()
        loss = self.criterion(predictions, targets)
        loss.backward()
        self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            # Clamp so the multiplicative decay cannot undershoot the floor.
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

In [4]:
#train
def train(env, agent, batch_size=64, n_games=10000):
    """Train ``agent`` on ``env`` for ``n_games`` episodes, then close ``env``.

    Args:
        env: Environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns
            ``(next_state, reward, terminated, truncated, info)``.
        agent: Object providing ``choose_action``, ``remember``, ``replay``
            and a sized ``memory`` attribute.
        batch_size: Number of transitions sampled for each replay update.
        n_games: Number of episodes to play.
    """
    try:
        for _ in tqdm(range(n_games)):
            state, _ = env.reset()
            terminated = truncated = False
            # Stop on either flag: continuing to step after a truncation
            # would run the episode past the point where it has ended.
            while not (terminated or truncated):
                action = agent.choose_action(state)
                next_state, reward, terminated, truncated, _ = env.step(action)
                # Store only true termination as the "done" flag. A truncated
                # episode was cut off, so its last state is not terminal and
                # its value should still be bootstrapped during replay.
                agent.remember(state, action, reward, next_state, terminated)
                state = next_state
                # Start learning as soon as one full batch is available.
                if len(agent.memory) >= batch_size:
                    agent.replay(batch_size)
    finally:
        # Release the environment even if training fails or is interrupted.
        env.close()

In [5]:
def main(render_mode='human'):
    """Create a CartPole-v1 environment, train a fresh agent on it, return it.

    Args:
        render_mode: Passed through to ``gym.make``. The default ``'human'``
            opens a window and renders every step; pass ``None`` to train
            without rendering.

    Returns:
        The trained ``Agent``, so it is not lost once training finishes.
    """
    env = gym.make('CartPole-v1', render_mode=render_mode)
    agent = Agent()
    train(env, agent)
    return agent


# Guarded so that importing this file does not start a training run.
if __name__ == "__main__":
    agent = main()

 13%|█▎        | 1329/10000 [15:56<1:20:03,  1.81it/s]

## Optional (highly recommended): Atari
Adapt your agent to play an Atari game of your choice.
https://www.gymlibrary.dev/environments/atari/air_raid/