<a href="https://colab.research.google.com/github/bamtak/Reinforcement-learning-with-tensorflow/blob/master/Copy_of_Practical_Session_DQN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Colab setup

In [None]:
# !pip install gym > /dev/null 2>&1

In [2]:
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

In [3]:
!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1

Collecting setuptools
[?25l  Downloading https://files.pythonhosted.org/packages/8e/11/9e10f1cad4518cb307b484c255cae61e97f05b82f6d536932b1714e01b47/setuptools-49.2.0-py3-none-any.whl (789kB)
[K     |████████████████████████████████| 798kB 2.8MB/s 
[31mERROR: datascience 0.10.6 has requirement folium==0.2.1, but you'll have folium 0.8.3 which is incompatible.[0m
[?25hInstalling collected packages: setuptools
  Found existing installation: setuptools 49.1.0
    Uninstalling setuptools-49.1.0:
      Successfully uninstalled setuptools-49.1.0
Successfully installed setuptools-49.2.0


# Deep Q-Learning (DQN)


In DQN, the $Q$-function is parameterized by a neural network of parameters $\theta$. The network takes as input a state $s$ and outputs $Q(s, a, \theta)$ for all actions $a$. 

The network is trained in way that is similar to Fitted Q Iteration. At each time $T$, the agent has observed the transitions $(s_t, a_t, r_t, s_t')_{t=1}^T$, which are stored in a __replay buffer__.

In addition to the network with parameters $\theta$, DQN keeps another network with the same architecture and parameters $\tilde{\theta}$, called __target network__. 
To update the parameters $\theta$, we sample $N$ transitions from the __replay buffer__, we define the loss 

$$
L(\theta) = \sum_{i=1}^N [Q(s_i, a_i, \theta) - (r_i + \gamma\max_{a'}Q(s'_i,a', \tilde{\theta}))]^2
$$

and update 

$$
\theta \gets \theta + \eta \nabla L(\theta).
$$


Every $C$ iterations, the target network is updated as $\tilde{\theta} \gets \theta$. 

At each time $t$, DQN updates the networks as described above, selects an action according to an $\epsilon$-greedy policy, plays the action and stores the new data in the replay buffer.

In [1]:
# Imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import random
from copy import deepcopy

import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor
gymlogger.set_level(40) #error only

from pyvirtualdisplay import Display
from IPython import display as ipythondisplay
from IPython.display import clear_output
from pathlib import Path

import random, os.path, math, glob, csv, base64, itertools, sys
from pprint import pprint

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import io
from IPython.display import HTML

## Step 1: Define the parameters

In [2]:
# Environment
env = gym.make("CartPole-v0")

# Discount factor
GAMMA = 0.99

# Batch size
BATCH_SIZE = 256
# Capacity of the replay buffer
BUFFER_CAPACITY = 10000
# Update target net every ... episodes
UPDATE_TARGET_EVERY = 20

# Initial value of epsilon
EPSILON_START = 1.0
# Parameter to decrease epsilon
DECREASE_EPSILON = 200
# Minimum value of epislon
EPSILON_MIN = 0.05

# Number of training episodes
N_EPISODES = 2000

# Learning rate
LEARNING_RATE = 0.1

## Step 2: Define the replay buffer

In [3]:
class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, state, action, reward, next_state):
        """Saves a transition."""
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = (state, action, reward, next_state)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

In [4]:
# create instance of replay buffer
replay_buffer = ReplayBuffer(BUFFER_CAPACITY)

## Step 3: Define the neural network architecture, objective and optimizer

In [5]:
class Net(nn.Module):
    """
    Basic neural net.
    """
    def __init__(self, obs_size, hidden_size, n_actions):
        super(Net, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions)
        )

    def forward(self, x):
        return self.net(x)

In [6]:
# create network and target network
hidden_size = 128
obs_size = env.observation_space.shape[0]
n_actions = env.action_space.n

q_net = Net(obs_size, hidden_size, n_actions)
target_net = Net(obs_size, hidden_size, n_actions)

# objective and optimizer
objective = nn.MSELoss()
optimizer = optim.Adam(params=q_net.parameters(), lr=LEARNING_RATE)

## Step 4: Implement DQN

In [7]:
#
#  Some useful functions
#

def get_q(states):
    """
    Compute Q function for a list of states
    """
    with torch.no_grad():
        states_v = torch.FloatTensor([states])
        output = q_net.forward(states_v).data.numpy()  # shape (1, len(states), n_actions)
    return output[0, :, :]  # shape (len(states), n_actions)

def eval_dqn(n_sim=5):
    """
    Monte Carlo evaluation of DQN agent
    """
    done = False
    rewards = np.zeros(n_sim)
    copy_env = deepcopy(env)
    for i in range(n_sim):
      state = copy_env.reset()
      while not done:
        action = get_q([state])[0].argmax()
        state, reward, done, _ = copy_env.step(action)
        rewards[i] += reward
    return rewards

In [8]:
def choose_action(state, epsilon):
    """
    TO BE IMPLEMENTED
    Return action according to an epsilon-greedy exploration policy
    """
    not_greedy = np.random.uniform(0, 1) < epsilon
    return env.action_space.sample() if not_greedy else get_q([state])[0].argmax()
    

def update(state, action, reward, next_state, done):
    """
    TO BE COMPLETED
    """
    
    # add data to replay buffer
    if done:
        next_state = None
    replay_buffer.push(state, action, reward, next_state)
    
    if len(replay_buffer) < BATCH_SIZE:
        return np.inf
    
    # get batch
    transitions = replay_buffer.sample(BATCH_SIZE)
    def map_tensor(idx): 
      return [transitions[i][idx] for i in range(BATCH_SIZE)]
    # Compute loss - TO BE IMPLEMENTED!

    batch_states = torch.FloatTensor(map_tensor(0))
    batch_rewards = torch.FloatTensor(map_tensor(2))
    batch_actions = torch.LongTensor(map_tensor(1))
    non_final_next_states = torch.FloatTensor([transitions[i][3] for i in range(BATCH_SIZE) if transitions[i][3] is not None])

    non_final_mask = torch.tensor([(transitions[i][3] is not None) for i in range(BATCH_SIZE)], dtype=bool)
    next_state_values = torch.zeros(BATCH_SIZE)
    next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach()
    targets = torch.zeros(BATCH_SIZE)   # to be computed using batch
    
    values = state_action_values = q_net(batch_states).gather(1, batch_actions.view(-1, 1))
    targets = batch_rewards + (GAMMA * next_state_values)
    loss = objective(values, targets.unsqueeze(1))
    
    # Optimize the model - UNCOMMENT!
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    return loss.data.numpy()

In [9]:
#
# Train
# 

EVAL_EVERY = 50
REWARD_THRESHOLD = 199

def train():
    state = env.reset()
    epsilon = EPSILON_START
    ep = 0
    total_time = 0
    while ep < N_EPISODES:
        action = choose_action(state, epsilon)

        # take action and update replay buffer and networks
        next_state, reward, done, _ = env.step(action)
        loss = update(state, action, reward, next_state, done)

        # update state
        state = next_state

        # end episode if done
        if done:
            state = env.reset()
            ep   += 1
            if ( (ep+1)% EVAL_EVERY == 0):
                rewards = eval_dqn()
                print("episode =", ep+1, ", reward = ", np.mean(rewards), 'Loss = ', loss)
                if np.mean(rewards) >= REWARD_THRESHOLD:
                    break

            # update target network
            if ep % UPDATE_TARGET_EVERY == 0:
                target_net.load_state_dict(q_net.state_dict())
            # decrease epsilon
            epsilon = EPSILON_MIN + (EPSILON_START - EPSILON_MIN) * \
                            np.exp(-1. * ep / DECREASE_EPSILON )    

        total_time += 1

train()
rewards = eval_dqn(20)
print("")
print("mean reward after training = ", np.mean(rewards))

episode = 50 , reward =  2.0 Loss =  0.026085978
episode = 100 , reward =  5.4 Loss =  0.04613961
episode = 150 , reward =  5.2 Loss =  0.07348943
episode = 200 , reward =  15.2 Loss =  0.2793681
episode = 250 , reward =  20.6 Loss =  0.1878831
episode = 300 , reward =  29.6 Loss =  0.48468223
episode = 350 , reward =  13.4 Loss =  0.3715018
episode = 400 , reward =  8.6 Loss =  4.9523187
episode = 450 , reward =  18.4 Loss =  0.3970398
episode = 500 , reward =  5.4 Loss =  3.5037093
episode = 550 , reward =  19.2 Loss =  1.5867462
episode = 600 , reward =  18.6 Loss =  2.7709312
episode = 650 , reward =  18.0 Loss =  13.584453
episode = 700 , reward =  2.2 Loss =  8.054456
episode = 750 , reward =  18.6 Loss =  6.5318966
episode = 800 , reward =  17.2 Loss =  1.5152545
episode = 850 , reward =  3.6 Loss =  5.916047
episode = 900 , reward =  1.8 Loss =  7.8619833
episode = 950 , reward =  14.8 Loss =  7.6232195
episode = 1000 , reward =  17.6 Loss =  4.2334223
episode = 1050 , reward =

## Visualizing the agent

In [10]:
def show_video(directory):
    html = []
    for mp4 in Path(directory).glob("*.mp4"):
        video_b64 = base64.b64encode(mp4.read_bytes())
        html.append('''<video alt="{}" autoplay 
                      loop controls style="height: 400px;">
                      <source src="data:video/mp4;base64,{}" type="video/mp4" />
                 </video>'''.format(mp4, video_b64.decode('ascii')))
    ipythondisplay.display(ipythondisplay.HTML(data="<br>".join(html)))
    
def make_seed(seed):
    np.random.seed(seed=seed)
    torch.manual_seed(seed=seed)
  
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()

<pyvirtualdisplay.display.Display at 0x7fe39c1b66a0>

In [11]:
env = Monitor(env, "./gym-results", force=True, video_callable=lambda episode: True)
for episode in range(1):
    done = False
    state = env.reset()
    while not done:
        action = env.action_space.sample() # MODIFY THIS PART TO COMPUTE THE ACTION WITH DQN
        state, reward, done, info = env.step(action)
env.close()
show_video("./gym-results")