# Homework 2

In this homework, you will implement Deep Q Networks and Asynchronous Advantage Actor-Critic models and use them in two Atari games and in simpler gym environments. You will need to fill in the missing parts of the modules and then run your tests in the notebook.

## 1) Deep Q Networks (50)

As seen in class, DQN has two main features: target networks and a replay buffer. However, additional improvements have been introduced since the first release of DQN:

- [Prioritized Replay Buffer](https://arxiv.org/pdf/1511.05952.pdf)
- [Double Deep Q Networks](https://arxiv.org/pdf/1509.06461.pdf)
- [Dueling Deep Q Networks](https://arxiv.org/pdf/1511.06581.pdf)

After implementing these methods, you can compare the combined algorithm with vanilla DQN. For the comparison you will use two [gym](https://gym.openai.com/) environments:
- [Lunar Lander](https://gym.openai.com/envs/LunarLander-v2/)
- [Pong](https://gym.openai.com/envs/Pong-v0/)



In [None]:
# In order to use modified modules without restarting
%load_ext autoreload
%autoreload 2

In [None]:
# Print the installed PyTorch and gym versions so runs can be reproduced.
import torch; 
from gym.version import VERSION
print(torch.__version__)
print(VERSION)
# Load the line_profiler IPython extension for optional line-level profiling.
%load_ext line_profiler

In [None]:
import gym
import numpy as np
import torch

from blg604ehw2.dqn import episodic_test
from blg604ehw2.dqn import episodic_train

from blg604ehw2.network import Network
from blg604ehw2.network import FcNet
from blg604ehw2.network import DuelingHead
from blg604ehw2.network import SimpleHead

from blg604ehw2.utils import comparison
from blg604ehw2.utils import LoadingBar

### Lunar Lander with DQN (15)

In [None]:
def traindqn(env, agent, args, test_rate=1):
    """Train `agent` on `env` and periodically evaluate it.

    Epsilon is decayed linearly from `args.max_epsilon` to
    `args.min_epsilon` over `args.episode` training episodes. Every
    `test_rate` episodes (starting with episode 0) the agent is evaluated
    on `test_rate` test episodes and the mean test reward is recorded.

    Args:
        env: Environment handed to `episodic_train` / `episodic_test`.
        agent: Agent to train. Its `device` attribute is set to
            `args.device`.
        args: Hyperparameter container. This function reads `episode`,
            `device`, `max_epsilon` and `min_epsilon`; the whole object is
            also forwarded to `episodic_train` / `episodic_test`.
        test_rate: Evaluation period in episodes, and also the number of
            test episodes averaged per evaluation.

    Returns:
        A tuple `(rewards, best_rewards, td_errors, time_step, best_model)`:
        per-episode mean test reward (the latest evaluation is repeated for
        episodes without one), per-episode best reward so far, per-episode
        value returned as `td_error` by `episodic_train`, the last
        `time_step` returned by `episodic_train`, and a snapshot of the
        agent taken at its best evaluation (`None` if no episode ran).
    """
    # Local import keeps this cell self-contained.
    import copy

    bar = LoadingBar(args.episode, "Episode")
    agent.device = args.device
    epsilons = np.linspace(args.max_epsilon, args.min_epsilon, num=args.episode)
    time_step = 0
    rewards = []
    td_errors = []

    best_model = None
    best_rewards = []
    best_reward = -np.inf
    reward = None
    for eps in range(args.episode):
        ### YOUR CODE HERE ###
        time_step, td_error = episodic_train(env, agent, args, epsilons[eps])
        if eps % test_rate == 0:
            # Mean reward over `test_rate` evaluation episodes.
            reward = sum(
                episodic_test(env, agent, args) for _ in range(test_rate)
            ) / test_rate
            if reward >= best_reward:
                best_reward = reward
                # Snapshot the agent: storing a bare reference would always
                # alias the live agent, so the "best" model would silently
                # become whatever the agent looks like after the final
                # episode. NOTE(review): deepcopy also copies whatever the
                # agent owns (e.g. a replay buffer) -- confirm the cost is
                # acceptable for large buffers.
                best_model = copy.deepcopy(agent)
        best_rewards.append(best_reward)
        td_errors.append(td_error)
        rewards.append(reward)
        ###       END      ###
        bar.progress(eps, best_reward)

    bar.success(best_reward)
    return rewards, best_rewards, td_errors, time_step, best_model

### a) Vanilla DQN

In [None]:
from blg604ehw2.dqn import DQN
from blg604ehw2.dqn import ArgsDQN

# Set the environment name; you can try different environments.
envname = "LunarLander-v2"

# Hyperparameters for the training.
# You need to fine-tune some of the hyperparameters!
env = gym.make(envname)
dqn_args = ArgsDQN(
    env_name=envname,                       # Name of the environment
    nstates=env.observation_space.shape,    # Shape of the observation space
    nact=env.action_space.n,                # Number of actions
    buffersize=20000,                       # Size of the replay buffer
    max_epsilon=0.9,                        # Starting value of the epsilon
    min_epsilon=0.1,                        # Convergence value of the epsilon
    target_update_period=50,                # Update period of the target network
    gamma=0.99,                             # Discount rate
    lr=0.001,                               # Learning rate
    # Device name: use the GPU when one is available, otherwise fall back
    # to the CPU so the notebook still runs on machines without CUDA.
    device="cuda" if torch.cuda.is_available() else "cpu",
    batch_size=128,                         # Batch size
    episode=10,                             # Number of episodes for training
    max_eps_len=600,                        # Maximum number of time steps in an episode
)

def dqn_agent():
    """Build a fresh vanilla DQN agent.

    Reads the module-level `env` for the observation size and the number of
    actions, and `dqn_args` for the device, learning rate, replay-buffer
    capacity and target-network update period.
    """
    n_actions = env.action_space.n

    # Value network: fully connected feature extractor followed by a
    # simple head, both using a width of 128.
    valuenet = Network(
        FcNet(env.observation_space.shape[0], 128),
        SimpleHead(n_actions, 128),
    )

    return DQN(
        dev=dqn_args.device,
        valuenet=valuenet,
        nact=n_actions,
        lr=dqn_args.lr,
        buffer_capacity=dqn_args.buffersize,
        target_update_period=dqn_args.target_update_period,
    )

In [None]:
# Train REPEAT independent vanilla-DQN runs; the result tuple returned by
# traindqn for each run is collected in lunar_dqn.
REPEAT = 1
lunar_dqn = []
# To profile training, wrap the loop below in
# `with torch.autograd.profiler.profile(use_cuda=True) as prof:`
for r in range(REPEAT):
    # Rebind the module-level `env` first: dqn_agent() reads it when it
    # sizes the networks.
    env = gym.make(dqn_args.env_name)
    agent = dqn_agent()
    lunar_dqn.append(traindqn(env, agent, dqn_args, test_rate=5))

In [None]:
# `prof` only exists when the training loop was run inside
# `with torch.autograd.profiler.profile(use_cuda=True) as prof:`; that line
# is commented out in the training cell, so guard against a NameError here.
if "prof" in globals():
    # Print explicitly: a notebook only echoes the last expression of a
    # cell, so the bare calls would discard these reports.
    print(prof.table(sort_by="cuda_time_total"))
    print(prof.key_averages())
    # NOTE(review): machine-specific output path -- adjust for your setup.
    prof.export_chrome_trace("/home/cbekar/Desktop/trace")
else:
    print("Profiler was not enabled for the training run; nothing to report.")

In [None]:
    plot_texts = [
    [
        "Episodic Reward",
        "episode",
        "reward"
    ],
    [
        "Episodic Best Reward",
        "episode",
        "reward"
    ],
    [
        "Td Error",
        "episode",
        "td"
    ]
]

In [None]:
# Show the vanilla DQN training curves, labelled "DQN", with the titles and
# axis labels from plot_texts (rendering is done by blg604ehw2.utils.comparison).
comparison((lunar_dqn, "DQN"), texts = plot_texts)

In [None]:
# Remember that due to the stochasticity of the
# environment it may perform differently for each run.
import os

# Pick the run whose final best-so-far reward (slot 1 of the traindqn
# result) is the highest.
best_agent_index = max(range(len(lunar_dqn)), key=lambda i: lunar_dqn[i][1][-1])
# Slot 4 of the traindqn result is the trained agent object itself, not a
# state dict; the historical variable name is kept for compatibility.
best_agent_state_dict = lunar_dqn[best_agent_index][4]
# Use the trained agent directly instead of building a fresh, untrained
# network that was never used.
best_agent = best_agent_state_dict

# Monitor saves the mp4 files under "monitor" folder.
monitor_path = "LunarLander/DQN/" + str(dqn_args.episode) + " episode"
model_path = "monitor/LunarLander/model_state_dict"
episodic_test(agent=best_agent, env=gym.make(dqn_args.env_name), args=dqn_args, monitor_path=monitor_path)
# Make sure the target directory exists before writing the model file.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(best_agent_state_dict, model_path)

In [None]:
# Reload the agent saved by the previous cell and replay it.
monitor_path = "LunarLander/DQN/" + str(dqn_args.episode) + " episode"
# Must match the path used by torch.save in the previous cell; the bare
# "model_state_dict" used before pointed at a file that is never written.
model_path = "monitor/LunarLander/model_state_dict"
# The saved object is the whole agent (traindqn returns the agent itself in
# slot 4), so the loaded value can be used as an agent directly.
best_agent = torch.load(model_path)
episodic_test(agent=best_agent, env=gym.make(dqn_args.env_name), args=dqn_args, monitor_path=monitor_path)

In [None]:
# Release the resources held by the last Lunar Lander training environment.
env.close()

### b) Dueling Double Prioritized DQN (15)

In [None]:
from blg604ehw2.dqn import DuelingDoublePrioritizedDQN
from blg604ehw2.dqn import ArgsDDPQN


# Hyperparameters for the training.
# You need to fine-tune some of the hyperparameters!
envname = "LunarLander-v2"
env = gym.make(envname)
ddpdqn_args = ArgsDDPQN(
    env_name=envname,                       # Name of the environment
    nstates=env.observation_space.shape,    # Shape of the observation space
    nact=env.action_space.n,                # Number of actions
    buffersize=20000,                       # Size of the replay buffer
    max_epsilon=0.9,                        # Starting value of the epsilon
    min_epsilon=0.1,                        # Convergence value of the epsilon
    target_replace_period=50,               # Update period of the target network
    gamma=0.99,                             # Discount rate
    lr=0.001,                               # Learning rate
    # Device name: use the GPU when one is available, otherwise fall back
    # to the CPU so the notebook still runs on machines without CUDA.
    device="cuda" if torch.cuda.is_available() else "cpu",
    batch_size=128,                         # Batch size
    episode=10,                             # Number of episodes for training
    max_eps_len=400,                        # Maximum number of time steps in an episode
)

def ddpdqn_agent():
    """Build a fresh Dueling Double Prioritized DQN agent for Lunar Lander.

    Reads the module-level `env` for the observation size and the number of
    actions, and `ddpdqn_args` for the device, learning rate, replay-buffer
    capacity, target replace period and discount rate.
    """
    n_actions = env.action_space.n

    # Value network: fully connected feature extractor followed by a
    # dueling head, both using a width of 128.
    valuenet = Network(
        FcNet(env.observation_space.shape[0], 128),
        DuelingHead(n_actions, 128),
    )

    return DuelingDoublePrioritizedDQN(
        dev=ddpdqn_args.device,
        valuenet=valuenet,
        nact=n_actions,
        lr=ddpdqn_args.lr,
        buffer_capacity=ddpdqn_args.buffersize,
        target_replace_period=ddpdqn_args.target_replace_period,
        gamma=ddpdqn_args.gamma,
    )

In [None]:
# Train REPEAT independent DDPDQN runs; the result tuple returned by
# traindqn for each run is collected in lunar_ddpdqn.
REPEAT = 1
lunar_ddpdqn = []
for r in range(REPEAT):
    # Rebind the module-level `env` first: ddpdqn_agent() reads it when it
    # sizes the networks.
    env = gym.make(ddpdqn_args.env_name)
    agent = ddpdqn_agent()
    lunar_ddpdqn.append(traindqn(env, agent, ddpdqn_args, test_rate=5))

In [None]:
# Show the DDPDQN runs next to the vanilla DQN runs, using the titles and
# axis labels from plot_texts.
comparison((lunar_ddpdqn, "DDPDQN"), (lunar_dqn, "DQN"), texts = plot_texts)

(Optional) You may also evaluate each of these improvements on its own to see which one helps the most in the Lunar Lander environment.

#### Let's visualize the best agent

In [None]:
# Remember that due to the stochasticity of the
# environment it may perform differently for each run.
import os

# Assuming ddpdqn works better in your environment, as it should.
# Pick the run whose final best-so-far reward (slot 1 of the traindqn
# result) is the highest.
best_agent_index = max(range(len(lunar_ddpdqn)), key=lambda i: lunar_ddpdqn[i][1][-1])
# Slot 4 of the traindqn result is the trained agent object itself, not a
# state dict, so it cannot be fed to load_state_dict(); use it directly, as
# the DQN and Pong cells do. The historical variable name is kept for
# compatibility.
best_agent_state_dict = lunar_ddpdqn[best_agent_index][4]
best_agent = best_agent_state_dict

# Monitor saves the mp4 files under "monitor" folder.
monitor_path = "LunarLander/" + str(ddpdqn_args.episode) + " episode"
# NOTE(review): same file name as the vanilla DQN cell uses, so this save
# overwrites that model -- confirm this is intended.
model_path = "monitor/LunarLander/model_state_dict"
episodic_test(agent=best_agent, env=gym.make(ddpdqn_args.env_name), args=ddpdqn_args, monitor_path=monitor_path)
# Make sure the target directory exists before writing the model file.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(best_agent_state_dict, model_path)

### Pong with Dueling Double Prioritized DQN (20)
- This may take a long time

In [None]:
from blg604ehw2.atari_wrapper import ClipRewardEnv
from blg604ehw2.atari_wrapper import FrameStack
from blg604ehw2.atari_wrapper import EpisodicLifeEnv
from blg604ehw2.atari_wrapper import WarpFrame
from blg604ehw2.atari_wrapper import ScaledFloatFrame

from blg604ehw2.network import Cnn

envname = "PongNoFrameskip-v4" # Should be without frameskips

# Wrapped Atari environment.
# It is important to use these wrappers in order
# to simplify learning. In their Nature paper,
# DeepMind used some of them to achieve those
# results. It is good to check them and see
# what they do.
def pongenv():
    """Create the Pong environment named by `envname` and apply the Atari wrappers."""
    wrapped = gym.make(envname)
    for wrapper in (
        ClipRewardEnv,      # Clip the reward between -1 and 1
        WarpFrame,          # Downsample rgb (210, 160, 3) images to gray images (84, 84)
        EpisodicLifeEnv,    # Terminate the environment after a life is lost
    ):
        wrapped = wrapper(wrapped)
    # Stack consecutive frames as a single state
    return FrameStack(wrapped, k=4)

# Hyperparameters for the training.
# This time parameter tuning is even more important!
# If you have access to a GPU, use it! The device is selected automatically.
env = pongenv()
pong_args = ArgsDDPQN(
    env_name=envname,                   # Name of the environment
    # NOTE(review): the Lunar Lander configs pass `observation_space.shape`
    # here, this one passes the space object -- confirm which one
    # ArgsDDPQN expects.
    nstates=env.observation_space,
    nact=env.action_space.n,            # Number of actions
    buffersize=1000,                    # Size of the replay buffer
    max_epsilon=0.9,                    # Starting value of the epsilon
    min_epsilon=0.1,                    # Convergence value of the epsilon
    target_replace_period=100,          # Update period of the target network
    gamma=0.97,                         # Discount rate
    lr=0.0002,                          # Learning rate
    # Device name: use the GPU when one is available, otherwise fall back
    # to the CPU so the notebook still runs on machines without CUDA.
    device="cuda" if torch.cuda.is_available() else "cpu",
    batch_size=128,                     # Batch size
    episode=10,                         # Number of episodes for training
    max_eps_len=100,                    # Maximum number of time steps in an episode
)

def pong_agent():
    """Build a fresh Dueling Double Prioritized DQN agent for Pong.

    Reads the module-level `env` for the number of actions and `pong_args`
    for the device, learning rate, replay-buffer capacity, target replace
    period and discount rate.
    """
    # Network construction: CNN over 4 input channels (the environment
    # stacks k=4 frames) feeding a dueling head, both with a width of 512.
    feature_net = Cnn(4, 512)
    head_net = DuelingHead(env.action_space.n, 512)
    valuenet = Network(feature_net, head_net)

    # Initialize agent
    return DuelingDoublePrioritizedDQN(
        dev=pong_args.device,
        valuenet=valuenet,
        nact=env.action_space.n,
        lr=pong_args.lr,
        buffer_capacity=pong_args.buffersize,
        target_replace_period=pong_args.target_replace_period,
        # Forward the configured discount rate; it was previously set in
        # pong_args but never handed to the agent, unlike in ddpdqn_agent.
        gamma=pong_args.gamma,
    )

In [None]:
# Train REPEAT independent Pong runs; the result tuple returned by traindqn
# for each run is collected in pong_ddpdqn.
REPEAT = 1 # Assign 1 if you don't want to train more than one
pong_ddpdqn = []
for r in range(REPEAT):
    # Rebind the module-level `env` first: pong_agent() reads it for the
    # number of actions.
    env = pongenv()
    agent = pong_agent()
    pong_ddpdqn.append(traindqn(env, agent, pong_args, test_rate=3))

In [None]:
# Show the Pong training curves, labelled "PONG", with the titles and axis
# labels from plot_texts.
comparison((pong_ddpdqn, "PONG"), texts = plot_texts)

In [None]:
import os

# Pick the run whose final best-so-far reward (slot 1 of the traindqn
# result) is the highest.
best_agent_index = max(range(len(pong_ddpdqn)), key=lambda i: pong_ddpdqn[i][1][-1])
# Slot 4 of the traindqn result is the trained agent object itself, not a
# state dict; the historical variable name is kept for compatibility.
best_agent_state_dict = pong_ddpdqn[best_agent_index][4]
# Use the trained agent directly instead of building a fresh, untrained
# CNN agent that was never used.
best_agent = best_agent_state_dict

# Monitor saves the mp4 files under "monitor" folder.
monitor_path = "Pong/" + str(pong_args.episode) + " episode"
model_path = "monitor/Pong/model_state_dict"
episodic_test(agent=best_agent, env=pongenv(), args=pong_args, monitor_path=monitor_path)
# Make sure the target directory exists before writing the model file.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(best_agent_state_dict, model_path)

## 2) Asynchronous Advantage Actor-Critic (50)
[A3C](https://arxiv.org/abs/1602.01783) is a policy gradient algorithm based on asynchronous updates from parallel agents.
You will be testing your agent in:

- [Bipedal Walker](https://gym.openai.com/envs/BipedalWalker-v2/)
- [Breakout](https://gym.openai.com/envs/Breakout-v0/)

### a) Bipedal Walker with Asynchronous Advantage Actor-Critic (20)
It is important to test your implementation with a simpler environment like BipedalWalker before trying Breakout.
It is highly recommended to check the pseudocode in the paper's appendix.

The implementation works as follows:

    - Create a global agent whose parameters are in shared memory.
    - Create multiple worker processes, each of which performs:
        - Gradient calculation with the transition it observed
        - Update the global agent with the gradients
        - Synchronize with the global agent
    - Create a test process that evaluates the performance of the global agent over the course of the training
    - Run these workers asynchronously
    
   

In [None]:
# In order to use modified modules without restarting
%load_ext autoreload
%autoreload 2

In [None]:
import gym
from collections import namedtuple
import torch.multiprocessing as mp 

from blg604ehw2.network import FcNet
from blg604ehw2.network import ContinuousDistHead
from blg604ehw2.network import Network

from blg604ehw2.a3c import ContinuousA3c
from blg604ehw2.a3c import SharedAdam
from blg604ehw2.a3c import train_worker
from blg604ehw2.a3c import test_worker
from blg604ehw2.a3c import A3C_args


# The Bipedal Walker environment is similar to Lunar Lander:
# the state is a vector of length 24 and there are 4 actions.
envname = "BipedalWalker-v2"

# Logger is a named tuple bundling the shared lists, the shared integer
# and the best model. It has to hold shared objects because it is used
# by many processes.
Logger = namedtuple(
    "Logger",
    ["eps_reward", "best_reward", "best_model", "time_steps", "time"],
)

# Hyperparameters; tuning them is optional but recommended.
a3c_args = A3C_args(
    maxtimestep=100000,     # Number of time steps for training
    maxlen=600,             # Maximum length of an episode
    nstep=20,               # Bootstrapping length (n-step td)
    gamma=0.98,             # Discount rate
    lr=0.0001,              # Learning rate
    beta=0.01,              # Entropy regularization constant
    device="cpu",           # Device
)

# Agent generating function
def a3c_agent():
    """Build a fresh continuous-action A3C agent placed on `a3c_args.device`.

    The sizes are hard-coded: FcNet(24) as the feature network and
    ContinuousDistHead(128, 4) as the head.
    """
    network = Network(FcNet(24), ContinuousDistHead(128, 4))
    new_agent = ContinuousA3c(network)
    new_agent.device = a3c_args.device
    return new_agent

# Environment generating function
# You can use RewardClip wrapper
def walker_env():
    """Return a new, unwrapped gym environment for `envname`."""
    return gym.make(envname)
    

In [None]:
### Main cell for Bipedal Walker ###

# Number of training workers
N_PROCESSES = mp.cpu_count()

# Global agent that will be used for synchronization.
global_agent = a3c_agent()
global_agent.share_memory()         # Make sure it is in the shared memory!

# Shared optimizer, since the optimizer has its own parameters
# they need to be in the shared memory as well.
sharedopt = SharedAdam(global_agent.parameters(), lr=a3c_args.lr)

# Another agent for logging purposes
best_agent = a3c_agent()
best_agent.share_memory()

# Logger
# Manager controls another process (server process) to share
# objects between multiple processes via proxies.
# Please read https://docs.python.org/3.7/library/multiprocessing.html
# for more information.
manager = mp.Manager()
logger = Logger(
    manager.list(),         # eps_reward
    manager.list(),         # best_reward
    best_agent,             # best_model
    manager.list(),         # time_steps
    manager.Value("i", 0)   # time
)
logger.time_steps.append(0)
# Pre-fill both reward logs with one placeholder per training time step.
# Every call on a manager proxy is a round trip to the server process, so a
# single extend() per list replaces 2 * maxtimestep individual appends while
# leaving the lists in exactly the same state.
placeholders = [None] * a3c_args.maxtimestep
logger.eps_reward.extend(placeholders)
logger.best_reward.extend(placeholders)

# Lock is not necessary
lock = mp.Lock()

# Start by creating a test worker
processes = []
# NOTE(review): the trailing `None, False` are passed positionally; their
# meaning is defined by test_worker's signature -- confirm against
# blg604ehw2.a3c.
process = mp.Process(target=test_worker,
                     args=(a3c_args, global_agent, walker_env, a3c_agent, lock, logger,
                           None, False))
# For single-process debugging, call the workers directly instead:
# test_worker(a3c_args, global_agent, walker_env, a3c_agent, N_PROCESSES, logger)
# train_worker(a3c_args, global_agent, sharedopt, walker_env, a3c_agent, 0, N_PROCESSES, logger)
process.start()
processes.append(process)

# Train workers
for t in range(N_PROCESSES):
    process = mp.Process(target=train_worker,
                         args=(a3c_args, global_agent, sharedopt, walker_env, a3c_agent, t, N_PROCESSES, logger))
    process.start()
    processes.append(process)

# Wait until all done
for p in processes:
    p.join()

In [None]:
# Plot the best rewards recorded in the logger against the logged time steps
import matplotlib.pyplot as plt

# Copy the shared proxy lists into plain lists before plotting.
logged_steps = list(logger.time_steps)
logged_best = list(logger.best_reward)

plt.title("A3C Walker Best Rewards")
plt.xlabel("time steps")
plt.ylabel("reward")
plt.plot(logged_steps, logged_best)

### b) Breakout with Asynchronous Advantage Actor-Critic (30)

- It takes a lot of time so be prepared.

In [1]:
# In order to use modified modules without restarting
%load_ext autoreload
%autoreload 2

In [2]:
import gym
import torch
from blg604ehw2.atari_wrapper import ClipRewardEnv
from blg604ehw2.atari_wrapper import FrameStack
from blg604ehw2.atari_wrapper import EpisodicLifeEnv
from blg604ehw2.atari_wrapper import WarpFrame
from blg604ehw2.atari_wrapper import ScaledFloatFrame

from blg604ehw2.network import Cnn
from blg604ehw2.network import DiscreteDistHead
from collections import namedtuple
import torch.multiprocessing as mp 
from blg604ehw2.a3c import DiscreteA3c
from blg604ehw2.a3c import A3C_args
from blg604ehw2.network import Network
from blg604ehw2.a3c import SharedAdam
from blg604ehw2.a3c import train_worker
from blg604ehw2.a3c import test_worker
# Breakout environment
envname = "Breakout-v4"
# Named tuple bundling the shared logging objects handed to the workers.
Logger = namedtuple(
    "Logger",
    ["eps_reward", "best_reward", "best_model", "time_steps", "time"],
)

# Hyperparameters
breakout_args = A3C_args(
    maxtimestep=40000000,
    maxlen=2000,
    nstep=20,
    gamma=0.98,
    lr=0.00003,
    beta=0.01,
    device="cpu",
)

def breakout_agent():
    """Build a fresh discrete-action A3C agent placed on `breakout_args.device`."""
    network = Network(
        Cnn(4, 512),                # 4 input channels because of the FrameStack buffer
        DiscreteDistHead(512, 4),   # 4 outputs because of the environment's action space
    )
    new_agent = DiscreteA3c(network)
    new_agent.device = breakout_args.device
    return new_agent

def breakout_env():
    """Create the Breakout environment named by `envname` and apply the Atari wrappers."""
    raw = gym.make(envname)
    # Clip the reward between -1 and 1
    clipped = ClipRewardEnv(raw)
    # Downsample rgb (210, 160, 3) images to gray images (84, 84), then
    # terminate the environment after a life is lost
    episodic = EpisodicLifeEnv(WarpFrame(clipped))
    # Stack consecutive frames as a single state
    return FrameStack(episodic, k=4)


In [6]:
### Main cell for Breakout ###

N_PROCESSES = mp.cpu_count()

# Global agent placed in shared memory, plus the shared optimizer over its
# parameters.
global_agent = breakout_agent()
global_agent.share_memory()
sharedopt = SharedAdam(global_agent.parameters(), lr=breakout_args.lr)

# Second shared agent, stored in the logger's best_model slot.
best_agent = breakout_agent()
best_agent.share_memory()

# Try to use one manager
manager = mp.Manager()
logger = Logger(
    eps_reward=manager.list(),
    best_reward=manager.list(),
    best_model=best_agent,
    time_steps=manager.list(),
    time=manager.Value("i", 0),
)
logger.time_steps.append(0)
# Pre-fill both reward logs with one placeholder per training process.
padding = [None] * N_PROCESSES
logger.eps_reward.extend(padding)
logger.best_reward.extend(padding)
lock = mp.Lock()

# The test worker is started first, followed by one train worker per process.
test_process = mp.Process(target=test_worker,
                          args=(breakout_args, global_agent, breakout_env, breakout_agent, lock, logger))
# For single-process debugging, call the workers directly instead:
# train_worker(breakout_args, global_agent, sharedopt, breakout_env, breakout_agent, 0, N_PROCESSES, logger)
# test_worker(breakout_args, global_agent, breakout_env, breakout_agent, N_PROCESSES, logger)
test_process.start()
processes = [test_process]

for rank in range(N_PROCESSES):
    worker = mp.Process(target=train_worker,
                        args=(breakout_args, global_agent, sharedopt, breakout_env, breakout_agent, rank, N_PROCESSES, logger))
    worker.start()
    processes.append(worker)

# Wait until every worker has finished
for proc in processes:
    proc.join()

VBox(children=(HTML(value=''), IntProgress(value=0, max=40000000)))

> /home/cbekar/Dropbox/Academic/PhD/Semesters/Spring19/RL/DRL/HWs/BLG604E-HW2-homework/DRL_HW2/blg604ehw2/a3c/train.py(116)test_worker()
-> state = env.reset()
(Pdb) c
train steps 0 test steps 1 max reward 0.0
train steps 0 test steps 2 max reward 0.0
train steps 0 test steps 3 max reward 0.0
train steps 0 test steps 4 max reward 0.0
train steps 0 test steps 5 max reward 0.0
train steps 0 test steps 6 max reward 0.0
train steps 0 test steps 7 max reward 0.0
train steps 0 test steps 8 max reward 0.0
train steps 0 test steps 9 max reward 0.0
train steps 0 test steps 10 max reward 0.0
train steps 0 test steps 11 max reward 0.0
train steps 0 test steps 12 max reward 0.0
train steps 0 test steps 13 max reward 0.0
train steps 0 test steps 14 max reward 0.0
train steps 0 test steps 15 max reward 0.0
train steps 0 test steps 16 max reward 0.0
train steps 0 test steps 17 max reward 0.0
train steps 0 test steps 18 max reward 0.0
train steps 0 test steps 19 max reward 0.0
train steps 0 test steps

train steps 0 test steps 191 max reward 0.0
train steps 0 test steps 192 max reward 0.0
train steps 0 test steps 193 max reward 0.0
train steps 0 test steps 194 max reward 0.0
train steps 0 test steps 195 max reward 0.0
train steps 0 test steps 196 max reward 0.0
train steps 0 test steps 197 max reward 0.0
train steps 0 test steps 198 max reward 0.0
train steps 0 test steps 199 max reward 0.0
train steps 0 test steps 200 max reward 0.0
train steps 0 test steps 201 max reward 0.0
train steps 0 test steps 202 max reward 0.0
train steps 0 test steps 203 max reward 0.0
train steps 0 test steps 204 max reward 0.0
train steps 0 test steps 205 max reward 0.0
train steps 0 test steps 206 max reward 0.0
train steps 0 test steps 207 max reward 0.0
train steps 0 test steps 208 max reward 0.0
train steps 0 test steps 209 max reward 0.0
train steps 0 test steps 210 max reward 0.0
train steps 0 test steps 211 max reward 0.0
train steps 0 test steps 212 max reward 0.0
train steps 0 test steps 213 max

train steps 0 test steps 380 max reward 0.0
train steps 0 test steps 381 max reward 0.0
train steps 0 test steps 382 max reward 0.0
train steps 0 test steps 383 max reward 0.0
train steps 0 test steps 384 max reward 0.0
train steps 0 test steps 385 max reward 0.0
train steps 0 test steps 386 max reward 0.0
train steps 0 test steps 387 max reward 0.0
train steps 0 test steps 388 max reward 0.0
train steps 0 test steps 389 max reward 0.0
train steps 0 test steps 390 max reward 0.0
train steps 0 test steps 391 max reward 0.0
train steps 0 test steps 392 max reward 0.0
train steps 0 test steps 393 max reward 0.0
train steps 0 test steps 394 max reward 0.0
train steps 0 test steps 395 max reward 0.0
train steps 0 test steps 396 max reward 0.0
train steps 0 test steps 397 max reward 0.0
train steps 0 test steps 398 max reward 0.0
train steps 0 test steps 399 max reward 0.0
train steps 0 test steps 400 max reward 0.0
train steps 0 test steps 401 max reward 0.0
train steps 0 test steps 402 max

train steps 0 test steps 591 max reward 0.0
train steps 0 test steps 592 max reward 0.0
train steps 0 test steps 593 max reward 0.0
train steps 0 test steps 594 max reward 0.0
train steps 0 test steps 595 max reward 0.0
train steps 0 test steps 596 max reward 0.0
train steps 0 test steps 597 max reward 0.0
train steps 0 test steps 598 max reward 0.0
train steps 0 test steps 599 max reward 0.0
train steps 0 test steps 600 max reward 0.0
train steps 0 test steps 601 max reward 0.0
train steps 0 test steps 602 max reward 0.0
train steps 0 test steps 603 max reward 0.0
train steps 0 test steps 604 max reward 0.0
train steps 0 test steps 605 max reward 0.0
train steps 0 test steps 606 max reward 0.0
train steps 0 test steps 607 max reward 0.0
train steps 0 test steps 608 max reward 0.0
train steps 0 test steps 609 max reward 0.0
train steps 0 test steps 610 max reward 0.0
train steps 0 test steps 611 max reward 0.0
train steps 0 test steps 612 max reward 0.0
train steps 0 test steps 613 max

train steps 0 test steps 798 max reward 1.0
train steps 0 test steps 799 max reward 1.0
train steps 0 test steps 800 max reward 1.0
train steps 0 test steps 801 max reward 1.0
train steps 0 test steps 802 max reward 1.0
train steps 0 test steps 803 max reward 1.0
train steps 0 test steps 804 max reward 1.0
train steps 0 test steps 805 max reward 1.0
train steps 0 test steps 806 max reward 1.0
train steps 0 test steps 807 max reward 1.0
train steps 0 test steps 808 max reward 1.0
train steps 0 test steps 809 max reward 1.0
train steps 0 test steps 810 max reward 1.0
train steps 0 test steps 811 max reward 1.0
train steps 0 test steps 812 max reward 1.0
train steps 0 test steps 813 max reward 1.0
train steps 0 test steps 814 max reward 1.0
train steps 0 test steps 815 max reward 1.0
train steps 0 test steps 816 max reward 1.0
train steps 0 test steps 817 max reward 1.0
train steps 0 test steps 818 max reward 1.0
train steps 0 test steps 819 max reward 1.0
train steps 0 test steps 820 max

train steps 0 test steps 986 max reward 1.0
train steps 0 test steps 987 max reward 1.0
train steps 0 test steps 988 max reward 1.0
train steps 0 test steps 989 max reward 1.0
train steps 0 test steps 990 max reward 1.0
train steps 0 test steps 991 max reward 1.0
train steps 0 test steps 992 max reward 1.0
train steps 0 test steps 993 max reward 1.0
train steps 0 test steps 994 max reward 1.0
train steps 0 test steps 995 max reward 1.0
train steps 0 test steps 996 max reward 1.0
train steps 0 test steps 997 max reward 1.0
train steps 0 test steps 998 max reward 1.0
train steps 0 test steps 999 max reward 1.0
train steps 0 test steps 1000 max reward 1.0
train steps 0 test steps 1001 max reward 1.0
train steps 0 test steps 1002 max reward 1.0
train steps 0 test steps 1003 max reward 1.0
train steps 0 test steps 1004 max reward 1.0
train steps 0 test steps 1005 max reward 1.0
train steps 0 test steps 1006 max reward 1.0
train steps 0 test steps 1007 max reward 1.0
train steps 0 test steps

train steps 0 test steps 1171 max reward 1.0
train steps 0 test steps 1172 max reward 1.0
train steps 0 test steps 1173 max reward 1.0
train steps 0 test steps 1174 max reward 1.0
train steps 0 test steps 1175 max reward 1.0
train steps 0 test steps 1176 max reward 1.0
train steps 0 test steps 1177 max reward 1.0
train steps 0 test steps 1178 max reward 1.0
train steps 0 test steps 1179 max reward 1.0
train steps 0 test steps 1180 max reward 1.0
train steps 0 test steps 1181 max reward 1.0
train steps 0 test steps 1182 max reward 1.0
train steps 0 test steps 1183 max reward 1.0
train steps 0 test steps 1184 max reward 1.0
train steps 0 test steps 1185 max reward 1.0
train steps 0 test steps 1186 max reward 1.0
train steps 0 test steps 1187 max reward 1.0
train steps 0 test steps 1188 max reward 1.0
train steps 0 test steps 1189 max reward 1.0
train steps 0 test steps 1190 max reward 1.0
train steps 0 test steps 1191 max reward 1.0
train steps 0 test steps 1192 max reward 1.0
train step

train steps 0 test steps 1382 max reward 1.0
train steps 0 test steps 1383 max reward 1.0
train steps 0 test steps 1384 max reward 1.0
train steps 0 test steps 1385 max reward 1.0
train steps 0 test steps 1386 max reward 1.0
train steps 0 test steps 1387 max reward 1.0
train steps 0 test steps 1388 max reward 1.0
train steps 0 test steps 1389 max reward 1.0
train steps 0 test steps 1390 max reward 1.0
train steps 0 test steps 1391 max reward 1.0
train steps 0 test steps 1392 max reward 1.0
train steps 0 test steps 1393 max reward 1.0
train steps 0 test steps 1394 max reward 1.0
train steps 0 test steps 1395 max reward 1.0
train steps 0 test steps 1396 max reward 1.0
train steps 0 test steps 1397 max reward 1.0
train steps 0 test steps 1398 max reward 1.0
train steps 0 test steps 1399 max reward 1.0
train steps 0 test steps 1400 max reward 1.0
train steps 0 test steps 1401 max reward 1.0
train steps 0 test steps 1402 max reward 1.0
train steps 0 test steps 1403 max reward 1.0
train step

train steps 0 test steps 1570 max reward 1.0
train steps 0 test steps 1571 max reward 1.0
train steps 0 test steps 1572 max reward 1.0
train steps 0 test steps 1573 max reward 1.0
train steps 0 test steps 1574 max reward 1.0
train steps 0 test steps 1575 max reward 1.0
train steps 0 test steps 1576 max reward 1.0
train steps 0 test steps 1577 max reward 1.0
train steps 0 test steps 1578 max reward 1.0
train steps 0 test steps 1579 max reward 1.0
train steps 0 test steps 1580 max reward 1.0
train steps 0 test steps 1581 max reward 1.0
train steps 0 test steps 1582 max reward 1.0
train steps 0 test steps 1583 max reward 1.0
train steps 0 test steps 1584 max reward 1.0
train steps 0 test steps 1585 max reward 1.0
train steps 0 test steps 1586 max reward 1.0
train steps 0 test steps 1587 max reward 1.0
train steps 0 test steps 1588 max reward 1.0
train steps 0 test steps 1589 max reward 1.0
train steps 0 test steps 1590 max reward 1.0
train steps 0 test steps 1591 max reward 1.0
train step

train steps 0 test steps 1758 max reward 1.0
train steps 0 test steps 1759 max reward 1.0
train steps 0 test steps 1760 max reward 1.0
train steps 0 test steps 1761 max reward 1.0
train steps 0 test steps 1762 max reward 1.0
train steps 0 test steps 1763 max reward 1.0
train steps 0 test steps 1764 max reward 1.0
train steps 0 test steps 1765 max reward 1.0
train steps 0 test steps 1766 max reward 1.0
train steps 0 test steps 1767 max reward 1.0
train steps 0 test steps 1768 max reward 1.0
train steps 0 test steps 1769 max reward 1.0
train steps 0 test steps 1770 max reward 1.0
train steps 0 test steps 1771 max reward 1.0
train steps 0 test steps 1772 max reward 1.0
train steps 0 test steps 1773 max reward 1.0
train steps 0 test steps 1774 max reward 1.0
train steps 0 test steps 1775 max reward 1.0
train steps 0 test steps 1776 max reward 1.0
train steps 0 test steps 1777 max reward 1.0
train steps 0 test steps 1778 max reward 1.0
train steps 0 test steps 1779 max reward 1.0
train step

train steps 0 test steps 1945 max reward 1.0
train steps 0 test steps 1946 max reward 1.0
train steps 0 test steps 1947 max reward 1.0
train steps 0 test steps 1948 max reward 1.0
train steps 0 test steps 1949 max reward 1.0
train steps 0 test steps 1950 max reward 1.0
train steps 0 test steps 1951 max reward 1.0
train steps 0 test steps 1952 max reward 1.0
train steps 0 test steps 1953 max reward 1.0
train steps 0 test steps 1954 max reward 1.0
train steps 0 test steps 1955 max reward 1.0
train steps 0 test steps 1956 max reward 1.0
train steps 0 test steps 1957 max reward 1.0
train steps 0 test steps 1958 max reward 1.0
train steps 0 test steps 1959 max reward 1.0
train steps 0 test steps 1960 max reward 1.0
train steps 0 test steps 1961 max reward 1.0
train steps 0 test steps 1962 max reward 1.0
train steps 0 test steps 1963 max reward 1.0
train steps 0 test steps 1964 max reward 1.0
train steps 0 test steps 1965 max reward 1.0
train steps 0 test steps 1966 max reward 1.0
train step

train steps 0 test steps 2133 max reward 1.0
train steps 0 test steps 2134 max reward 1.0
train steps 0 test steps 2135 max reward 1.0
train steps 0 test steps 2136 max reward 1.0
train steps 0 test steps 2137 max reward 1.0
train steps 0 test steps 2138 max reward 1.0
train steps 0 test steps 2139 max reward 1.0
train steps 0 test steps 2140 max reward 1.0
train steps 0 test steps 2141 max reward 1.0
train steps 0 test steps 2142 max reward 1.0
train steps 0 test steps 2143 max reward 1.0
train steps 0 test steps 2144 max reward 1.0
train steps 0 test steps 2145 max reward 1.0
train steps 0 test steps 2146 max reward 1.0
train steps 0 test steps 2147 max reward 1.0
train steps 0 test steps 2148 max reward 1.0
train steps 0 test steps 2149 max reward 1.0
train steps 0 test steps 2150 max reward 1.0
train steps 0 test steps 2151 max reward 1.0
train steps 0 test steps 2152 max reward 1.0
train steps 0 test steps 2153 max reward 1.0
train steps 0 test steps 2154 max reward 1.0
train step

train steps 0 test steps 2345 max reward 1.0
train steps 0 test steps 2346 max reward 1.0
train steps 0 test steps 2347 max reward 1.0
train steps 0 test steps 2348 max reward 1.0
train steps 0 test steps 2349 max reward 1.0
train steps 0 test steps 2350 max reward 1.0
train steps 0 test steps 2351 max reward 1.0
train steps 0 test steps 2352 max reward 1.0
train steps 0 test steps 2353 max reward 1.0
train steps 0 test steps 2354 max reward 1.0
train steps 0 test steps 2355 max reward 1.0
train steps 0 test steps 2356 max reward 1.0
train steps 0 test steps 2357 max reward 1.0
train steps 0 test steps 2358 max reward 1.0
train steps 0 test steps 2359 max reward 1.0
train steps 0 test steps 2360 max reward 1.0
train steps 0 test steps 2361 max reward 1.0
train steps 0 test steps 2362 max reward 1.0
train steps 0 test steps 2363 max reward 1.0
train steps 0 test steps 2364 max reward 1.0
train steps 0 test steps 2365 max reward 1.0
train steps 0 test steps 2366 max reward 1.0
train step

train steps 0 test steps 2528 max reward 1.0
train steps 0 test steps 2529 max reward 1.0
train steps 0 test steps 2530 max reward 1.0
train steps 0 test steps 2531 max reward 1.0
train steps 0 test steps 2532 max reward 1.0
train steps 0 test steps 2533 max reward 1.0
train steps 0 test steps 2534 max reward 1.0
train steps 0 test steps 2535 max reward 1.0
train steps 0 test steps 2536 max reward 1.0
train steps 0 test steps 2537 max reward 1.0
train steps 0 test steps 2538 max reward 1.0
train steps 0 test steps 2539 max reward 1.0
train steps 0 test steps 2540 max reward 1.0
train steps 0 test steps 2541 max reward 1.0
train steps 0 test steps 2542 max reward 1.0
train steps 0 test steps 2543 max reward 1.0
train steps 0 test steps 2544 max reward 1.0
train steps 0 test steps 2545 max reward 1.0
train steps 0 test steps 2546 max reward 1.0
train steps 0 test steps 2547 max reward 1.0
train steps 0 test steps 2548 max reward 1.0
train steps 0 test steps 2549 max reward 1.0
train step

train steps 0 test steps 2711 max reward 1.0
train steps 0 test steps 2712 max reward 1.0
train steps 0 test steps 2713 max reward 1.0
train steps 0 test steps 2714 max reward 1.0
train steps 0 test steps 2715 max reward 1.0
train steps 0 test steps 2716 max reward 1.0
train steps 0 test steps 2717 max reward 1.0
train steps 0 test steps 2718 max reward 1.0
train steps 0 test steps 2719 max reward 1.0
train steps 0 test steps 2720 max reward 1.0
train steps 0 test steps 2721 max reward 1.0
train steps 0 test steps 2722 max reward 1.0
train steps 0 test steps 2723 max reward 1.0
train steps 0 test steps 2724 max reward 1.0
train steps 0 test steps 2725 max reward 1.0
train steps 0 test steps 2726 max reward 1.0
train steps 0 test steps 2727 max reward 1.0
train steps 0 test steps 2728 max reward 1.0
train steps 0 test steps 2729 max reward 1.0
train steps 0 test steps 2730 max reward 1.0
train steps 0 test steps 2731 max reward 1.0
train steps 0 test steps 2732 max reward 1.0
train step

train steps 0 test steps 2921 max reward 1.0
train steps 0 test steps 2922 max reward 1.0
train steps 0 test steps 2923 max reward 1.0
train steps 0 test steps 2924 max reward 1.0
train steps 0 test steps 2925 max reward 1.0
train steps 0 test steps 2926 max reward 1.0
train steps 0 test steps 2927 max reward 1.0
train steps 0 test steps 2928 max reward 1.0
train steps 0 test steps 2929 max reward 1.0
train steps 0 test steps 2930 max reward 1.0
train steps 0 test steps 2931 max reward 1.0
train steps 0 test steps 2932 max reward 1.0
train steps 0 test steps 2933 max reward 1.0
train steps 0 test steps 2934 max reward 1.0
train steps 0 test steps 2935 max reward 1.0
train steps 0 test steps 2936 max reward 1.0
train steps 0 test steps 2937 max reward 1.0
train steps 0 test steps 2938 max reward 1.0
train steps 0 test steps 2939 max reward 1.0
train steps 0 test steps 2940 max reward 1.0
train steps 0 test steps 2941 max reward 1.0
train steps 0 test steps 2942 max reward 1.0
train step

train steps 0 test steps 3136 max reward 1.0
train steps 0 test steps 3137 max reward 1.0
train steps 0 test steps 3138 max reward 1.0
train steps 0 test steps 3139 max reward 1.0
train steps 0 test steps 3140 max reward 1.0
train steps 0 test steps 3141 max reward 1.0
train steps 0 test steps 3142 max reward 1.0
train steps 0 test steps 3143 max reward 1.0
train steps 0 test steps 3144 max reward 1.0
train steps 0 test steps 3145 max reward 1.0
train steps 0 test steps 3146 max reward 1.0
train steps 0 test steps 3147 max reward 1.0
train steps 0 test steps 3148 max reward 1.0
train steps 0 test steps 3149 max reward 1.0
train steps 0 test steps 3150 max reward 1.0
train steps 0 test steps 3151 max reward 1.0
train steps 0 test steps 3152 max reward 1.0
train steps 0 test steps 3153 max reward 1.0
train steps 0 test steps 3154 max reward 1.0
train steps 0 test steps 3155 max reward 1.0
train steps 0 test steps 3156 max reward 1.0
train steps 0 test steps 3157 max reward 1.0
train step

train steps 0 test steps 3319 max reward 2.0
train steps 0 test steps 3320 max reward 2.0
train steps 0 test steps 3321 max reward 2.0
train steps 0 test steps 3322 max reward 2.0
train steps 0 test steps 3323 max reward 2.0
train steps 0 test steps 3324 max reward 2.0
train steps 0 test steps 3325 max reward 2.0
train steps 0 test steps 3326 max reward 2.0
train steps 0 test steps 3327 max reward 2.0
train steps 0 test steps 3328 max reward 2.0
train steps 0 test steps 3329 max reward 2.0
train steps 0 test steps 3330 max reward 2.0
train steps 0 test steps 3331 max reward 2.0
train steps 0 test steps 3332 max reward 2.0
train steps 0 test steps 3333 max reward 2.0
train steps 0 test steps 3334 max reward 2.0
train steps 0 test steps 3335 max reward 2.0
train steps 0 test steps 3336 max reward 2.0
train steps 0 test steps 3337 max reward 2.0
train steps 0 test steps 3338 max reward 2.0
train steps 0 test steps 3339 max reward 2.0
train steps 0 test steps 3340 max reward 2.0
train step

train steps 0 test steps 3504 max reward 2.0
train steps 0 test steps 3505 max reward 2.0
train steps 0 test steps 3506 max reward 2.0
train steps 0 test steps 3507 max reward 2.0
train steps 0 test steps 3508 max reward 2.0
train steps 0 test steps 3509 max reward 2.0
train steps 0 test steps 3510 max reward 2.0
train steps 0 test steps 3511 max reward 2.0
train steps 0 test steps 3512 max reward 2.0
train steps 0 test steps 3513 max reward 2.0
train steps 0 test steps 3514 max reward 2.0
train steps 0 test steps 3515 max reward 2.0
train steps 0 test steps 3516 max reward 2.0
train steps 0 test steps 3517 max reward 2.0
train steps 0 test steps 3518 max reward 2.0
train steps 0 test steps 3519 max reward 2.0
train steps 0 test steps 3520 max reward 2.0
train steps 0 test steps 3521 max reward 2.0
train steps 0 test steps 3522 max reward 2.0
train steps 0 test steps 3523 max reward 2.0
train steps 0 test steps 3524 max reward 2.0
train steps 0 test steps 3525 max reward 2.0
train step

BdbQuit: 

In [None]:
# Visualise the best reward recorded by the logger against the logged time steps
import matplotlib.pyplot as plt

plt.title("A3C Breakout Best Rewards")
plt.xlabel("time steps")
plt.ylabel("reward")
# Copy both logger sequences into plain lists before handing them to matplotlib
steps_axis = list(logger.time_steps)
best_rewards = list(logger.best_reward)
plt.plot(steps_axis, best_rewards)

In [None]:
# Save the best model's parameters.
# torch.save does not create missing parent directories, so make sure the
# target folder exists first; exist_ok=True keeps this a no-op when the
# directory is already there.
import os

model_path = "monitor/Breakout/model_state_dict"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(logger.best_model.state_dict(), model_path)