In [1]:
import torch.nn as nn
from model import DQN
from trainer import DQNTrainer
from exploration import EpsilonGreedyExploration, quadratic_decay_schedule
from buffer import ReplayBuffer
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Run-level settings (reused in several places in the config below) ---
env_name = "ALE/MontezumaRevenge-v5"
max_steps = 1_000_000
initial_epsilon = 1.0
batch_size = 4

# Everything in this mapping is unpacked as keyword arguments to DQNTrainer.
params = {
    "env_name": env_name,
    # Model class and its settings.
    # NOTE(review): presumably the trainer instantiates Model with
    # model_params -- confirm in trainer.py.
    "Model": DQN,
    "model_params": {
        "conv_feats": 256,
        "hidden_dim": 128,
        "n_layers": 12,
        "Activation": nn.ReLU,
        "Norm": nn.LayerNorm,
    },
    # Exploration starts at initial_epsilon; the schedule is configured with
    # the same starting value, a final value of 0.3 and max_steps as horizon.
    "exploration": EpsilonGreedyExploration(
        epsilon=initial_epsilon,
        decay_schedule=quadratic_decay_schedule(
            initial_epsilon=initial_epsilon,
            final_epsilon=0.3,
            max_step=max_steps,
        ),
    ),
    # Buffer class and its settings; max_size evaluates to 20_000 for the
    # max_steps value set above.
    "Buffer": ReplayBuffer,
    "buffer_params": {
        "batch_size": batch_size,
        "min_size": 320,
        "max_size": max_steps // 50,
    },
    "discount_rate": 0.9,
    # Loss instance and optimizer class (the optimizer is passed
    # uninstantiated, together with lr).
    "loss_fn": nn.SmoothL1Loss(beta=1.0),
    "Optim": torch.optim.RMSprop,
    "lr": 1e-5,
    # NOTE(review): presumably a small per-step reward offset -- confirm how
    # the trainer applies it.
    "time_step_reward": -3e-3,
    # NOTE(review): presumably the target-network refresh interval -- confirm.
    "network_frozen_steps": 1000,
    "seed": 42,
    "max_steps": max_steps,
    "debug": False,
}

trainer = DQNTrainer(**params)

A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]


In [3]:
# Run training on the trainer configured in the previous cell. The recorded
# output shows a progress bar over 1,000,000 steps at roughly 24 step/s
# (estimated remaining time above 11 hours).
# NOTE(review): the last recorded progress line reports value_loss=4.92e-15
# at about 2% of the run -- confirm this is expected and not a degenerate loss.
trainer.fit()

Training:   0%|          | 0/1000000 [00:00<?, ?step/s]

  if not isinstance(terminated, (bool, np.bool8)):
Training:   2%|▏         | 24694/1000000 [16:42<11:14:14, 24.11step/s, value_loss=4.92e-15]