In [1]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

import gym
from collections import deque

from ignite.engine import Engine, Events
from torch.utils.tensorboard import SummaryWriter
import datetime

In [3]:
from dqn_agent import Agent

# Single source of truth for every seed used in this notebook.
seed = 0
# Iterable handed to the ignite engine as its "data": each item drives one
# call of the process function, so its length caps the steps per epoch.
timesteps = list(range(10000))

env = gym.make('LunarLander-v2')
env.seed(seed)
torch.manual_seed(seed)
# Also seed NumPy's global RNG in case any component draws from it.
np.random.seed(seed)

# Pass the shared seed instead of a second hard-coded literal so that changing
# `seed` above really changes every seeded component.
agent = Agent(state_size=8, action_size=4, seed=seed)

# NOTE(review): `gamma` is not referenced in the visible cells; presumably the
# agent keeps its own discount factor -- confirm before relying on this value.
gamma = 0.99
# Epsilon-greedy schedule: start value, floor, and per-episode decay factor.
eps_start = 1.
eps_end = 0.01
eps_decay = 0.995

# Timestamped log directory so each run gets its own TensorBoard folder.
now = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
writer = SummaryWriter("logs/ddqn/{}".format(now))

In [4]:
# In this notebook one ignite epoch is used as one environment episode, so
# expose the epoch events under episode-flavoured names.
EPISODE_STARTED, EPISODE_COMPLETED = Events.EPOCH_STARTED, Events.EPOCH_COMPLETED

def run_single_timestep(engine, timestep):
    """Advance the environment by one step and feed the transition to the agent.

    Args:
        engine: the ignite engine; its ``state`` carries ``eps``,
            ``current_state`` and the running ``score`` of the episode.
        timestep: the item of the engine's data iterable for this step
            (the step index within the current episode).

    Side effects:
        Updates ``engine.state.current_state`` and ``engine.state.score``.
        When the environment reports the episode as done, ends the current
        epoch and records the final step index in ``engine.state.timestep``.
    """
    run_state = engine.state
    observation = run_state.current_state

    chosen_action = agent.act(observation, run_state.eps)
    following_observation, step_reward, episode_over, _info = env.step(chosen_action)
    agent.step(observation, chosen_action, step_reward, following_observation, episode_over)

    run_state.current_state = following_observation
    run_state.score += step_reward

    # Nothing more to do while the episode is still running.
    if not episode_over:
        return

    engine.terminate_epoch()
    run_state.timestep = timestep


trainer = Engine(run_single_timestep)

@trainer.on(Events.STARTED)
def initialize(engine):
    """Prepare the bookkeeping stored on the engine state before training.

    Sets the starting exploration rate, an empty list holding every episode
    score, and a bounded window holding the 100 most recent episode scores.
    """
    run_state = engine.state
    run_state.eps = eps_start
    run_state.scores_window = deque(maxlen=100)
    run_state.scores = []


@trainer.on(EPISODE_STARTED)
def reset_environment_state(engine):
    """Begin a new episode: zero the running score and reset the environment."""
    run_state = engine.state
    run_state.score = 0
    run_state.current_state = env.reset()

@trainer.on(EPISODE_COMPLETED)
def update_model(engine):
    """Record the finished episode's score and decay the exploration rate.

    The score is appended to both the full history and the recent-scores
    window; epsilon is multiplied by ``eps_decay`` but never drops below
    ``eps_end``.
    """
    run_state = engine.state

    episode_score = run_state.score
    for history in (run_state.scores, run_state.scores_window):
        history.append(episode_score)

    # Multiplicative decay, clamped at the configured floor.
    decayed_eps = eps_decay * run_state.eps
    run_state.eps = decayed_eps if decayed_eps > eps_end else eps_end
    
@trainer.on(EPISODE_COMPLETED(every=10))
def log_episode_to_tensorboard(engine):
    """Every 10th episode, write the mean scores to TensorBoard.

    Two scalars are logged against the current episode number: the mean of
    the recent-scores window and the mean of all scores recorded so far.
    """
    episode_index = engine.state.epoch
    series = (
        ('Average episode score', engine.state.scores_window),
        ('Average environment score', engine.state.scores),
    )
    for tag, values in series:
        writer.add_scalar(tag, np.mean(values), episode_index)

@trainer.on(EPISODE_COMPLETED)
def should_finish_training(engine):
    """Stop training once the recent-score average reaches the solve threshold.

    The check is only made when the recent-scores window is full, so the
    average really covers the whole window and is never taken over an empty
    or partially filled one. When the average is at least 200.0 the local
    Q-network weights are saved to ``checkpoint.pth`` and the engine is asked
    to terminate.
    """
    window = engine.state.scores_window

    # Skip until the window is full (or non-empty, if it is unbounded):
    # np.mean of an empty window is NaN, and a short window could declare
    # success from a handful of lucky episodes.
    if len(window) < (window.maxlen or 1):
        return

    # Compute the average once and reuse it for the check and the message.
    average_score = np.mean(window)
    if average_score >= 200.0:
        print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(engine.state.epoch, average_score))
        torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
        # Use the engine's public API rather than setting the flag by hand.
        engine.terminate()

In [5]:
# Run training. The writer is closed in a `finally` so buffered TensorBoard
# events are written out even if the run is interrupted or raises.
try:
    final_state = trainer.run(timesteps, max_epochs=10000)
finally:
    writer.close()

# Keep the engine's final State as the cell's displayed value.
final_state


Environment solved in 663 episodes!	Average Score: 200.32


State:
	iteration: 377906
	epoch: 663
	epoch_length: 10000
	max_epochs: 10000
	output: <class 'NoneType'>
	batch: 371
	metrics: <class 'dict'>
	dataloader: <class 'list'>
	seed: <class 'NoneType'>
	times: <class 'dict'>
	scores: <class 'list'>
	scores_window: <class 'collections.deque'>
	eps: 0.036033175291307735
	current_state: <class 'numpy.ndarray'>
	score: 242.1907799722907
	timestep: 371