In [None]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

import gym
from collections import deque

from ignite.engine import Engine, Events
from torch.utils.tensorboard import SummaryWriter
import datetime

from unityagents import UnityEnvironment

In [None]:
from dqn_agent import Agent

# One seed drives every random number generator configured in this cell.
seed = 0
# Data the engine iterates over during an epoch (one item per environment step).
timesteps = list(range(10000))

# Launch the Unity Banana environment and select its first brain.
env = UnityEnvironment(file_name="../Banana_Windows_x86_64/Banana.exe")
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Derive the problem dimensions from the brain and from an initial observation.
action_size = brain.vector_action_space_size
env_info = env.reset(train_mode=True)[brain_name]
state_size = len(env_info.vector_observations[0])

# Seed the global NumPy and PyTorch generators before the agent is built.
np.random.seed(seed)
torch.manual_seed(seed)

# Pass the shared seed instead of a literal so changing `seed` reseeds the agent too.
agent = Agent(state_size=state_size, action_size=action_size, seed=seed)

# Hyper-parameters.
gamma = 0.99
eps_start = 1.      # initial epsilon
eps_end = 0.01      # lower bound for epsilon
eps_decay = 0.995   # multiplicative epsilon decay applied after each episode

# Each run logs to its own timestamped TensorBoard directory.
now = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
writer = SummaryWriter("logs/unity/{}".format(now))

In [None]:
# One Ignite epoch is used as one environment episode, so alias the epoch
# events under episode-oriented names.
EPISODE_STARTED, EPISODE_COMPLETED = Events.EPOCH_STARTED, Events.EPOCH_COMPLETED

def run_single_timestep(engine, timestep):
    """Take one environment step with the agent and let it learn from the result.

    Used as the Ignite process function, so it is called once for every item
    of the data passed to ``trainer.run``.

    Args:
        engine: The Ignite engine. ``engine.state`` supplies ``eps`` and
            ``current_state`` and accumulates ``score``.
        timestep: The current item from the data iterable; stored on
            ``engine.state.timestep`` when the episode ends.
    """
    observation = engine.state.current_state
    epsilon = engine.state.eps

    # Cast to np.int32, see
    # https://github.com/xkiwilabs/DQN-using-PyTorch-and-ML-Agents/issues/2
    chosen_action = agent.act(observation, epsilon).astype(np.int32)

    # Apply the action and unpack the first agent's transition.
    step_info = env.step(chosen_action)[brain_name]
    next_observation = step_info.vector_observations[0]
    step_reward = step_info.rewards[0]
    episode_over = step_info.local_done[0]

    agent.step(observation, chosen_action, step_reward, next_observation, episode_over)

    engine.state.current_state = next_observation
    engine.state.score += step_reward

    if not episode_over:
        return

    # The episode finished: end the current epoch early and remember where.
    engine.terminate_epoch()
    engine.state.timestep = timestep

# Ignite engine that calls run_single_timestep once per item of the data it is run on.
trainer = Engine(run_single_timestep)

@trainer.on(Events.STARTED)
def initialize(engine):
    """Create the run-level bookkeeping on ``engine.state`` when the run starts."""
    run_state = engine.state
    run_state.eps = eps_start                      # epsilon starts at eps_start
    run_state.scores_window = deque(maxlen=100)    # keeps only the latest 100 scores
    run_state.scores = []                          # score of every completed episode


@trainer.on(EPISODE_STARTED)
def reset_environment_state(engine):
    """Reset the Unity environment and the per-episode state for a new episode.

    The environment is reset with ``train_mode=True``, the same mode used for
    the initial reset performed when the environment is created; the previous
    ``train_mode=False`` was inconsistent with that reset.

    Args:
        engine: The Ignite engine. ``engine.state.current_state`` is set to
            the first agent's initial observation and ``engine.state.score``
            is reset to 0.
    """
    env_info = env.reset(train_mode=True)[brain_name]
    engine.state.current_state = env_info.vector_observations[0]
    engine.state.score = 0

@trainer.on(EPISODE_COMPLETED)
def update_model(engine):
    """Record the finished episode's score and decay epsilon for the next one."""
    run_state = engine.state

    # Store the score in both the full history and the sliding window.
    episode_score = run_state.score
    for history in (run_state.scores, run_state.scores_window):
        history.append(episode_score)

    # Multiplicative decay, never going below eps_end.
    decayed_eps = eps_decay * run_state.eps
    run_state.eps = decayed_eps if decayed_eps > eps_end else eps_end
    
@trainer.on(EPISODE_COMPLETED(every=10))
def log_episode_to_tensorboard(engine):
    """Every 10th episode, write the mean window score and mean overall score to TensorBoard."""
    episode = engine.state.epoch
    tracked = (
        ('Average episode score', engine.state.scores_window),
        ('Average environment score', engine.state.scores),
    )
    for tag, history in tracked:
        writer.add_scalar(tag, np.mean(history), episode)

@trainer.on(EPISODE_COMPLETED)
def should_finish_training(engine):
    """Stop training and save the local Q-network once the score target is reached.

    The target is a mean score of at least 13.0 over ``engine.state.scores_window``.
    The check is skipped until the window is full, so that a handful of lucky
    early episodes cannot be reported as a solved environment.
    NOTE(review): assumes the intended criterion is the mean over a full
    window -- confirm against the environment's solve definition.

    Args:
        engine: The Ignite engine whose state holds ``scores_window`` and ``epoch``.
    """
    window = engine.state.scores_window
    if len(window) < window.maxlen:
        return

    # Compute the mean once and reuse it for both the test and the message.
    average_score = np.mean(window)
    if average_score >= 13.0:
        print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(engine.state.epoch, average_score))
        torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
        # Use the public API rather than setting engine.should_terminate directly.
        engine.terminate()

In [None]:
# Run up to 10000 episodes; each episode iterates over `timesteps`.
try:
    trainer.run(timesteps, max_epochs=10000)
finally:
    # Close the TensorBoard writer so pending scalars reach disk even if
    # training is interrupted or raises.
    writer.close()