In [1]:
import random
import time as systime
from collections import deque

import numpy as np
import torch
from obstacle_tower_env import ObstacleTowerEnv
# Use the first CUDA device when PyTorch reports CUDA as available,
# otherwise fall back to the CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

In [2]:
from Source.Agents import DQN_Agent
from Source.Buffer import Buffer
from Source.Utils  import Tracker, Converter

In [3]:
# --- Run configuration ---------------------------------------------------
UPDATE_EVERY = 4        # non-terminal env steps between agent.step_update() calls
DISPLAY_EVERY = 100     # epochs per reporting window (also passed to Tracker)
EPOCHS = 5000           # number of training episodes to run
PREFIX = ""             # forwarded as `prefix` to agent.get_folder() / agent.save()
SUFFIX = "Model2"       # forwarded as `suffix` to agent.get_folder() / agent.save()
SEED = 0                # seed applied to the Python, NumPy and PyTorch RNGs below

# Seed every RNG the notebook has direct access to so that a
# Restart Kernel -> Run All draws the same random numbers on the Python side.
# NOTE(review): the environment itself is not seeded here (its startup log
# reports tower-seed -> -1.0), so runs are still not fully deterministic.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

tracker = Tracker(DISPLAY_EVERY)

----

In [4]:
# Launch the Obstacle Tower build located at the given relative path.
# The training loop below indexes the reset observation as a tuple
# (env_info[0], env_info[2], env_info[3]) with retro=False.
env = ObstacleTowerEnv(
    './ObstacleTower/obstacletower',
    retro=False,
    realtime_mode=False,
)

INFO:mlagents_envs:
'ObstacleTower-v2.2' started successfully!
Unity Academy name: ObstacleTower-v2.2
        Number of Brains: 1
        Number of Training Brains : 1
        Reset Parameters :
		starting-floor -> 0.0
		visual-theme -> 1.0
		allowed-rooms -> 2.0
		default-theme -> 0.0
		allowed-floors -> 2.0
		agent-perspective -> 1.0
		lighting-type -> 1.0
		dense-reward -> 1.0
		allowed-modules -> 2.0
		tower-seed -> -1.0
		total-floors -> 100.0
Unity brain name: LearningBrain
        Number of Visual Observations (per agent): 1
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 1
        Vector Action space type: discrete
        Vector Action space size (per agent): [3, 3, 2, 3]
        Vector Action descriptions: Movement Forward/Back, Camera, Jump, Movement Left/Right
INFO:gym_unity:1 agents within environment.


In [5]:
# Number of flattened actions: the product of the per-branch sizes reported
# by the action space (the environment log lists them as [3, 3, 2, 3]).
branch_sizes = env.action_space.nvec.tolist()
action_size = np.prod(branch_sizes).item()

# Shape of the first component of the observation space, as a plain list.
state_size = list(env.observation_space[0].shape)

agent = DQN_Agent(
    state_size=state_size,
    action_size=action_size,
    model_name="DQN_2",
    device=device,
    n_iter=1,
    learning_rate=2e-4,
)

# NOTE(review): buffer_size is passed as the float 1e4; presumably Buffer
# converts or tolerates it -- confirm against Source.Buffer.
buffer = Buffer(buffer_size=1e4, batch_size=64)

In [6]:
# Wall-clock reference for the current DISPLAY_EVERY-epoch reporting window.
training_clock = systime.time()

for epoch in range(EPOCHS):
    # ---- Episode setup --------------------------------------------------
    env_info = env.reset()

    # The reset observation is indexed positionally: [0] is fed to the state
    # converter, [2] is used as the time budget, [3] as the current floor.
    state = Converter.ProcessState(env_info[0])
    initial_time = env_info[2]
    curr_level = env_info[3]

    # Per-episode bookkeeping.
    last_time = initial_time
    time = initial_time
    done = False
    acc_reward = 0
    count = 0

    # ---- Play until the env reports done or the timer reaches zero ------
    agent.step_begin()
    while not done and time > 0:
        # Pick an action, apply it, and read back the step result.
        action = agent.act(state=state)
        env_info = env.step(Converter.OneHot2Action(action[0]))
        next_state = Converter.ProcessState(env_info[0][0])
        level = env_info[-1]['current_floor']
        time = env_info[-1]['time_remaining']
        done = env_info[2]

        # Shaped reward: a bonus scaled by the time that was left when a new
        # floor is reached, otherwise a small per-step penalty.
        if level > curr_level:
            # The bonus must be computed with the *old* time budget before
            # the budget is reset to the new floor's remaining time.
            reward = 1 + max(0, last_time) / initial_time
            curr_level = level
            initial_time = time
        else:
            reward = -1 / initial_time
        acc_reward += reward

        # Terminal transitions are stored with a fixed reward of -1.
        # NOTE(review): acc_reward above still accumulates the shaped reward
        # on the terminal step, so the tracked return differs from what the
        # buffer stores -- confirm this is intended.
        stored_reward = -1 if done else reward
        buffer.add(state=state, action=action, reward=stored_reward, next_state=next_state, done=done)

        if not done:
            # Advance to the next step and learn every UPDATE_EVERY steps.
            state = next_state
            last_time = time
            count += 1

            if count % UPDATE_EVERY == 0:
                agent.step_update()

    # ---- Episode bookkeeping and checkpointing (every epoch) ------------
    tracker.add(reward=acc_reward, level=curr_level)
    tracker.save_levels(agent.get_folder(prefix=PREFIX, suffix=SUFFIX))
    agent.save(prefix=PREFIX, suffix=SUFFIX)

    # ---- Progress display -----------------------------------------------
    window_pos = (epoch + 1) % DISPLAY_EVERY
    if window_pos == 0:
        # End of a reporting window: print a persistent line with the real
        # elapsed time and restart the window clock.
        tracker.display(epoch=epoch, total_epochs=EPOCHS, clock=systime.time() - training_clock, end="\n")
        training_clock = systime.time()
    else:
        # Mid-window: show a linear estimate of the time left in this window
        # (elapsed per finished epoch times the epochs still to go) and
        # overwrite the same output line.
        elapsed = systime.time() - training_clock
        estimated_clock = elapsed * (DISPLAY_EVERY / window_pos - 1)
        tracker.display(epoch=epoch, total_epochs=EPOCHS, clock=estimated_clock, end="\r")

    agent.step_end()

[100/5000 | 780.45s] Mean: -0.0096 | Max: 1.5117 | Mean Lvl: 0.1600 | Max Lvl: 1.0.                     
[200/5000 | 800.85s] Mean: 0.0234 | Max: 1.4720 | Mean Lvl: 0.1650 | Max Lvl: 1.0.                      
[300/5000 | 793.29s] Mean: -0.0511 | Max: 1.4540 | Mean Lvl: 0.1500 | Max Lvl: 1.0.                     
[400/5000 | 806.34s] Mean: 0.0093 | Max: 1.4340 | Mean Lvl: 0.1550 | Max Lvl: 1.0.                      
[500/5000 | 838.15s] Mean: 0.0147 | Max: 1.5660 | Mean Lvl: 0.1560 | Max Lvl: 1.0.                     
[600/5000 | 856.77s] Mean: 0.0211 | Max: 1.6280 | Mean Lvl: 0.1583 | Max Lvl: 1.0.                      
[700/5000 | 870.23s] Mean: -0.0188 | Max: 1.4080 | Mean Lvl: 0.1571 | Max Lvl: 1.0.                     
[800/5000 | 889.16s] Mean: 0.1047 | Max: 1.5220 | Mean Lvl: 0.1663 | Max Lvl: 1.0.                      
[900/5000 | 906.18s] Mean: 0.0260 | Max: 1.5040 | Mean Lvl: 0.1667 | Max Lvl: 1.0.                      
[1000/5000 | 902.13s] Mean: -0.0835 | Max: 1.4300 | Mean

In [7]:
# Teardown: ask PyTorch to release its cached, currently unused GPU memory,
# then close the environment created earlier in the notebook.
torch.cuda.empty_cache()
env.close()