In [1]:
from obstacle_tower_env import ObstacleTowerEnv

import time as systime
import numpy as np
from collections import deque

import torch
# Pick the compute device once: the first CUDA GPU when PyTorch can see one,
# otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [2]:
from Source.Agents import SingleAction_Agent
from Source.Utils  import Tracker, Converter

In [3]:
# Start the Obstacle Tower environment from the local build at the given path.
# NOTE(review): later cells read `env.action_space.nvec` and index
# `env.observation_space[0]`, which presumably relies on retro=False — confirm
# against the obstacle_tower_env documentation before changing these flags.
env = ObstacleTowerEnv('./ObstacleTower/obstacletower', retro=False, realtime_mode=False)

INFO:mlagents_envs:
'ObstacleTower-v2.2' started successfully!
Unity Academy name: ObstacleTower-v2.2
        Number of Brains: 1
        Number of Training Brains : 1
        Reset Parameters :
		starting-floor -> 0.0
		visual-theme -> 1.0
		allowed-rooms -> 2.0
		default-theme -> 0.0
		allowed-floors -> 2.0
		agent-perspective -> 1.0
		lighting-type -> 1.0
		dense-reward -> 1.0
		allowed-modules -> 2.0
		tower-seed -> -1.0
		total-floors -> 100.0
Unity brain name: LearningBrain
        Number of Visual Observations (per agent): 1
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 1
        Vector Action space type: discrete
        Vector Action space size (per agent): [3, 3, 2, 3]
        Vector Action descriptions: Movement Forward/Back, Camera, Jump, Movement Left/Right
INFO:gym_unity:1 agents within environment.


In [4]:
# Collapse the multi-branch action space into one number: the product of the
# per-branch sizes reported by `nvec`, converted to a plain Python scalar.
branch_sizes = env.action_space.nvec.tolist()
action_size = np.prod(branch_sizes).item()

# Shape of the first entry of the observation space, as a plain list.
first_obs_space = env.observation_space[0]
state_size = list(first_obs_space.shape)

# Agent built around one fixed action vector (one entry per action branch).
# NOTE(review): [1, 0, 0, 0] presumably means "move forward, nothing else" —
# confirm against the branch order printed when the environment started.
agent = SingleAction_Agent(
    state_size=state_size,
    action_size=action_size,
    action=[1, 0, 0, 0],
    device=device,
)

In [5]:
# Run configuration.
EPOCHS = 1000        # number of episodes played by the training loop
DISPLAY_EVERY = 100  # episodes per reporting window (full report + newline)
UPDATE_EVERY = 4     # environment steps between agent.step_update() calls

# Statistics collector, constructed with the reporting window size.
tracker = Tracker(DISPLAY_EVERY)

In [6]:
# Wall-clock reference for the current reporting window; it is restarted every
# time a full DISPLAY_EVERY report is printed.
training_clock = systime.time()

for epoch in range(EPOCHS):
    # ---- Episode setup ------------------------------------------------------
    env_info = env.reset()

    # The reset observation is read positionally: [0] goes through the state
    # converter, [2] is kept as the time reference and [3] as the current level.
    state = Converter.ProcessState(env_info[0])
    initial_time = env_info[2]
    curr_level = env_info[3]

    # `time` here is the remaining in-game time, not the stdlib module (which
    # this notebook imports as `systime`).
    last_time = initial_time
    time = initial_time
    done = False
    acc_reward = 0
    count = 0

    # ---- Play until the episode finishes or the clock hits zero -------------
    agent.step_begin()
    while not done and time > 0:
        action = agent.act(state=state)
        env_info = env.step(action)

        # The last element of the step result carries the floor/time readings;
        # element [2] is the done flag and [0][0] the frame for the converter.
        step_details = env_info[-1]
        next_state = Converter.ProcessState(env_info[0][0])
        level = step_details['current_floor']
        time = step_details['time_remaining']
        done = env_info[2]

        if level > curr_level:
            # Level cleared: 1 plus the share of the time reference that was
            # still left on the previous step. The reward must be computed
            # before `initial_time` is re-based to the new reading.
            reward = 1 + max(0, last_time) / initial_time
            curr_level = level
            initial_time = time
        else:
            # Otherwise a small per-step penalty scaled by the time reference.
            reward = -1 / initial_time
        acc_reward += reward

        if done:
            break

        # Episode continues: advance the state and count the step.
        state = next_state
        last_time = time
        count += 1
        if count % UPDATE_EVERY == 0:
            agent.step_update()

    # ---- Bookkeeping after every episode ------------------------------------
    tracker.add(reward=acc_reward, level=curr_level)
    tracker.save_levels(agent.get_folder())
    agent.save()

    # ---- Progress report ----------------------------------------------------
    window_position = (epoch + 1) % DISPLAY_EVERY
    if window_position != 0:
        # Mid-window: extrapolate the time left in this window from the
        # average episode duration so far, and overwrite the same line.
        elapsed = systime.time() - training_clock
        estimated_clock = elapsed * (DISPLAY_EVERY / window_position - 1)
        tracker.display(epoch=epoch, total_epochs=EPOCHS, clock=estimated_clock, end="\r")
    else:
        # Window complete: print the measured duration on its own line and
        # restart the window clock.
        tracker.display(epoch=epoch, total_epochs=EPOCHS, clock=systime.time() - training_clock, end="\n")
        training_clock = systime.time()

    agent.step_end()

[100/1000 | 824.95s] Mean: 0.6319 | Max: 3.6626 | Mean Lvl: 0.4300 | Max Lvl: 2.0.                     
[200/1000 | 973.40s] Mean: 0.6695 | Max: 3.6626 | Mean Lvl: 0.4500 | Max Lvl: 2.0.                     
[300/1000 | 1127.17s] Mean: 0.5912 | Max: 3.6626 | Mean Lvl: 0.4100 | Max Lvl: 2.0.                     
[400/1000 | 1156.59s] Mean: 0.5148 | Max: 3.6642 | Mean Lvl: 0.3700 | Max Lvl: 2.0.                     
[500/1000 | 1161.11s] Mean: 0.3800 | Max: 3.6642 | Mean Lvl: 0.3000 | Max Lvl: 2.0.                     
[600/1000 | 983.22s] Mean: 0.6317 | Max: 3.6642 | Mean Lvl: 0.4300 | Max Lvl: 2.0.                      
[700/1000 | 932.83s] Mean: 0.4373 | Max: 3.6605 | Mean Lvl: 0.3300 | Max Lvl: 2.0.                      
[800/1000 | 911.70s] Mean: 0.5348 | Max: 3.6642 | Mean Lvl: 0.3800 | Max Lvl: 2.0.                     
[900/1000 | 978.69s] Mean: 0.9020 | Max: 3.6626 | Mean Lvl: 0.5700 | Max Lvl: 2.0.                     
[1000/1000 | 872.15s] Mean: 0.4758 | Max: 3.6626 | Mean Lvl

In [7]:
# Teardown: ask PyTorch to release its cached GPU memory, then close the
# environment created earlier in the notebook.
# NOTE(review): env.close() presumably also shuts down the environment process —
# confirm; re-running the training cell afterwards needs a fresh `env`.
torch.cuda.empty_cache()
env.close()