In [1]:
from obstacle_tower_env import ObstacleTowerEnv

import time as systime
import numpy as np
from collections import deque

import torch
# Pick the compute device once for the whole notebook: the first CUDA GPU when
# PyTorch can see one, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [2]:
from Source.Agents import Random_Agent
from Source.Utils  import Tracker, Converter

In [3]:
# Start the Obstacle Tower build and wrap it as an environment object.
# retro=False is the mode the later cells are written against: they read
# `env.action_space.nvec` and `env.observation_space[0]`.
env = ObstacleTowerEnv(
    './ObstacleTower/obstacletower',
    retro=False,
    realtime_mode=False,
)

INFO:mlagents_envs:
'ObstacleTower-v2.2' started successfully!
Unity Academy name: ObstacleTower-v2.2
        Number of Brains: 1
        Number of Training Brains : 1
        Reset Parameters :
		starting-floor -> 0.0
		visual-theme -> 1.0
		allowed-rooms -> 2.0
		default-theme -> 0.0
		allowed-floors -> 2.0
		agent-perspective -> 1.0
		lighting-type -> 1.0
		dense-reward -> 1.0
		allowed-modules -> 2.0
		tower-seed -> -1.0
		total-floors -> 100.0
Unity brain name: LearningBrain
        Number of Visual Observations (per agent): 1
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 1
        Vector Action space type: discrete
        Vector Action space size (per agent): [3, 3, 2, 3]
        Vector Action descriptions: Movement Forward/Back, Camera, Jump, Movement Left/Right
INFO:gym_unity:1 agents within environment.


In [4]:
# The action space exposes one size per branch through `nvec`; the agent is given
# a single flat action count, i.e. the product of all branch sizes.
branch_sizes = env.action_space.nvec.tolist()
action_size = np.prod(branch_sizes).item()

# Shape of the first component of the observation space, as a plain list.
state_size = list(env.observation_space[0].shape)

agent = Random_Agent(
    state_size=state_size,
    action_size=action_size,
    device=device,
)

In [5]:
# Run configuration for the episode loop below.
# UPDATE_EVERY : agent.step_update() is called once every this many non-terminal steps.
# DISPLAY_EVERY: a permanent progress line is printed once every this many epochs.
# EPOCHS       : total number of episodes (env.reset() calls) to run.
UPDATE_EVERY = 4
DISPLAY_EVERY = 100
EPOCHS = 1000

# Collects per-episode reward and level for reporting.
# NOTE(review): presumably DISPLAY_EVERY sets the tracker's averaging window — confirm in Source.Utils.
tracker = Tracker(DISPLAY_EVERY)

In [6]:
# Run EPOCHS episodes. An episode starts with env.reset() and ends when the
# environment reports `done` or the remaining time reaches zero.
# `training_clock` marks the start of the current DISPLAY_EVERY-epoch window.
training_clock = systime.time()

for epoch in range(EPOCHS):
    # ---- Episode setup ----
    # The reset result is indexed directly: [0] goes through ProcessState,
    # [2] is used as the time budget and [3] as the starting floor.
    env_info = env.reset()
    state = Converter.ProcessState(env_info[0])
    initial_time = env_info[2]
    curr_level = env_info[3]

    last_time = initial_time
    time = initial_time
    done = False
    acc_reward = 0
    count = 0

    # ---- Play until the episode ends or the timer runs out ----
    agent.step_begin()
    while not done and time > 0:
        # Act and advance the environment by one step.
        action = agent.act(state=state)
        env_info = env.step(Converter.OneHot2Action(action[0]))
        next_state = Converter.ProcessState(env_info[0][0])
        level = env_info[-1]['current_floor']
        time = env_info[-1]['time_remaining']
        done = env_info[2]

        if level > curr_level:
            # Floor cleared: bonus of 1 plus the share of the floor's time budget
            # that was still left on the previous step (never negative). The
            # budget is then restarted from the time shown on the new floor.
            reward = 1 + max(0, last_time) / initial_time
            curr_level = level
            initial_time = time
        else:
            # No progress: small per-step penalty scaled by the time budget.
            reward = -1 / initial_time
        acc_reward += reward

        # Terminal step: nothing is stored or updated. The commented-out buffer
        # code this loop carried would have recorded the transition here with
        # reward=-1 (and with `reward` on non-terminal steps).
        if done:
            break

        state, last_time = next_state, time
        count += 1
        if count % UPDATE_EVERY == 0:
            agent.step_update()

    # ---- Episode bookkeeping (written out after every episode) ----
    tracker.add(reward=acc_reward, level=curr_level)
    tracker.save_levels(agent.get_folder())
    agent.save()

    # ---- Progress display ----
    epochs_into_window = (epoch + 1) % DISPLAY_EVERY
    if epochs_into_window != 0:
        # Mid-window: show an estimate of the time left in this window, i.e.
        # elapsed time scaled by (remaining epochs / completed epochs).
        # The line ends in "\r" so the next display call overwrites it.
        remaining_ratio = DISPLAY_EVERY / epochs_into_window - 1
        estimated_clock = (systime.time() - training_clock) * remaining_ratio
        tracker.display(epoch=epoch, total_epochs=EPOCHS, clock=estimated_clock, end="\r")
    else:
        # Window complete: print the real elapsed time on its own line and
        # restart the window clock.
        tracker.display(epoch=epoch, total_epochs=EPOCHS, clock=systime.time() - training_clock, end="\n")
        training_clock = systime.time()

    agent.step_end()

[100/1000 | 765.49s] Mean: -0.0644 | Max: 1.5260 | Mean Lvl: 0.1100 | Max Lvl: 1.0.                     
[200/1000 | 771.76s] Mean: -0.0752 | Max: 1.5920 | Mean Lvl: 0.1050 | Max Lvl: 1.0.                     
[300/1000 | 773.74s] Mean: -0.0254 | Max: 1.6260 | Mean Lvl: 0.1167 | Max Lvl: 1.0.                     
[400/1000 | 779.31s] Mean: -0.0541 | Max: 1.4240 | Mean Lvl: 0.1175 | Max Lvl: 1.0.                     
[500/1000 | 779.72s] Mean: -0.0170 | Max: 1.3700 | Mean Lvl: 0.1240 | Max Lvl: 1.0.                     
[600/1000 | 779.98s] Mean: -0.0399 | Max: 1.4560 | Mean Lvl: 0.1250 | Max Lvl: 1.0.                     
[700/1000 | 785.29s] Mean: -0.0004 | Max: 1.5420 | Mean Lvl: 0.1314 | Max Lvl: 1.0.                     
[800/1000 | 774.72s] Mean: -0.0544 | Max: 1.5960 | Mean Lvl: 0.1275 | Max Lvl: 1.0.                     
[900/1000 | 786.92s] Mean: 0.0605 | Max: 1.4880 | Mean Lvl: 0.1378 | Max Lvl: 1.0.                      
[1000/1000 | 768.15s] Mean: -0.0810 | Max: 1.5280 | Mea

In [7]:
# Teardown once the run is finished.
# Ask PyTorch to release the CUDA memory it is caching.
torch.cuda.empty_cache()
# Close the environment created earlier in the notebook.
env.close()