In [1]:
from obstacle_tower_env import ObstacleTowerEnv

import time as systime
import numpy as np
from collections import deque

import torch
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
from Source.Agents import PPO_Agent
from Source.Utils  import Tracker, Converter

In [3]:
UPDATE_EVERY = 4
DISPLAY_EVERY = 100
EPOCHS = 2500
PREFIX = "PPO"
SUFFIX = "_constant_discount_DDQN_RewardCut"
MODEL_NAME = "PPO"

tracker = Tracker(DISPLAY_EVERY)

----

In [15]:
env = ObstacleTowerEnv('./ObstacleTower/obstacletower', retro=False, realtime_mode=False)

INFO:mlagents_envs:
'ObstacleTower-v2.2' started successfully!
Unity Academy name: ObstacleTower-v2.2
        Number of Brains: 1
        Number of Training Brains : 1
        Reset Parameters :
		starting-floor -> 0.0
		visual-theme -> 1.0
		allowed-rooms -> 2.0
		default-theme -> 0.0
		allowed-floors -> 2.0
		agent-perspective -> 1.0
		lighting-type -> 1.0
		dense-reward -> 1.0
		allowed-modules -> 2.0
		tower-seed -> -1.0
		total-floors -> 100.0
Unity brain name: LearningBrain
        Number of Visual Observations (per agent): 1
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 1
        Vector Action space type: discrete
        Vector Action space size (per agent): [3, 3, 2, 3]
        Vector Action descriptions: Movement Forward/Back, Camera, Jump, Movement Left/Right
INFO:gym_unity:1 agents within environment.


In [5]:
action_size = 6 #np.prod(env.action_space.nvec.tolist()).item()
state_size  = list(env.observation_space[0].shape)

agent   = PPO_Agent(state_size=state_size, action_size=action_size, model_name=MODEL_NAME, 
                    device=device, gradient_steps=16, learning_rate=2e-4, epsilon=0.2) 


tracker.load_levels(agent.get_folder(prefix=PREFIX, suffix=SUFFIX))
agent.load_weights(prefix=PREFIX, suffix=SUFFIX)

In [6]:
training_clock = systime.time()
stop_count = 0

for epoch in range(len(tracker.levels)):
    agent.step_begin()
    agent.step_end(None, None, None, None, None, train=False)

for epoch in range(len(tracker.levels), EPOCHS):
    env_info = env.reset()
    
    log_probs = []
    values    = []
    states    = []
    actions   = []
    rewards   = []
    masks     = []
    entropy   = 0
    
    # extract data from environment
    state = Converter.ProcessState(env_info[0])
    initial_time, curr_level = env_info[2], env_info[3]

    # define parameters
    last_time, time, done = initial_time, initial_time, False
    
    # Play a level
    count = 0
    agent.step_begin()
    
    while not done and time > 0:
        # Act
        distribution, value = agent.act(state=state)
        action = distribution.sample().cpu().numpy()
        action_id = action.argmax()
        onehot_action = [1 if x == action_id else 0 for x in range(action_size)]
        
        env_info = env.step(Converter.OneHot2Action6(onehot_action))
        next_state = Converter.ProcessState(env_info[0][0])
        level, time, done = env_info[-1]['current_floor'], env_info[-1]['time_remaining'], env_info[2]   
        
        log_prob = distribution.log_prob(torch.tensor(action, dtype=torch.float).to(device))
        entropy += distribution.entropy().mean().cpu()
        
        reward = -1 / initial_time
        if level > curr_level:
            reward = 1 + max(0,last_time) / initial_time
            curr_level, initial_time = level, time
        if done or time <= 0:
            reward = -1
            
        
        states.append(state)
        actions.append(action)
        log_probs.append(log_prob.cpu().detach().numpy())
        values.append(value.cpu().detach().numpy())
        rewards.append([reward])
        masks.append(1-done)
        
        # check if the simulation is done
        state, last_time = next_state, time
        count += 1

        if count % UPDATE_EVERY == 0:
            agent.step_update()
        
    
    tracker.add(reward=np.sum(rewards), level=curr_level)
    tracker.save_levels(agent.get_folder(prefix=PREFIX, suffix=SUFFIX))
    agent.save(prefix=PREFIX, suffix=SUFFIX)
    
    if curr_level == 0:
        stop_count += 1
    else:
        stop_count = 0
    if stop_count == 500:
        break
    
    # Display step info
    if (epoch+1) % DISPLAY_EVERY == 0:
        tracker.display(epoch=epoch, total_epochs=EPOCHS, clock=systime.time()-training_clock, end="\n")
        training_clock = systime.time()
    else:
        estimated_clock = (systime.time()-training_clock) * (DISPLAY_EVERY / ( (epoch+1) % DISPLAY_EVERY) - 1)
        tracker.display(epoch=epoch, total_epochs=EPOCHS, clock=estimated_clock, end="\r")
        
    _, next_value = agent.act(state)
    
    log_probs = np.array(log_probs)[:,0,:]
    states    = np.array(states)[:,0,:]
    actions   = np.array(actions)[:,0,:]
    rewards   = np.array(rewards)
    values    = np.array(values)[:,0,:]
    
    returns = PPO_Agent.normalize_rewards(next_value.cpu().detach().numpy(), values, rewards, masks)
    advantages = returns-values
        
    agent.step_end(log_probs, states, actions, np.array(returns), advantages)

[100/2500 | 1234.99s] Mean: -1.0161 | Max: 0.6103 | Mean Lvl: 0.1400 | Max Lvl: 1.0.                     
[200/2500 | 1172.56s] Mean: -1.0417 | Max: 1.1711 | Mean Lvl: 0.1200 | Max Lvl: 2.0.                     
[300/2500 | 1112.77s] Mean: -1.0960 | Max: 0.6803 | Mean Lvl: 0.0800 | Max Lvl: 1.0.                     
[400/2500 | 1181.21s] Mean: -0.7673 | Max: 2.4124 | Mean Lvl: 0.3000 | Max Lvl: 2.0.                     
[500/2500 | 1119.63s] Mean: -1.1248 | Max: 0.3963 | Mean Lvl: 0.0600 | Max Lvl: 1.0.                     
[600/2500 | 1097.64s] Mean: -1.0996 | Max: 0.3723 | Mean Lvl: 0.0800 | Max Lvl: 1.0.                     
[700/2500 | 1115.85s] Mean: -0.9747 | Max: 0.5443 | Mean Lvl: 0.1700 | Max Lvl: 1.0.                     
[800/2500 | 1146.73s] Mean: -0.7283 | Max: 1.3598 | Mean Lvl: 0.3100 | Max Lvl: 2.0.                     
[900/2500 | 1188.20s] Mean: -0.2377 | Max: 2.6628 | Mean Lvl: 0.5000 | Max Lvl: 2.0.                     
[1000/2500 | 1149.93s] Mean: -0.5485 | Max: 2.

In [None]:
torch.cuda.empty_cache()
env.close()

In [14]:
env.close()

In [17]:
for epoch in range(len(tracker.levels), 7000):
    env_info = env.reset()
    
    log_probs = []
    values    = []
    states    = []
    actions   = []
    rewards   = []
    masks     = []
    entropy   = 0
    
    # extract data from environment
    state = Converter.ProcessState(env_info[0])
    initial_time, curr_level = env_info[2], env_info[3]

    # define parameters
    last_time, time, done = initial_time, initial_time, False
    
    # Play a level
    count = 0
    agent.step_begin()
    
    while not done and time > 0:
        # Act
        distribution, value = agent.act(state=state)
        action = distribution.sample().cpu().numpy()
        action_id = action.argmax()
        onehot_action = [1 if x == action_id else 0 for x in range(action_size)]
        
        env_info = env.step(Converter.OneHot2Action6(onehot_action))
        next_state = Converter.ProcessState(env_info[0][0])
        level, time, done = env_info[-1]['current_floor'], env_info[-1]['time_remaining'], env_info[2]   
        
        log_prob = distribution.log_prob(torch.tensor(action, dtype=torch.float).to(device))
        entropy += distribution.entropy().mean().cpu()
        
        reward = -1 / initial_time
        if level > curr_level:
            reward = 1 + max(0,last_time) / initial_time
            curr_level, initial_time = level, time
        if done or time <= 0:
            reward = -1
            
        
        states.append(state)
        actions.append(action)
        log_probs.append(log_prob.cpu().detach().numpy())
        values.append(value.cpu().detach().numpy())
        rewards.append([reward])
        masks.append(1-done)
        
        # check if the simulation is done
        state, last_time = next_state, time
        count += 1

        if count % UPDATE_EVERY == 0:
            agent.step_update()
        
    
    tracker.add(reward=np.sum(rewards), level=curr_level)
    tracker.save_levels(agent.get_folder(prefix=PREFIX, suffix=SUFFIX))
    agent.save(prefix=PREFIX, suffix=SUFFIX)
    
    if curr_level == 0:
        stop_count += 1
    else:
        stop_count = 0
    if stop_count == 500:
        break
    
    # Display step info
    if (epoch+1) % DISPLAY_EVERY == 0:
        tracker.display(epoch=epoch, total_epochs=EPOCHS, clock=systime.time()-training_clock, end="\n")
        training_clock = systime.time()
    else:
        estimated_clock = (systime.time()-training_clock) * (DISPLAY_EVERY / ( (epoch+1) % DISPLAY_EVERY) - 1)
        tracker.display(epoch=epoch, total_epochs=EPOCHS, clock=estimated_clock, end="\r")
        
    _, next_value = agent.act(state)
    
    log_probs = np.array(log_probs)[:,0,:]
    states    = np.array(states)[:,0,:]
    actions   = np.array(actions)[:,0,:]
    rewards   = np.array(rewards)
    values    = np.array(values)[:,0,:]
    
    returns = PPO_Agent.normalize_rewards(next_value.cpu().detach().numpy(), values, rewards, masks)
    advantages = returns-values
        
    agent.step_end(log_probs, states, actions, np.array(returns), advantages)

[6100/2500 | 2039.92s] Mean: -0.3195 | Max: 2.6391 | Mean Lvl: 0.5400 | Max Lvl: 2.0.                      
[6200/2500 | 1212.60s] Mean: -0.2473 | Max: 2.6472 | Mean Lvl: 0.5800 | Max Lvl: 2.0.                     
[6300/2500 | 1171.48s] Mean: -0.5982 | Max: 2.6571 | Mean Lvl: 0.3400 | Max Lvl: 2.0.                     
[6400/2500 | 1210.47s] Mean: -0.3012 | Max: 4.3826 | Mean Lvl: 0.5100 | Max Lvl: 3.0.                     
[6500/2500 | 1203.83s] Mean: -0.2851 | Max: 3.5492 | Mean Lvl: 0.5200 | Max Lvl: 3.0.                     
[6600/2500 | 1240.72s] Mean: -0.1326 | Max: 3.5107 | Mean Lvl: 0.6300 | Max Lvl: 3.0.                     
[6700/2500 | 1213.76s] Mean: -0.4949 | Max: 1.6542 | Mean Lvl: 0.5000 | Max Lvl: 2.0.                     
[6800/2500 | 1139.23s] Mean: -0.9237 | Max: 0.6803 | Mean Lvl: 0.2000 | Max Lvl: 1.0.                     
[6900/2500 | 1166.80s] Mean: -0.6662 | Max: 0.6783 | Mean Lvl: 0.3800 | Max Lvl: 1.0.                     
[7000/2500 | 1170.84s] Mean: -0.6756

In [11]:
env = ObstacleTowerEnv('./ObstacleTower/obstacletower', retro=False, realtime_mode=True)

INFO:mlagents_envs:
'ObstacleTower-v2.2' started successfully!
Unity Academy name: ObstacleTower-v2.2
        Number of Brains: 1
        Number of Training Brains : 1
        Reset Parameters :
		starting-floor -> 0.0
		visual-theme -> 1.0
		allowed-rooms -> 2.0
		default-theme -> 0.0
		allowed-floors -> 2.0
		agent-perspective -> 1.0
		lighting-type -> 1.0
		dense-reward -> 1.0
		allowed-modules -> 2.0
		tower-seed -> -1.0
		total-floors -> 100.0
Unity brain name: LearningBrain
        Number of Visual Observations (per agent): 1
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 1
        Vector Action space type: discrete
        Vector Action space size (per agent): [3, 3, 2, 3]
        Vector Action descriptions: Movement Forward/Back, Camera, Jump, Movement Left/Right
INFO:gym_unity:1 agents within environment.


In [13]:
env_info = env.reset()

state = Converter.ProcessState(env_info[0])
done, time = False, 1

while not done and time > 0:
    # Act
    distribution, value = agent.act(state=state)
    action = distribution.sample().cpu().numpy()
    action_id = action.argmax()
    onehot_action = [1 if x == action_id else 0 for x in range(action_size)]

    env_info = env.step(Converter.OneHot2Action6(onehot_action))
    state = Converter.ProcessState(env_info[0][0])
    level, time, done = env_info[-1]['current_floor'], env_info[-1]['time_remaining'], env_info[2]  