In [1]:
#!pip install pybullet

In [2]:
#import pybullet_envs

In [3]:
import os
import time
import random
import numpy as np
import matplotlib.pyplot as plt
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
from gym import wrappers
from torch.autograd import Variable
from collections import deque

## Step 1: We initialize the Experience Replay memory

In [4]:
class ReplayBuffer(object):
    """Fixed-capacity store of environment transitions (experience replay).

    Each stored transition is a 5-tuple
    ``(state, next_state, action, reward, done)``. Once ``max_size``
    transitions are held, every new transition overwrites the oldest one.
    """

    def __init__(self, max_size=1e6):
        """Create an empty buffer.

        Args:
            max_size: maximum number of transitions kept. It is coerced to
                ``int`` because the default is written as the float ``1e6``.

        Raises:
            ValueError: if ``max_size`` is not positive.
        """
        # list of stored transitions
        self.storage = []

        # An integer capacity keeps the write pointer an integer as well.
        self.max_size = int(max_size)
        if self.max_size <= 0:
            raise ValueError("max_size must be a positive number")

        # index of the oldest entry, i.e. the next one to be overwritten
        self.ptr = 0

    def add(self, transition):
        """Store one transition, evicting the oldest one when the buffer is full."""
        if len(self.storage) >= self.max_size:
            # buffer full: overwrite the oldest entry and advance the pointer,
            # wrapping back to 0 after the last slot
            self.storage[self.ptr] = transition
            self.ptr = (self.ptr + 1) % self.max_size
        else:
            # buffer not yet full: simply append
            self.storage.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` transitions uniformly at random, with replacement.

        Returns:
            A tuple of five numpy arrays
            ``(states, next_states, actions, rewards, dones)``; ``rewards`` and
            ``dones`` are reshaped into column vectors of shape (batch_size, 1).

        Raises:
            ValueError: if the buffer holds no transitions.
        """
        if not self.storage:
            raise ValueError("cannot sample from an empty replay buffer")

        # random indices into the used part of the buffer
        ind = np.random.randint(0, len(self.storage), size=batch_size)
        batch = [self.storage[i] for i in ind]

        def column(field):
            # np.asarray never demands a zero-copy conversion, so Python
            # scalars (rewards, done flags) are accepted; np.array(...,
            # copy=False) raises for them under NumPy 2.
            return np.array([np.asarray(transition[field]) for transition in batch])

        return (
            column(0),
            column(1),
            column(2),
            column(3).reshape(-1, 1),
            column(4).reshape(-1, 1),
        )

## Step 2: We build one neural network for the Actor model and one neural network for the Actor target

In [5]:
# We now create a class for the Actor neural network. This class will be used for both the actor model and actor target neural network
# also notice that this class is inheriting the torch.nn.module class (here's a link to it's documentation https://pytorch.org/docs/stable/generated/torch.nn.Module.html)
class Actor(nn.Module):
    """Policy network mapping a state to a bounded continuous action.

    The network has two hidden layers (400 and 300 units, ReLU) followed by
    a tanh output layer scaled by ``max_action``, so every action component
    lies in ``[-max_action, max_action]``. The same class serves as both the
    actor model and the actor target.
    """

    def __init__(self, state_dim, action_dim, max_action):
        """Build the layers.

        Args:
            state_dim: number of input features (size of a state).
            action_dim: number of output features (size of an action).
            max_action: scale applied to the tanh output.
        """
        super().__init__()

        hidden_1, hidden_2 = 400, 300
        self.layer_1 = nn.Linear(state_dim, hidden_1)
        self.layer_2 = nn.Linear(hidden_1, hidden_2)
        self.layer_3 = nn.Linear(hidden_2, action_dim)
        self.max_action = max_action

    def forward(self, x):
        """Return the action for state batch ``x``."""
        hidden = F.relu(self.layer_2(F.relu(self.layer_1(x))))

        # tanh yields values in [-1, 1]; scaling stretches that to the action range
        return self.max_action * torch.tanh(self.layer_3(hidden))

## Step 3: We build two neural networks for the two Critic models and two neural networks for the two Critic targets

In [6]:
# We now create a class for the Critic neural network. This class will be used for both the 2 critic model and 2 critic target neural networks
# also notice that this class is inheriting the torch.nn.module class (here's a link to it's documentation https://pytorch.org/docs/stable/generated/torch.nn.Module.html)
class Critic(nn.Module):
    """Twin Q-networks evaluating a (state, action) pair.

    Two independent heads are held in one module. Each head is a
    400 -> 300 -> 1 network whose input is the state concatenated with the
    action and whose single output is a Q-value. The same class serves as
    both the critic model and the critic target.
    """

    def __init__(self, state_dim, action_dim):
        """Build both heads.

        Args:
            state_dim: size of a state vector.
            action_dim: size of an action vector.
        """
        super().__init__()

        in_features = state_dim + action_dim

        # first Q-network
        self.layer_1_1 = nn.Linear(in_features, 400)
        self.layer_1_2 = nn.Linear(400, 300)
        self.layer_1_3 = nn.Linear(300, 1)

        # second Q-network
        self.layer_2_1 = nn.Linear(in_features, 400)
        self.layer_2_2 = nn.Linear(400, 300)
        self.layer_2_3 = nn.Linear(300, 1)

    @staticmethod
    def _run_head(xu, first, second, last):
        """Push ``xu`` through one head; the last layer has no activation."""
        return last(F.relu(second(F.relu(first(xu)))))

    def forward(self, x, u):
        """Return ``(Q1, Q2)`` for state batch ``x`` and action batch ``u``."""
        # join state and action along the feature dimension
        xu = torch.cat([x, u], 1)
        q1 = self._run_head(xu, self.layer_1_1, self.layer_1_2, self.layer_1_3)
        q2 = self._run_head(xu, self.layer_2_1, self.layer_2_2, self.layer_2_3)
        return q1, q2

    def Q1(self, x, u):
        """Return only the first head's Q-value (used for the actor loss)."""
        xu = torch.cat([x, u], 1)
        return self._run_head(xu, self.layer_1_1, self.layer_1_2, self.layer_1_3)

## Steps 4 to 15: Training Process

In [7]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [8]:
# Building the whole Training Process into a class

# this is the key TD3 class where all the required objects are created and the primary functions done
class TD3(object):
    """Twin Delayed DDPG (TD3) agent.

    Holds an actor, a twin critic, a target copy of each, and one Adam
    optimizer for the actor and one for the critic. All networks are placed
    on the notebook-level ``device``.
    """

    def __init__(self, state_dim, action_dim, max_action):
        """Create the networks and optimizers.

        Args:
            state_dim: size of a state vector.
            action_dim: size of an action vector.
            max_action: largest absolute value an action component may take.
        """
        # actor model and actor target; the target starts as an exact copy
        self.actor = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters())

        # critic model and critic target; the target starts as an exact copy
        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = Critic(state_dim, action_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters())

        self.max_action = max_action

    @staticmethod
    def _as_float_tensor(array):
        """Convert a numpy batch to a float32 tensor on ``device``."""
        return torch.as_tensor(array, dtype=torch.float32, device=device)

    @staticmethod
    def _soft_update(source, target, tau):
        """Polyak averaging: target <- tau * source + (1 - tau) * target."""
        for param, target_param in zip(source.parameters(), target.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

    def select_action(self, state):
        """Return the actor model's action for a single state as a flat numpy array.

        The actor model is used here, not the actor target.
        """
        state = self._as_float_tensor(state).reshape(1, -1)

        # inference only: no autograd graph is needed
        with torch.no_grad():
            return self.actor(state).cpu().numpy().flatten()

    def train(self, replay_buffer, iterations, batch_size = 100, discount = 0.99, tau = 0.005, policy_noise = 0.2, noise_clip = 0.5, policy_freq = 2):
        """Run ``iterations`` TD3 updates on batches drawn from ``replay_buffer``.

        Args:
            replay_buffer: object whose ``sample(batch_size)`` returns
                ``(states, next_states, actions, rewards, dones)`` arrays.
            iterations: number of update steps to perform.
            batch_size: number of transitions per update step.
            discount: discount factor gamma used in the Q-value target.
            tau: Polyak averaging rate for the target networks.
            policy_noise: standard deviation of the Gaussian noise added to
                the target action.
            noise_clip: bound on the absolute value of that noise.
            policy_freq: the actor and both target networks are updated only
                on iterations where ``it % policy_freq == 0``.
        """
        for it in range(iterations):

            # sample a batch of transitions (s, s', a, r, done)
            batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = replay_buffer.sample(batch_size)
            state = self._as_float_tensor(batch_states)
            next_state = self._as_float_tensor(batch_next_states)
            action = self._as_float_tensor(batch_actions)
            reward = self._as_float_tensor(batch_rewards)
            done = self._as_float_tensor(batch_dones)

            # The target is a constant with respect to the critic loss, so it
            # is computed without building an autograd graph.
            with torch.no_grad():
                # a' from the actor target, plus clipped Gaussian noise,
                # clamped to the valid action range
                noise = (torch.randn_like(action) * policy_noise).clamp(-noise_clip, noise_clip)
                next_action = (self.actor_target(next_state) + noise).clamp(-self.max_action, self.max_action)

                # take the smaller of the two target critics' Q-values
                target_Q1, target_Q2 = self.critic_target(next_state, next_action)
                target_Q = torch.min(target_Q1, target_Q2)

                # Qt = r + gamma * min(Qt1, Qt2); (1 - done) removes the
                # bootstrap term for transitions flagged as done
                target_Q = reward + (1 - done) * discount * target_Q

            # Q-values the critic model currently predicts for (s, a)
            current_Q1, current_Q2 = self.critic(state, action)

            # both critics regress toward the same target
            critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # delayed update: actor and targets change only every policy_freq iterations
            if it % policy_freq == 0:

                # gradient ascent on Q1(s, actor(s)), written as descent on its negative mean
                actor_loss = -self.critic.Q1(state, self.actor(state)).mean()

                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # move the target networks slowly toward the models
                self._soft_update(self.actor, self.actor_target, tau)
                self._soft_update(self.critic, self.critic_target, tau)

    def save(self, filename, directory):
        """Write the actor and critic weights to ``directory``.

        The files are named ``<filename>_actor.pth`` and ``<filename>_critic.pth``.
        """
        torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename))
        torch.save(self.critic.state_dict(), '%s/%s_critic.pth' % (directory, filename))

    def load(self, filename, directory):
        """Load actor and critic weights written by ``save`` and re-sync the targets.

        Checkpoints are mapped onto the device the networks already live on,
        so a file saved on a GPU can be loaded on a CPU-only machine. The
        target networks are then reset to copies of the loaded models;
        without that they would keep their unrelated initial weights.
        """
        actor_device = next(self.actor.parameters()).device
        critic_device = next(self.critic.parameters()).device

        self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, filename), map_location=actor_device))
        self.critic.load_state_dict(torch.load('%s/%s_critic.pth' % (directory, filename), map_location=critic_device))

        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())

## We make a function that evaluates the policy by calculating its average reward over 10 episodes

In [9]:
def evaluate_policy(policy, eval_episodes=10, eval_env=None):
    """Run the policy for several episodes and return the average episode reward.

    Args:
        policy: object exposing ``select_action(state)``; in this notebook
            that is the TD3 agent.
        eval_episodes: number of episodes to average over.
        eval_env: environment to evaluate on. It must follow the API used
            here: ``reset()`` returns an observation and ``step(action)``
            returns ``(obs, reward, done, info)``. When omitted, the
            notebook-level ``env`` is used.

    Returns:
        The sum of all rewards collected divided by ``eval_episodes``.

    Raises:
        ValueError: if ``eval_episodes`` is not positive.
    """
    if eval_episodes <= 0:
        raise ValueError("eval_episodes must be a positive integer")

    if eval_env is None:
        # backward-compatible default: the global environment of the notebook
        eval_env = env

    # float accumulator for the rewards of all episodes
    total_reward = 0.

    for _ in range(eval_episodes):
        obs = eval_env.reset()
        done = False

        while not done:
            action = policy.select_action(np.array(obs))
            obs, reward, done, _ = eval_env.step(action)
            total_reward += reward

    # cumulative reward -> per-episode average
    avg_reward = total_reward / eval_episodes

    print('--------------------------------------------------------')
    print('Average Reward over the Evaluation Step: %f' % (avg_reward))
    print('--------------------------------------------------------')

    return avg_reward

## We set the parameters

In [10]:
env_name = "Pendulum-v1" # Name of the environment (set it to any continuous-action environment you want)
seed = 0 # Random seed number
# Step counts are integers: they are compared with and combined (via %) with integer counters
start_timesteps = int(1e4) # Number of timesteps during which actions are sampled at random, before the policy network is used
eval_freq = int(5e3) # How often the evaluation step is performed (after how many timesteps)
max_timesteps = int(5e5) # Total number of timesteps
save_models = True # Whether or not to save the trained model
expl_noise = 0.1 # Exploration noise - STD of the Gaussian noise added to the actions chosen during training
batch_size = 100 # Size of the batch
discount = 0.99 # Discount factor gamma, used in the calculation of the total discounted reward
tau = 0.005 # Target network update rate
policy_noise = 0.2 # STD of the Gaussian noise added to the target actions when computing the Q-value target
noise_clip = 0.5 # Maximum absolute value of that target-action noise
policy_freq = 2 # Number of iterations to wait before the policy network (Actor model) is updated

## We create a file name for the two saved models: the Actor and Critic models

In [11]:
# Base name shared by the saved model files and the results file
file_name = "%s_%s_%s" % ("TD3", env_name, str(seed))

# Print the settings banner in one call, one item per line
print(
    "---------------------------------------",
    "Settings: %s" % (file_name),
    "---------------------------------------",
    sep="\n",
)

---------------------------------------
Settings: TD3_Pendulum-v1_0
---------------------------------------


## We create a folder inside which will be saved the trained models

In [12]:
# exist_ok=True makes directory creation idempotent and removes the gap
# between checking for the directory and creating it.
os.makedirs("./results", exist_ok=True)
if save_models:
    # the model directory is only needed when models are going to be saved
    os.makedirs("./pytorch_models", exist_ok=True)

## We create the gym environment

In [13]:
# Instantiate the Gym environment registered under env_name
env = gym.make(env_name)

## We set seeds and we get the necessary information on the states and actions in the chosen environment

In [14]:
# Seed every source of randomness this notebook draws from
env.seed(seed)
# The action space has its own random generator, which env.seed() does not
# touch; without this the random warm-up actions from
# env.action_space.sample() differ from run to run.
env.action_space.seed(seed)
random.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)

# Problem dimensions read from the environment's spaces
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
# upper bound of the first action component, used as the action scale
max_action = float(env.action_space.high[0])

## We create the TD3 agent (the Actor and Critic models together with their target networks)

In [15]:
# Build the TD3 agent: actor, twin critic, their targets and the optimizers
policy = TD3(state_dim, action_dim, max_action)

## We create the Experience Replay memory

In [16]:
# Empty experience-replay memory created with the default max_size
replay_buffer = ReplayBuffer()

## We define a list where all the evaluation results over 10 episodes are stored

In [17]:
# Start the list of evaluation results with the score of the untrained policy
evaluations = [evaluate_policy(policy)]

--------------------------------------------------------
Average Reward over the Evaluation Step: -1400.479691
--------------------------------------------------------


## We create a new folder directory in which the final results (videos of the agent) will be populated

In [18]:
def mkdir(base, name):
    """Return ``base/name``, creating the directory first if it does not exist."""
    target = os.path.join(base, name)
    if os.path.exists(target):
        return target
    os.makedirs(target)
    return target
# Create exp/brs and exp/brs/monitor (the latter is handed to the Monitor wrapper below)
work_dir = mkdir('exp', 'brs')
monitor_dir = mkdir(work_dir, 'monitor')
# Episode step limit, read from the environment before any Monitor wrapping
max_episode_steps = env._max_episode_steps
# Set to True to wrap the environment in a Monitor that records into monitor_dir
save_env_vid = False
if save_env_vid:
  env = wrappers.Monitor(env, monitor_dir, force = True)
  env.reset()

## We initialize the variables

In [19]:
# Step and episode counters all start at zero
total_timesteps = timesteps_since_eval = episode_num = 0
# Starting as "done" makes the training loop reset the environment on its first pass
done = True
# Wall-clock timestamp taken right before training
t0 = time.time()

## Training

In [20]:
# Main training loop: interact with the environment one step at a time until
# max_timesteps steps have been taken. The agent is trained at the end of each
# episode, for as many iterations as the episode had steps.
while total_timesteps < max_timesteps:
    # an episode has just ended (or this is the very first pass)
    if done:
        # after a finished episode: report it and train on the replay buffer
        if total_timesteps != 0:
            print('timesteps not 0')
            print("Total Timesteps: {} Episode Num: {} Reward: {}".format(total_timesteps, episode_num, episode_reward))
            policy.train(replay_buffer, episode_timesteps, batch_size, discount, tau, policy_noise, noise_clip, policy_freq)

        # periodic evaluation, plus saving of the results and (optionally) the model
        if timesteps_since_eval >= eval_freq:
            print('timesteps since eval more than eval freq')
            timesteps_since_eval %= eval_freq
            evaluations.append(evaluate_policy(policy))
            # honour save_models: ./pytorch_models only exists when it is True
            if save_models:
                policy.save(file_name, directory='./pytorch_models')
            np.save('./results/%s' % (file_name), evaluations)

        # start a new episode
        obs = env.reset()
        done = False
        episode_reward = 0
        episode_timesteps = 0
        episode_num += 1

    # warm-up phase: random actions fill the replay buffer with initial data
    if total_timesteps < start_timesteps:
        action = env.action_space.sample()
    else:
        # afterwards the actor picks the action ...
        action = policy.select_action(np.array(obs))
        # ... and Gaussian exploration noise is added, clipped to the valid action range
        if expl_noise != 0:
            action = (action + np.random.normal(0, expl_noise, size=env.action_space.shape[0])).clip(env.action_space.low, env.action_space.high)

    # apply the action and observe the next state and the reward
    new_obs, reward, done, _ = env.step(action)

    # An episode cut off by the step limit is stored as not done (0), so the
    # Q-value target still bootstraps from the next state. max_episode_steps
    # was captured from the environment earlier; reading it from the variable
    # avoids touching a private attribute on a possibly wrapped env.
    done_bool = 0 if episode_timesteps + 1 == max_episode_steps else float(done)

    episode_reward += reward

    # store the transition in the experience replay memory
    replay_buffer.add((obs, new_obs, action, reward, done_bool))

    # advance the state and all counters
    obs = new_obs
    episode_timesteps += 1
    total_timesteps += 1
    timesteps_since_eval += 1

# Final evaluation; save the results and, if requested, the model
evaluations.append(evaluate_policy(policy))
if save_models: policy.save('%s' % (file_name), directory='./pytorch_models')
np.save('./results/%s' % (file_name), evaluations)

timesteps not 0
Total Timesteps: 200 Episode Num: 1 Reward: -1289.9048630392751
timesteps not 0
Total Timesteps: 400 Episode Num: 2 Reward: -863.1462610238716
timesteps not 0
Total Timesteps: 600 Episode Num: 3 Reward: -1459.964404998601
timesteps not 0
Total Timesteps: 800 Episode Num: 4 Reward: -1220.8036976946337
timesteps not 0
Total Timesteps: 1000 Episode Num: 5 Reward: -933.5313329243754
timesteps not 0
Total Timesteps: 1200 Episode Num: 6 Reward: -1051.1878104786335
timesteps not 0
Total Timesteps: 1400 Episode Num: 7 Reward: -1106.930946054087
timesteps not 0
Total Timesteps: 1600 Episode Num: 8 Reward: -896.372327948861
timesteps not 0
Total Timesteps: 1800 Episode Num: 9 Reward: -1405.8152725102436
timesteps not 0
Total Timesteps: 2000 Episode Num: 10 Reward: -954.734615665277
timesteps not 0
Total Timesteps: 2200 Episode Num: 11 Reward: -1066.882866694956
timesteps not 0
Total Timesteps: 2400 Episode Num: 12 Reward: -1829.7846441642628
timesteps not 0
Total Timesteps: 2600 

timesteps not 0
Total Timesteps: 18800 Episode Num: 94 Reward: -117.41923239265073
timesteps not 0
Total Timesteps: 19000 Episode Num: 95 Reward: -240.5224746545498
timesteps not 0
Total Timesteps: 19200 Episode Num: 96 Reward: -288.3300396804313
timesteps not 0
Total Timesteps: 19400 Episode Num: 97 Reward: -117.25058966192036
timesteps not 0
Total Timesteps: 19600 Episode Num: 98 Reward: -228.43820101843684
timesteps not 0
Total Timesteps: 19800 Episode Num: 99 Reward: -120.5016277629517
timesteps not 0
Total Timesteps: 20000 Episode Num: 100 Reward: -225.37586979560376
timesteps since eval more than eval freq
--------------------------------------------------------
Average Reward over the Evaluation Step: -135.414201
--------------------------------------------------------
timesteps not 0
Total Timesteps: 20200 Episode Num: 101 Reward: -238.9532640323789
timesteps not 0
Total Timesteps: 20400 Episode Num: 102 Reward: -228.84188849863813
timesteps not 0
Total Timesteps: 20600 Episode

timesteps not 0
Total Timesteps: 36400 Episode Num: 182 Reward: -229.7156381458689
timesteps not 0
Total Timesteps: 36600 Episode Num: 183 Reward: -128.8626778151869
timesteps not 0
Total Timesteps: 36800 Episode Num: 184 Reward: -2.82750660417411
timesteps not 0
Total Timesteps: 37000 Episode Num: 185 Reward: -120.34828515958964
timesteps not 0
Total Timesteps: 37200 Episode Num: 186 Reward: -120.20335885141873
timesteps not 0
Total Timesteps: 37400 Episode Num: 187 Reward: -223.71667621233004
timesteps not 0
Total Timesteps: 37600 Episode Num: 188 Reward: -120.3340146443045
timesteps not 0
Total Timesteps: 37800 Episode Num: 189 Reward: -129.287869638507
timesteps not 0
Total Timesteps: 38000 Episode Num: 190 Reward: -125.54610983826875
timesteps not 0
Total Timesteps: 38200 Episode Num: 191 Reward: -354.73172030160976
timesteps not 0
Total Timesteps: 38400 Episode Num: 192 Reward: -2.6903150420105018
timesteps not 0
Total Timesteps: 38600 Episode Num: 193 Reward: -123.74763311625783

timesteps not 0
Total Timesteps: 54600 Episode Num: 273 Reward: -128.0210673100832
timesteps not 0
Total Timesteps: 54800 Episode Num: 274 Reward: -120.68422541602753
timesteps not 0
Total Timesteps: 55000 Episode Num: 275 Reward: -121.79406480350187
timesteps since eval more than eval freq
--------------------------------------------------------
Average Reward over the Evaluation Step: -148.505864
--------------------------------------------------------
timesteps not 0
Total Timesteps: 55200 Episode Num: 276 Reward: -119.05875708269804
timesteps not 0
Total Timesteps: 55400 Episode Num: 277 Reward: -234.14620458684087
timesteps not 0
Total Timesteps: 55600 Episode Num: 278 Reward: -226.42976239367866
timesteps not 0
Total Timesteps: 55800 Episode Num: 279 Reward: -114.2557913159124
timesteps not 0
Total Timesteps: 56000 Episode Num: 280 Reward: -124.93299557534186
timesteps not 0
Total Timesteps: 56200 Episode Num: 281 Reward: -124.82863046754458
timesteps not 0
Total Timesteps: 56400

timesteps not 0
Total Timesteps: 72200 Episode Num: 361 Reward: -124.34096883871105
timesteps not 0
Total Timesteps: 72400 Episode Num: 362 Reward: -129.5428378679373
timesteps not 0
Total Timesteps: 72600 Episode Num: 363 Reward: -235.69718921534158
timesteps not 0
Total Timesteps: 72800 Episode Num: 364 Reward: -225.91459579552895
timesteps not 0
Total Timesteps: 73000 Episode Num: 365 Reward: -125.71527932532689
timesteps not 0
Total Timesteps: 73200 Episode Num: 366 Reward: -125.46355548114289
timesteps not 0
Total Timesteps: 73400 Episode Num: 367 Reward: -1.7882575069345976
timesteps not 0
Total Timesteps: 73600 Episode Num: 368 Reward: -125.79993314788909
timesteps not 0
Total Timesteps: 73800 Episode Num: 369 Reward: -226.98536633374502
timesteps not 0
Total Timesteps: 74000 Episode Num: 370 Reward: -122.58918502388775
timesteps not 0
Total Timesteps: 74200 Episode Num: 371 Reward: -122.97248761960958
timesteps not 0
Total Timesteps: 74400 Episode Num: 372 Reward: -118.79213536

--------------------------------------------------------
Average Reward over the Evaluation Step: -82.900740
--------------------------------------------------------
timesteps not 0
Total Timesteps: 90200 Episode Num: 451 Reward: -116.2757585624621
timesteps not 0
Total Timesteps: 90400 Episode Num: 452 Reward: -117.12967273608695
timesteps not 0
Total Timesteps: 90600 Episode Num: 453 Reward: -223.89511792668316
timesteps not 0
Total Timesteps: 90800 Episode Num: 454 Reward: -124.6047951528656
timesteps not 0
Total Timesteps: 91000 Episode Num: 455 Reward: -118.9506266993242
timesteps not 0
Total Timesteps: 91200 Episode Num: 456 Reward: -118.74880582684956
timesteps not 0
Total Timesteps: 91400 Episode Num: 457 Reward: -123.01892131006754
timesteps not 0
Total Timesteps: 91600 Episode Num: 458 Reward: -124.3473523419812
timesteps not 0
Total Timesteps: 91800 Episode Num: 459 Reward: -114.05317907563325
timesteps not 0
Total Timesteps: 92000 Episode Num: 460 Reward: -0.875972735551465

timesteps not 0
Total Timesteps: 108000 Episode Num: 540 Reward: -231.78209613118
timesteps not 0
Total Timesteps: 108200 Episode Num: 541 Reward: -124.96620698386768
timesteps not 0
Total Timesteps: 108400 Episode Num: 542 Reward: -120.90408748502412
timesteps not 0
Total Timesteps: 108600 Episode Num: 543 Reward: -2.2279485345692764
timesteps not 0
Total Timesteps: 108800 Episode Num: 544 Reward: -2.0943067543730542
timesteps not 0
Total Timesteps: 109000 Episode Num: 545 Reward: -2.6458114470477936
timesteps not 0
Total Timesteps: 109200 Episode Num: 546 Reward: -128.28502533487833
timesteps not 0
Total Timesteps: 109400 Episode Num: 547 Reward: -232.49206144252324
timesteps not 0
Total Timesteps: 109600 Episode Num: 548 Reward: -121.9551834571182
timesteps not 0
Total Timesteps: 109800 Episode Num: 549 Reward: -121.42734212536386
timesteps not 0
Total Timesteps: 110000 Episode Num: 550 Reward: -361.5204422078826
timesteps since eval more than eval freq
-----------------------------

timesteps not 0
Total Timesteps: 125400 Episode Num: 627 Reward: -297.13461591527397
timesteps not 0
Total Timesteps: 125600 Episode Num: 628 Reward: -122.45777657127742
timesteps not 0
Total Timesteps: 125800 Episode Num: 629 Reward: -218.06622353351693
timesteps not 0
Total Timesteps: 126000 Episode Num: 630 Reward: -117.18664701365998
timesteps not 0
Total Timesteps: 126200 Episode Num: 631 Reward: -126.57743214452499
timesteps not 0
Total Timesteps: 126400 Episode Num: 632 Reward: -126.26968008571565
timesteps not 0
Total Timesteps: 126600 Episode Num: 633 Reward: -117.07002452967068
timesteps not 0
Total Timesteps: 126800 Episode Num: 634 Reward: -125.37050797416684
timesteps not 0
Total Timesteps: 127000 Episode Num: 635 Reward: -0.5028864099756802
timesteps not 0
Total Timesteps: 127200 Episode Num: 636 Reward: -118.8385150501174
timesteps not 0
Total Timesteps: 127400 Episode Num: 637 Reward: -116.52848765664967
timesteps not 0
Total Timesteps: 127600 Episode Num: 638 Reward: -

timesteps not 0
Total Timesteps: 143400 Episode Num: 717 Reward: -116.82360853562332
timesteps not 0
Total Timesteps: 143600 Episode Num: 718 Reward: -120.50627145458307
timesteps not 0
Total Timesteps: 143800 Episode Num: 719 Reward: -125.51456265895854
timesteps not 0
Total Timesteps: 144000 Episode Num: 720 Reward: -122.02853182603275
timesteps not 0
Total Timesteps: 144200 Episode Num: 721 Reward: -116.09354901426764
timesteps not 0
Total Timesteps: 144400 Episode Num: 722 Reward: -228.2848305485938
timesteps not 0
Total Timesteps: 144600 Episode Num: 723 Reward: -114.59253917194026
timesteps not 0
Total Timesteps: 144800 Episode Num: 724 Reward: -117.76304700269148
timesteps not 0
Total Timesteps: 145000 Episode Num: 725 Reward: -120.16317785807085
timesteps since eval more than eval freq
--------------------------------------------------------
Average Reward over the Evaluation Step: -111.062232
--------------------------------------------------------
timesteps not 0
Total Timest

timesteps not 0
Total Timesteps: 160800 Episode Num: 804 Reward: -129.4791110160085
timesteps not 0
Total Timesteps: 161000 Episode Num: 805 Reward: -239.8346794531895
timesteps not 0
Total Timesteps: 161200 Episode Num: 806 Reward: -124.95831050750796
timesteps not 0
Total Timesteps: 161400 Episode Num: 807 Reward: -12.445275007919044
timesteps not 0
Total Timesteps: 161600 Episode Num: 808 Reward: -124.3041747407093
timesteps not 0
Total Timesteps: 161800 Episode Num: 809 Reward: -363.9375126143566
timesteps not 0
Total Timesteps: 162000 Episode Num: 810 Reward: -128.60082033757485
timesteps not 0
Total Timesteps: 162200 Episode Num: 811 Reward: -246.20064867964618
timesteps not 0
Total Timesteps: 162400 Episode Num: 812 Reward: -129.53128110129978
timesteps not 0
Total Timesteps: 162600 Episode Num: 813 Reward: -5.355867985224817
timesteps not 0
Total Timesteps: 162800 Episode Num: 814 Reward: -120.00716630957822
timesteps not 0
Total Timesteps: 163000 Episode Num: 815 Reward: -126.

timesteps not 0
Total Timesteps: 178800 Episode Num: 894 Reward: -233.4155298554491
timesteps not 0
Total Timesteps: 179000 Episode Num: 895 Reward: -116.29894443858566
timesteps not 0
Total Timesteps: 179200 Episode Num: 896 Reward: -223.34617401385776
timesteps not 0
Total Timesteps: 179400 Episode Num: 897 Reward: -230.6021541360065
timesteps not 0
Total Timesteps: 179600 Episode Num: 898 Reward: -126.0245838863683
timesteps not 0
Total Timesteps: 179800 Episode Num: 899 Reward: -1.2288234685450883
timesteps not 0
Total Timesteps: 180000 Episode Num: 900 Reward: -124.99994639908014
timesteps since eval more than eval freq
--------------------------------------------------------
Average Reward over the Evaluation Step: -124.372689
--------------------------------------------------------
timesteps not 0
Total Timesteps: 180200 Episode Num: 901 Reward: -121.99148593394166
timesteps not 0
Total Timesteps: 180400 Episode Num: 902 Reward: -123.85371000252194
timesteps not 0
Total Timestep

timesteps not 0
Total Timesteps: 196200 Episode Num: 981 Reward: -224.3146176232199
timesteps not 0
Total Timesteps: 196400 Episode Num: 982 Reward: -113.8136137486279
timesteps not 0
Total Timesteps: 196600 Episode Num: 983 Reward: -252.0987701515706
timesteps not 0
Total Timesteps: 196800 Episode Num: 984 Reward: -236.91524657069442
timesteps not 0
Total Timesteps: 197000 Episode Num: 985 Reward: -117.09489049346874
timesteps not 0
Total Timesteps: 197200 Episode Num: 986 Reward: -117.78224247638822
timesteps not 0
Total Timesteps: 197400 Episode Num: 987 Reward: -119.56469354912625
timesteps not 0
Total Timesteps: 197600 Episode Num: 988 Reward: -229.9944373379731
timesteps not 0
Total Timesteps: 197800 Episode Num: 989 Reward: -115.14355575302403
timesteps not 0
Total Timesteps: 198000 Episode Num: 990 Reward: -124.59546436587557
timesteps not 0
Total Timesteps: 198200 Episode Num: 991 Reward: -1.7095706580765682
timesteps not 0
Total Timesteps: 198400 Episode Num: 992 Reward: -1.6

timesteps not 0
Total Timesteps: 214000 Episode Num: 1070 Reward: -322.6195823359443
timesteps not 0
Total Timesteps: 214200 Episode Num: 1071 Reward: -9.074466103215194
timesteps not 0
Total Timesteps: 214400 Episode Num: 1072 Reward: -8.850920762707783
timesteps not 0
Total Timesteps: 214600 Episode Num: 1073 Reward: -222.9054556360379
timesteps not 0
Total Timesteps: 214800 Episode Num: 1074 Reward: -129.20033013383662
timesteps not 0
Total Timesteps: 215000 Episode Num: 1075 Reward: -239.54180009449087
timesteps since eval more than eval freq
--------------------------------------------------------
Average Reward over the Evaluation Step: -149.091468
--------------------------------------------------------
timesteps not 0
Total Timesteps: 215200 Episode Num: 1076 Reward: -125.5417885307917
timesteps not 0
Total Timesteps: 215400 Episode Num: 1077 Reward: -239.37167681961805
timesteps not 0
Total Timesteps: 215600 Episode Num: 1078 Reward: -232.5352000321118
timesteps not 0
Total Ti

timesteps not 0
Total Timesteps: 231200 Episode Num: 1156 Reward: -241.4938892748413
timesteps not 0
Total Timesteps: 231400 Episode Num: 1157 Reward: -127.99696119879515
timesteps not 0
Total Timesteps: 231600 Episode Num: 1158 Reward: -223.28561147626436
timesteps not 0
Total Timesteps: 231800 Episode Num: 1159 Reward: -128.97252206193326
timesteps not 0
Total Timesteps: 232000 Episode Num: 1160 Reward: -128.6797018472803
timesteps not 0
Total Timesteps: 232200 Episode Num: 1161 Reward: -9.194321900641441
timesteps not 0
Total Timesteps: 232400 Episode Num: 1162 Reward: -122.50314640339523
timesteps not 0
Total Timesteps: 232600 Episode Num: 1163 Reward: -292.87206048225676
timesteps not 0
Total Timesteps: 232800 Episode Num: 1164 Reward: -120.47378015057394
timesteps not 0
Total Timesteps: 233000 Episode Num: 1165 Reward: -125.42355960470671
timesteps not 0
Total Timesteps: 233200 Episode Num: 1166 Reward: -247.82780396339052
timesteps not 0
Total Timesteps: 233400 Episode Num: 1167

timesteps not 0
Total Timesteps: 249000 Episode Num: 1245 Reward: -234.69336557303617
timesteps not 0
Total Timesteps: 249200 Episode Num: 1246 Reward: -122.19495337657851
timesteps not 0
Total Timesteps: 249400 Episode Num: 1247 Reward: -248.73103344055082
timesteps not 0
Total Timesteps: 249600 Episode Num: 1248 Reward: -225.38430411670035
timesteps not 0
Total Timesteps: 249800 Episode Num: 1249 Reward: -118.10932847848302
timesteps not 0
Total Timesteps: 250000 Episode Num: 1250 Reward: -129.77557925580174
timesteps since eval more than eval freq
--------------------------------------------------------
Average Reward over the Evaluation Step: -158.557349
--------------------------------------------------------
timesteps not 0
Total Timesteps: 250200 Episode Num: 1251 Reward: -3.415306985345081
timesteps not 0
Total Timesteps: 250400 Episode Num: 1252 Reward: -133.77867966090577
timesteps not 0
Total Timesteps: 250600 Episode Num: 1253 Reward: -338.44633626447006
timesteps not 0
Tot

timesteps not 0
Total Timesteps: 266200 Episode Num: 1331 Reward: -122.98364225449852
timesteps not 0
Total Timesteps: 266400 Episode Num: 1332 Reward: -124.46672803635788
timesteps not 0
Total Timesteps: 266600 Episode Num: 1333 Reward: -121.0453436030708
timesteps not 0
Total Timesteps: 266800 Episode Num: 1334 Reward: -129.55753233411704
timesteps not 0
Total Timesteps: 267000 Episode Num: 1335 Reward: -120.24637265580381
timesteps not 0
Total Timesteps: 267200 Episode Num: 1336 Reward: -124.65235536205614
timesteps not 0
Total Timesteps: 267400 Episode Num: 1337 Reward: -7.709229331517513
timesteps not 0
Total Timesteps: 267600 Episode Num: 1338 Reward: -9.227795457506307
timesteps not 0
Total Timesteps: 267800 Episode Num: 1339 Reward: -130.66373227572856
timesteps not 0
Total Timesteps: 268000 Episode Num: 1340 Reward: -124.83049123196342
timesteps not 0
Total Timesteps: 268200 Episode Num: 1341 Reward: -7.486822030110225
timesteps not 0
Total Timesteps: 268400 Episode Num: 1342 

timesteps not 0
Total Timesteps: 284000 Episode Num: 1420 Reward: -777.6387097558735
timesteps not 0
Total Timesteps: 284200 Episode Num: 1421 Reward: -900.1219146661598
timesteps not 0
Total Timesteps: 284400 Episode Num: 1422 Reward: -225.36940619329724
timesteps not 0
Total Timesteps: 284600 Episode Num: 1423 Reward: -125.66566916275167
timesteps not 0
Total Timesteps: 284800 Episode Num: 1424 Reward: -2.0461739298979946
timesteps not 0
Total Timesteps: 285000 Episode Num: 1425 Reward: -122.00460392189231
timesteps since eval more than eval freq
--------------------------------------------------------
Average Reward over the Evaluation Step: -135.343997
--------------------------------------------------------
timesteps not 0
Total Timesteps: 285200 Episode Num: 1426 Reward: -125.14548530634455
timesteps not 0
Total Timesteps: 285400 Episode Num: 1427 Reward: -222.00327744626108
timesteps not 0
Total Timesteps: 285600 Episode Num: 1428 Reward: -4.98693440635118
timesteps not 0
Total 

timesteps not 0
Total Timesteps: 301200 Episode Num: 1506 Reward: -131.39929942793012
timesteps not 0
Total Timesteps: 301400 Episode Num: 1507 Reward: -131.086907890606
timesteps not 0
Total Timesteps: 301600 Episode Num: 1508 Reward: -253.41322329035262
timesteps not 0
Total Timesteps: 301800 Episode Num: 1509 Reward: -135.28837473156395
timesteps not 0
Total Timesteps: 302000 Episode Num: 1510 Reward: -125.59514604187497
timesteps not 0
Total Timesteps: 302200 Episode Num: 1511 Reward: -310.44983526279543
timesteps not 0
Total Timesteps: 302400 Episode Num: 1512 Reward: -248.08702174464918
timesteps not 0
Total Timesteps: 302600 Episode Num: 1513 Reward: -124.90984708593267
timesteps not 0
Total Timesteps: 302800 Episode Num: 1514 Reward: -239.46865772221403
timesteps not 0
Total Timesteps: 303000 Episode Num: 1515 Reward: -128.00512264781614
timesteps not 0
Total Timesteps: 303200 Episode Num: 1516 Reward: -234.80143586444268
timesteps not 0
Total Timesteps: 303400 Episode Num: 151

timesteps not 0
Total Timesteps: 319000 Episode Num: 1595 Reward: -129.07841768792807
timesteps not 0
Total Timesteps: 319200 Episode Num: 1596 Reward: -16.320169053105467
timesteps not 0
Total Timesteps: 319400 Episode Num: 1597 Reward: -14.264199675487845
timesteps not 0
Total Timesteps: 319600 Episode Num: 1598 Reward: -138.12505176060142
timesteps not 0
Total Timesteps: 319800 Episode Num: 1599 Reward: -136.8189728354454
timesteps not 0
Total Timesteps: 320000 Episode Num: 1600 Reward: -368.80149222421437
timesteps since eval more than eval freq
--------------------------------------------------------
Average Reward over the Evaluation Step: -135.329445
--------------------------------------------------------
timesteps not 0
Total Timesteps: 320200 Episode Num: 1601 Reward: -12.627675194297321
timesteps not 0
Total Timesteps: 320400 Episode Num: 1602 Reward: -130.85852377775464
timesteps not 0
Total Timesteps: 320600 Episode Num: 1603 Reward: -322.4448409215611
timesteps not 0
Tota

timesteps not 0
Total Timesteps: 336200 Episode Num: 1681 Reward: -233.30960912585306
timesteps not 0
Total Timesteps: 336400 Episode Num: 1682 Reward: -240.53719381533244
timesteps not 0
Total Timesteps: 336600 Episode Num: 1683 Reward: -125.4993065550555
timesteps not 0
Total Timesteps: 336800 Episode Num: 1684 Reward: -124.95032661058536
timesteps not 0
Total Timesteps: 337000 Episode Num: 1685 Reward: -123.32569067410466
timesteps not 0
Total Timesteps: 337200 Episode Num: 1686 Reward: -125.09038269353549
timesteps not 0
Total Timesteps: 337400 Episode Num: 1687 Reward: -344.0881198228079
timesteps not 0
Total Timesteps: 337600 Episode Num: 1688 Reward: -234.36371642716534
timesteps not 0
Total Timesteps: 337800 Episode Num: 1689 Reward: -230.91619384758886
timesteps not 0
Total Timesteps: 338000 Episode Num: 1690 Reward: -237.4661593555551
timesteps not 0
Total Timesteps: 338200 Episode Num: 1691 Reward: -232.52597195554904
timesteps not 0
Total Timesteps: 338400 Episode Num: 1692

timesteps not 0
Total Timesteps: 354000 Episode Num: 1770 Reward: -118.33478106069916
timesteps not 0
Total Timesteps: 354200 Episode Num: 1771 Reward: -123.25005062049472
timesteps not 0
Total Timesteps: 354400 Episode Num: 1772 Reward: -224.1305296232785
timesteps not 0
Total Timesteps: 354600 Episode Num: 1773 Reward: -116.95627662117593
timesteps not 0
Total Timesteps: 354800 Episode Num: 1774 Reward: -3.5409929466813406
timesteps not 0
Total Timesteps: 355000 Episode Num: 1775 Reward: -126.41967122969346
timesteps since eval more than eval freq
--------------------------------------------------------
Average Reward over the Evaluation Step: -114.305852
--------------------------------------------------------
timesteps not 0
Total Timesteps: 355200 Episode Num: 1776 Reward: -220.05740862454638
timesteps not 0
Total Timesteps: 355400 Episode Num: 1777 Reward: -323.46020477923247
timesteps not 0
Total Timesteps: 355600 Episode Num: 1778 Reward: -220.093716531522
timesteps not 0
Total

timesteps not 0
Total Timesteps: 371200 Episode Num: 1856 Reward: -125.44927787957042
timesteps not 0
Total Timesteps: 371400 Episode Num: 1857 Reward: -113.62289000346344
timesteps not 0
Total Timesteps: 371600 Episode Num: 1858 Reward: -128.16997232788123
timesteps not 0
Total Timesteps: 371800 Episode Num: 1859 Reward: -114.8576007932158
timesteps not 0
Total Timesteps: 372000 Episode Num: 1860 Reward: -128.9686023042708
timesteps not 0
Total Timesteps: 372200 Episode Num: 1861 Reward: -221.50073026534372
timesteps not 0
Total Timesteps: 372400 Episode Num: 1862 Reward: -120.91254928932783
timesteps not 0
Total Timesteps: 372600 Episode Num: 1863 Reward: -127.29313038640443
timesteps not 0
Total Timesteps: 372800 Episode Num: 1864 Reward: -219.69821491001176
timesteps not 0
Total Timesteps: 373000 Episode Num: 1865 Reward: -290.3168817360285
timesteps not 0
Total Timesteps: 373200 Episode Num: 1866 Reward: -119.08961090880366
timesteps not 0
Total Timesteps: 373400 Episode Num: 1867

timesteps not 0
Total Timesteps: 389000 Episode Num: 1945 Reward: -123.66242758217753
timesteps not 0
Total Timesteps: 389200 Episode Num: 1946 Reward: -116.79074764917851
timesteps not 0
Total Timesteps: 389400 Episode Num: 1947 Reward: -133.05375063975907
timesteps not 0
Total Timesteps: 389600 Episode Num: 1948 Reward: -217.9592665360316
timesteps not 0
Total Timesteps: 389800 Episode Num: 1949 Reward: -220.4627153992924
timesteps not 0
Total Timesteps: 390000 Episode Num: 1950 Reward: -121.09846228713704
timesteps since eval more than eval freq
--------------------------------------------------------
Average Reward over the Evaluation Step: -161.848018
--------------------------------------------------------
timesteps not 0
Total Timesteps: 390200 Episode Num: 1951 Reward: -312.5611553818514
timesteps not 0
Total Timesteps: 390400 Episode Num: 1952 Reward: -325.6779375163869
timesteps not 0
Total Timesteps: 390600 Episode Num: 1953 Reward: -252.98227953285644
timesteps not 0
Total 

timesteps not 0
Total Timesteps: 406200 Episode Num: 2031 Reward: -2.839236402273535
timesteps not 0
Total Timesteps: 406400 Episode Num: 2032 Reward: -128.67821851857173
timesteps not 0
Total Timesteps: 406600 Episode Num: 2033 Reward: -117.89197248810368
timesteps not 0
Total Timesteps: 406800 Episode Num: 2034 Reward: -122.37387755824847
timesteps not 0
Total Timesteps: 407000 Episode Num: 2035 Reward: -129.87644313091366
timesteps not 0
Total Timesteps: 407200 Episode Num: 2036 Reward: -128.59530229870168
timesteps not 0
Total Timesteps: 407400 Episode Num: 2037 Reward: -121.75766222952676
timesteps not 0
Total Timesteps: 407600 Episode Num: 2038 Reward: -231.1419325320075
timesteps not 0
Total Timesteps: 407800 Episode Num: 2039 Reward: -128.8423874624029
timesteps not 0
Total Timesteps: 408000 Episode Num: 2040 Reward: -336.3823909296596
timesteps not 0
Total Timesteps: 408200 Episode Num: 2041 Reward: -251.5329740170837
timesteps not 0
Total Timesteps: 408400 Episode Num: 2042 R

timesteps not 0
Total Timesteps: 424000 Episode Num: 2120 Reward: -1.3428263003050238
timesteps not 0
Total Timesteps: 424200 Episode Num: 2121 Reward: -122.48020155042207
timesteps not 0
Total Timesteps: 424400 Episode Num: 2122 Reward: -2.464217780211263
timesteps not 0
Total Timesteps: 424600 Episode Num: 2123 Reward: -3.9909708607776726
timesteps not 0
Total Timesteps: 424800 Episode Num: 2124 Reward: -117.9481024941555
timesteps not 0
Total Timesteps: 425000 Episode Num: 2125 Reward: -120.84245785761813
timesteps since eval more than eval freq
--------------------------------------------------------
Average Reward over the Evaluation Step: -163.019528
--------------------------------------------------------
timesteps not 0
Total Timesteps: 425200 Episode Num: 2126 Reward: -221.8327075875191
timesteps not 0
Total Timesteps: 425400 Episode Num: 2127 Reward: -124.76393802017158
timesteps not 0
Total Timesteps: 425600 Episode Num: 2128 Reward: -220.94752331755564
timesteps not 0
Total

timesteps not 0
Total Timesteps: 441200 Episode Num: 2206 Reward: -118.44008466110382
timesteps not 0
Total Timesteps: 441400 Episode Num: 2207 Reward: -118.42992444222169
timesteps not 0
Total Timesteps: 441600 Episode Num: 2208 Reward: -117.88869144032668
timesteps not 0
Total Timesteps: 441800 Episode Num: 2209 Reward: -128.9932638026795
timesteps not 0
Total Timesteps: 442000 Episode Num: 2210 Reward: -116.88301874625921
timesteps not 0
Total Timesteps: 442200 Episode Num: 2211 Reward: -232.7548487256497
timesteps not 0
Total Timesteps: 442400 Episode Num: 2212 Reward: -120.33386332034078
timesteps not 0
Total Timesteps: 442600 Episode Num: 2213 Reward: -119.16638713039107
timesteps not 0
Total Timesteps: 442800 Episode Num: 2214 Reward: -121.01424608219727
timesteps not 0
Total Timesteps: 443000 Episode Num: 2215 Reward: -117.7532725843575
timesteps not 0
Total Timesteps: 443200 Episode Num: 2216 Reward: -116.28335907309628
timesteps not 0
Total Timesteps: 443400 Episode Num: 2217

timesteps not 0
Total Timesteps: 459000 Episode Num: 2295 Reward: -218.27805151763647
timesteps not 0
Total Timesteps: 459200 Episode Num: 2296 Reward: -0.5657590341407389
timesteps not 0
Total Timesteps: 459400 Episode Num: 2297 Reward: -116.30437457844826
timesteps not 0
Total Timesteps: 459600 Episode Num: 2298 Reward: -124.83563100979528
timesteps not 0
Total Timesteps: 459800 Episode Num: 2299 Reward: -120.69715745845703
timesteps not 0
Total Timesteps: 460000 Episode Num: 2300 Reward: -115.75043354534083
timesteps since eval more than eval freq
--------------------------------------------------------
Average Reward over the Evaluation Step: -136.351860
--------------------------------------------------------
timesteps not 0
Total Timesteps: 460200 Episode Num: 2301 Reward: -219.96269496861888
timesteps not 0
Total Timesteps: 460400 Episode Num: 2302 Reward: -118.28017404168683
timesteps not 0
Total Timesteps: 460600 Episode Num: 2303 Reward: -118.79041055978333
timesteps not 0
To

timesteps not 0
Total Timesteps: 476200 Episode Num: 2381 Reward: -0.9208141125072994
timesteps not 0
Total Timesteps: 476400 Episode Num: 2382 Reward: -286.51135040902847
timesteps not 0
Total Timesteps: 476600 Episode Num: 2383 Reward: -2.872597810998436
timesteps not 0
Total Timesteps: 476800 Episode Num: 2384 Reward: -124.4811608499131
timesteps not 0
Total Timesteps: 477000 Episode Num: 2385 Reward: -218.89281785723358
timesteps not 0
Total Timesteps: 477200 Episode Num: 2386 Reward: -115.29030178192234
timesteps not 0
Total Timesteps: 477400 Episode Num: 2387 Reward: -124.12508618164411
timesteps not 0
Total Timesteps: 477600 Episode Num: 2388 Reward: -120.26325729437791
timesteps not 0
Total Timesteps: 477800 Episode Num: 2389 Reward: -118.74634311045406
timesteps not 0
Total Timesteps: 478000 Episode Num: 2390 Reward: -125.72777895008143
timesteps not 0
Total Timesteps: 478200 Episode Num: 2391 Reward: -309.78283427656146
timesteps not 0
Total Timesteps: 478400 Episode Num: 239

timesteps not 0
Total Timesteps: 494000 Episode Num: 2470 Reward: -116.50466556628673
timesteps not 0
Total Timesteps: 494200 Episode Num: 2471 Reward: -127.83694390691694
timesteps not 0
Total Timesteps: 494400 Episode Num: 2472 Reward: -246.52088704025604
timesteps not 0
Total Timesteps: 494600 Episode Num: 2473 Reward: -322.0994792660175
timesteps not 0
Total Timesteps: 494800 Episode Num: 2474 Reward: -117.0674328161748
timesteps not 0
Total Timesteps: 495000 Episode Num: 2475 Reward: -220.37543834603892
timesteps since eval more than eval freq
--------------------------------------------------------
Average Reward over the Evaluation Step: -148.130912
--------------------------------------------------------
timesteps not 0
Total Timesteps: 495200 Episode Num: 2476 Reward: -330.25806819020846
timesteps not 0
Total Timesteps: 495400 Episode Num: 2477 Reward: -1.003443458863845
timesteps not 0
Total Timesteps: 495600 Episode Num: 2478 Reward: -222.73606342115053
timesteps not 0
Total