In [1]:
#Imports
import gym
import tensorflow as tf
import numpy as np
import random

load_model           = True
environment_name     = "BipedalWalker-v2"
saver_file           = "./models/bipedal"
learning_rate        =.000001
frame_limit          = 400
discount_decay_rate  = .95
n_epochs             = 5001
random_action_probability_range  = [.05, .20]
random_action_repeat_steps_range = [ 5,  20]

    
def helper_discount_rewards(rewards, discount_rate):
    #Takes in rewards and applies discount rate
    discounted_rewards = np.zeros(len(rewards))
    cumulative_rewards = 0
    for step in reversed(range(len(rewards))):
        cumulative_rewards = rewards[step] + cumulative_rewards * discount_rate
        discounted_rewards[step] = cumulative_rewards

    relevant_discounted_rewards = discounted_rewards

    reward_mean = relevant_discounted_rewards.mean()
    reward_std = relevant_discounted_rewards.std()

    return [(discounted_reward - reward_mean)/reward_std for discounted_reward in discounted_rewards]

# def helper_discount_rewards(rewards, discount_rate, begin_index, end_index):
#     #Takes in rewards and applies discount rate
#     discounted_rewards = []
    
#     for i in range(len(rewards)):
#         this_reward = 0.0
#         exponent = 1
#         for j in range(i,len(rewards)):
#             this_reward += rewards[j]*discount_rate**exponent
#             exponent += 1
#         discounted_rewards.append(this_reward)
#     return discounted_rewards
    
reward_test = [1,1,1,0,1,1,1]
print(helper_discount_rewards(reward_test, 0.9))



[1.5444718295843003, 0.9837496933802429, 0.36072509759795657, -0.3315244532712502, -0.08340961623176471, -0.8250074686376072, -1.6490050824218765]


In [2]:
# build the NN
reg_param = 0.00001

value_scale = 0.5  #how much does the value vs progressive gains influence the loss

activ = tf.nn.elu

n_inputs = 4
n_outputs = 4
n_obs_inputs = 24

n_hidden1 = 256
n_hidden2 = 256
n_hidden3 = 256
n_hidden4 = 256
n_hidden5 = 256
n_hidden6 = 128
n_hidden7 = 64
n_hidden8 = 32
initializer = tf.contrib.layers.variance_scaling_initializer()

tf.reset_default_graph()

tf_input_obs = tf.placeholder(tf.float32, shape=(None, n_obs_inputs))
tf_input_action = tf.placeholder(tf.float32, shape=(None, n_inputs))
tf_input_learning_rate = tf.placeholder(tf.float32)
tf_input_reward = tf.placeholder(tf.float32)

input_layer = tf.layers.dense(tf_input_obs, n_obs_inputs, activation=activ, name="input", kernel_initializer=initializer)

hidden1 = tf.layers.dense(input_layer, n_hidden1, activation=activ, name="hidden1", kernel_initializer=initializer)
hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=activ, name="hidden2", kernel_initializer=initializer)
hidden3 = tf.layers.dense(hidden2, n_hidden3, activation=activ, name="hidden3", kernel_initializer=initializer)
hidden4 = tf.layers.dense(hidden3, n_hidden4, activation=activ, name="hidden4", kernel_initializer=initializer)
hidden5 = tf.layers.dense(hidden4, n_hidden5, activation=activ, name="hidden5", kernel_initializer=initializer)
hidden6 = tf.layers.dense(hidden5, n_hidden6, activation=activ, name="hidden6", kernel_initializer=initializer)
hidden7 = tf.layers.dense(hidden6, n_hidden7, activation=activ, name="hidden7", kernel_initializer=initializer)
hidden8 = tf.layers.dense(hidden7, n_hidden8, activation=activ, name="hidden8", kernel_initializer=initializer)

logits = tf.layers.dense(hidden8, n_outputs, name="output")
#value  = tf.layers.dense(hidden8, 1,         name='value')

#pg_loss    = tf.reduce_mean(1.0 - tf_input_reward) * tf.reduce_mean(tf.square(logits-tf_input_action))
#value_loss = value_scale * tf.reduce_mean(tf.square(1.0- tf_input_reward))
#loss = pg_loss + value_loss

input_loss = tf.reduce_mean(tf.square(logits-tf_input_action))
#loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=tf_input_action)
#value_loss = 0.5 * tf.reduce_sum(tf.square(tf_input_reward - tf.reshape(value,[-1])))
reward_loss = 0.5 * tf_input_reward
loss = tf.reduce_sum(reward_loss + input_loss)
optimizer = tf.train.AdamOptimizer(tf_input_learning_rate)
training_op = optimizer.minimize(loss)

# regularization loss
#policy_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="policy_network")
        
# compute policy loss and regularization loss
# cross_entropy_loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tf_input_action)
# pg_loss            = tf.reduce_mean(cross_entropy_loss)
# reg_loss           = tf.reduce_sum([tf.reduce_sum(tf.square(x)) for x in policy_network_variables])
# loss               = pg_loss + reg_param * reg_loss

# gradients = optimizer.compute_gradients(loss)

# compute policy gradients
# for i, (grad, var) in enumerate(gradients):
#     if grad is not None:
#         gradients[i] = (grad * tf_input_reward, var)


# training_op = optimizer.apply_gradients(gradients)

init = tf.global_variables_initializer()
saver = tf.train.Saver()


In [3]:
env = gym.make(environment_name)
global_step = 0

with tf.Session() as sess:
    if load_model:
        print("Loading existing model before training: {}".format(saver_file))
        saver.restore(sess, saver_file)
    else:
        print("Creating new model before training: {}".format(saver_file))
        sess.run(init)
    
    for epoch in range(n_epochs):  #play n_epochs games
        game_step_counter = 0
        random_action_counter = 0
        
        all_actions = []
        all_rewards = []
        all_observations = []
        all_logits = []

        observation = env.reset()
        input_action = env.action_space.sample()  #set an initial input value
        
        while True:
            random_action_probability  = random.uniform(random_action_probability_range[0], random_action_probability_range[1])
            random_action_repeat_steps = random.randint(random_action_repeat_steps_range[0], random_action_repeat_steps_range[1])

            if epoch % 10 == 0:
                random_action_probability = 0.0
                
            #feed data into the AI to get action from the AI
            feed_dict = {tf_input_obs    : np.reshape(observation, (1, len(observation))), 
                         tf_input_action : np.reshape(input_action, (1, len(input_action))),
                         tf_input_learning_rate: 0.0,
                         tf_input_reward: 0.0}
            logits_out = sess.run([logits], feed_dict=feed_dict)

            # set the input value to feed into the next step
            
            if random_action_counter > 0 and random_action_counter < random_action_repeat_steps:
                random_action_counter += 1
            elif random_action_counter >= random_action_repeat_steps:
                random_action_counter = 0
            elif random.uniform(0.0, 1.0) < random_action_probability:
                input_action = env.action_space.sample()
                random_action_counter += 1
                #print("Random action {}: ".format(input_action))
            else:
                input_action = logits_out[0][0]

            #run the next step given the input from the logits
            observation, reward_float, done_bool, info_dict = env.step(input_action)
                
            #add the data to our lists
            all_observations.append(observation)
            all_logits.append(logits_out[0][0])
            all_actions.append(input_action)
            all_rewards.append(reward_float)

            if done_bool:
                break
                
            if game_step_counter > frame_limit:
                break

            if epoch % 1 == 0:
                env.render()  #display the current frame.
               
            game_step_counter += 1
        
        if epoch % 50 == 0 and epoch > 0:
            print("Saving model at epoch {}: {}".format(epoch, saver_file))
            saver.save(sess, saver_file)

        num_frames = len(all_actions)
        discounted_rewards = helper_discount_rewards(all_rewards, discount_decay_rate) #-1-frames_to_skip_end
        discounted_rewards_median = np.median(discounted_rewards)
        discounted_rewards_mean = np.mean(discounted_rewards)
        
        discounted_rewards_max = np.amax(discounted_rewards)
        
        loss_out_sum = 0.0
        all_losses = []
        all_loss_adders = []
        training_frame_counter = 0
        #use stored data to train
        for i in range(num_frames): 
            train_input_action = all_actions[i]
            
#             if discounted_rewards[i] + all_rewards[i] >= 1.0:
#                 train_input_action = all_actions[i]
#                 training_frame_counter += 1
#             else:
#                 continue
            
            loss_adder = discounted_rewards[i] - discounted_rewards_max
            if discounted_rewards[i] + all_rewards[i] >= 0:
                loss_adder = 0
    
            feed_dict = {tf_input_obs : np.reshape(all_observations[i], (1, len(all_observations[i]))), 
                         tf_input_action : np.reshape(train_input_action, (1, len(train_input_action))),
                         tf_input_learning_rate: learning_rate,
                         tf_input_reward: loss_adder}
        
            loss_out, _, logits_out = sess.run([loss, training_op, logits], feed_dict=feed_dict)
            loss_out_sum += abs(loss_out)
            all_losses.append(loss_out)
            all_loss_adders.append(loss_adder)

        print("Epoch: " + str(epoch) + ", rwd total: " + str(round(np.sum(all_rewards), 2)) + ", avg loss: " + str(round(loss_out_sum/num_frames, 2)) + ", rand prob: " + str(random_action_probability))

        #TODO - should learning rate decrease over time?
        print("")


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Loading existing model before training: ./models/bipedal
INFO:tensorflow:Restoring parameters from ./models/bipedal
Epoch: 0, rwd total: -107.2, avg loss: 0.25, rand prob: 0.0

Epoch: 1, rwd total: -115.71, avg loss: 0.35, rand prob: 0.1341494859239328

Epoch: 2, rwd total: -109.34, avg loss: 0.38, rand prob: 0.11466175438453255

Epoch: 3, rwd total: -107.24, avg loss: 0.47, rand prob: 0.08426870632714331

Epoch: 4, rwd total: -110.18, avg loss: 0.46, rand prob: 0.13657842438289816

Epoch: 5, rwd total: -106.33, avg loss: 0.42, rand prob: 0.0732504196770238

Epoch: 6, rwd total: -113.26, avg loss: 0.44, rand prob: 0.05678063517222935

Epoch: 7, rwd total: -136.47, avg loss: 0.31, rand prob: 0.0920195054801734

Epoch: 8, rwd total: -95.16, avg loss: 0.32, rand prob: 0.051119

Epoch: 102, rwd total: -103.18, avg loss: 0.44, rand prob: 0.12495029490380555

Epoch: 103, rwd total: -135.11, avg loss: 0.32, rand prob: 0.07633786743923394

Epoch: 104, rwd total: -105.1, avg loss: 0.32, rand prob: 0.10574704179027099

Epoch: 105, rwd total: -111.5, avg loss: 0.44, rand prob: 0.05278783856016184

Epoch: 106, rwd total: -106.37, avg loss: 0.34, rand prob: 0.11360058698692041

Epoch: 107, rwd total: -117.74, avg loss: 0.47, rand prob: 0.11289331324569749

Epoch: 108, rwd total: -113.18, avg loss: 0.52, rand prob: 0.06214242240791516

Epoch: 109, rwd total: -112.97, avg loss: 0.44, rand prob: 0.14861370159176987

Epoch: 110, rwd total: -107.75, avg loss: 0.24, rand prob: 0.0

Epoch: 111, rwd total: -120.89, avg loss: 0.29, rand prob: 0.11690563512904122

Epoch: 112, rwd total: -115.36, avg loss: 0.4, rand prob: 0.0977187545185573

Epoch: 113, rwd total: -118.87, avg loss: 0.28, rand prob: 0.19873877270830198

Epoch: 114, rwd total: -118.16, avg loss: 0.47, rand prob: 0

Epoch: 206, rwd total: -106.06, avg loss: 0.38, rand prob: 0.1993479953902973

Epoch: 207, rwd total: -123.72, avg loss: 0.35, rand prob: 0.07560063972233788

Epoch: 208, rwd total: -113.19, avg loss: 0.49, rand prob: 0.10365652303674819

Epoch: 209, rwd total: -119.71, avg loss: 0.4, rand prob: 0.1974094391920026

Epoch: 210, rwd total: -128.57, avg loss: 0.24, rand prob: 0.0

Epoch: 211, rwd total: -125.37, avg loss: 0.26, rand prob: 0.17996905452504375

Epoch: 212, rwd total: -121.64, avg loss: 0.39, rand prob: 0.16487988493724087

Epoch: 213, rwd total: -100.83, avg loss: 0.33, rand prob: 0.1970467176594876

Epoch: 214, rwd total: -120.71, avg loss: 0.3, rand prob: 0.13075109067572976

Epoch: 215, rwd total: -109.53, avg loss: 0.34, rand prob: 0.12302832639963047

Epoch: 216, rwd total: -106.21, avg loss: 0.47, rand prob: 0.165744235255711

Epoch: 217, rwd total: -101.22, avg loss: 0.32, rand prob: 0.19193363737142943

Epoch: 218, rwd total: -102.81, avg loss: 0.29, rand prob: 0.07

Epoch: 310, rwd total: -130.14, avg loss: 0.23, rand prob: 0.0

Epoch: 311, rwd total: -106.44, avg loss: 0.32, rand prob: 0.1551153719391609

Epoch: 312, rwd total: -122.34, avg loss: 0.38, rand prob: 0.11039976872986616

Epoch: 313, rwd total: -102.69, avg loss: 0.26, rand prob: 0.11504174836708032

Epoch: 314, rwd total: -128.03, avg loss: 0.32, rand prob: 0.19476961465321274

Epoch: 315, rwd total: -101.63, avg loss: 0.3, rand prob: 0.11191237846395648

Epoch: 316, rwd total: -106.44, avg loss: 0.39, rand prob: 0.10621053930330894

Epoch: 317, rwd total: -111.55, avg loss: 0.35, rand prob: 0.12097637032658748

Epoch: 318, rwd total: -121.95, avg loss: 0.43, rand prob: 0.08663240545866681

Epoch: 319, rwd total: -114.96, avg loss: 0.41, rand prob: 0.17072165277580323

Epoch: 320, rwd total: -127.53, avg loss: 0.27, rand prob: 0.0

Epoch: 321, rwd total: -123.83, avg loss: 0.31, rand prob: 0.11444007709082923

Epoch: 322, rwd total: -106.95, avg loss: 0.46, rand prob: 0.0685532102339

Epoch: 415, rwd total: -122.71, avg loss: 0.31, rand prob: 0.12536767130625862

Epoch: 416, rwd total: -113.52, avg loss: 0.35, rand prob: 0.06911467034175002

Epoch: 417, rwd total: -118.71, avg loss: 0.42, rand prob: 0.12339894294097281

Epoch: 418, rwd total: -116.18, avg loss: 0.33, rand prob: 0.1758188991075823

Epoch: 419, rwd total: -12.7, avg loss: 1.1, rand prob: 0.13264203933028268

Epoch: 420, rwd total: -105.8, avg loss: 0.36, rand prob: 0.0

Epoch: 421, rwd total: -103.43, avg loss: 0.32, rand prob: 0.05051090199037189

Epoch: 422, rwd total: -98.67, avg loss: 0.41, rand prob: 0.12583595713747042

Epoch: 423, rwd total: -108.75, avg loss: 0.4, rand prob: 0.097416195259944

Epoch: 424, rwd total: -103.69, avg loss: 0.25, rand prob: 0.10830083149137353

Epoch: 425, rwd total: -109.76, avg loss: 0.42, rand prob: 0.08694867776452164

Epoch: 426, rwd total: -99.51, avg loss: 0.29, rand prob: 0.07575506563214097

Epoch: 427, rwd total: -118.13, avg loss: 0.45, rand prob: 0.14659

Epoch: 520, rwd total: -98.26, avg loss: 0.28, rand prob: 0.0

Epoch: 521, rwd total: -122.19, avg loss: 0.44, rand prob: 0.1163732872713453

Epoch: 522, rwd total: -102.95, avg loss: 0.42, rand prob: 0.1932182903498451

Epoch: 523, rwd total: -100.43, avg loss: 0.4, rand prob: 0.08878737351375882

Epoch: 524, rwd total: -100.16, avg loss: 0.29, rand prob: 0.18435748512290534

Epoch: 525, rwd total: -100.45, avg loss: 0.44, rand prob: 0.1700060997556203

Epoch: 526, rwd total: -104.58, avg loss: 0.43, rand prob: 0.17779255990545412

Epoch: 527, rwd total: -112.24, avg loss: 0.52, rand prob: 0.17874949683836244

Epoch: 528, rwd total: -124.29, avg loss: 0.29, rand prob: 0.1330402013601949

Epoch: 529, rwd total: -101.49, avg loss: 0.43, rand prob: 0.056957348478391394

Epoch: 530, rwd total: -99.35, avg loss: 0.41, rand prob: 0.0

Epoch: 531, rwd total: -108.58, avg loss: 0.45, rand prob: 0.07471261516791

Epoch: 532, rwd total: -109.9, avg loss: 0.42, rand prob: 0.0957329166429905

Epo

Epoch: 625, rwd total: -115.42, avg loss: 0.4, rand prob: 0.13651599643639026

Epoch: 626, rwd total: -108.9, avg loss: 0.27, rand prob: 0.17055149139845763

Epoch: 627, rwd total: -101.91, avg loss: 0.35, rand prob: 0.11365438123906735

Epoch: 628, rwd total: -25.87, avg loss: 0.93, rand prob: 0.08813276140663712

Epoch: 629, rwd total: -104.9, avg loss: 0.29, rand prob: 0.050184797042832595

Epoch: 630, rwd total: -100.15, avg loss: 0.31, rand prob: 0.0

Epoch: 631, rwd total: -110.04, avg loss: 0.46, rand prob: 0.08340172363026693

Epoch: 632, rwd total: -114.85, avg loss: 0.47, rand prob: 0.19546963275970353

Epoch: 633, rwd total: -112.88, avg loss: 0.48, rand prob: 0.1705142867673355

Epoch: 634, rwd total: -124.84, avg loss: 0.31, rand prob: 0.0672073078257792

Epoch: 635, rwd total: -136.21, avg loss: 0.33, rand prob: 0.09974225275845566

Epoch: 636, rwd total: -109.86, avg loss: 0.46, rand prob: 0.19033828825087823

Epoch: 637, rwd total: -109.2, avg loss: 0.34, rand prob: 0.0

Epoch: 729, rwd total: -107.27, avg loss: 0.44, rand prob: 0.192606187580455

Epoch: 730, rwd total: -100.54, avg loss: 0.38, rand prob: 0.0

Epoch: 731, rwd total: -123.12, avg loss: 0.46, rand prob: 0.06726359172808802

Epoch: 732, rwd total: -108.5, avg loss: 0.49, rand prob: 0.06409330815503933

Epoch: 733, rwd total: -110.7, avg loss: 0.36, rand prob: 0.1083590503638232

Epoch: 734, rwd total: -101.88, avg loss: 0.36, rand prob: 0.12290925244745463

Epoch: 735, rwd total: -103.35, avg loss: 0.38, rand prob: 0.11957556775584868

Epoch: 736, rwd total: -101.03, avg loss: 0.38, rand prob: 0.1324408252874893

Epoch: 737, rwd total: -119.32, avg loss: 0.39, rand prob: 0.0783692608191666

Epoch: 738, rwd total: -102.05, avg loss: 0.38, rand prob: 0.11975700120240984

Epoch: 739, rwd total: -100.73, avg loss: 0.4, rand prob: 0.14216283173071945

Epoch: 740, rwd total: -108.59, avg loss: 0.24, rand prob: 0.0

Epoch: 741, rwd total: -109.8, avg loss: 0.53, rand prob: 0.14876952884491063

E

Epoch: 834, rwd total: -110.41, avg loss: 0.45, rand prob: 0.17591413857792965

Epoch: 835, rwd total: -116.86, avg loss: 0.31, rand prob: 0.16888314149360303

Epoch: 836, rwd total: -111.98, avg loss: 0.3, rand prob: 0.18510357198263727

Epoch: 837, rwd total: -120.84, avg loss: 0.37, rand prob: 0.07397600037280413

Epoch: 838, rwd total: -127.79, avg loss: 0.31, rand prob: 0.11433951538870307

Epoch: 839, rwd total: -113.93, avg loss: 0.39, rand prob: 0.08854834324401994

Epoch: 840, rwd total: -19.02, avg loss: 0.21, rand prob: 0.0

Epoch: 841, rwd total: -101.35, avg loss: 0.41, rand prob: 0.11751710465195904

Epoch: 842, rwd total: -98.98, avg loss: 0.4, rand prob: 0.11086194767916452

Epoch: 843, rwd total: -102.52, avg loss: 0.47, rand prob: 0.07459231509027509

Epoch: 844, rwd total: -92.69, avg loss: 0.31, rand prob: 0.14989546794612973

Epoch: 845, rwd total: -105.23, avg loss: 0.48, rand prob: 0.14856565961277834

Epoch: 846, rwd total: -125.41, avg loss: 0.47, rand prob: 0.

Epoch: 938, rwd total: -106.73, avg loss: 0.46, rand prob: 0.12825756828982088

Epoch: 939, rwd total: -99.5, avg loss: 0.4, rand prob: 0.17715252341773163

Epoch: 940, rwd total: -98.54, avg loss: 0.37, rand prob: 0.0

Epoch: 941, rwd total: -100.24, avg loss: 0.39, rand prob: 0.08270859805294811

Epoch: 942, rwd total: -102.43, avg loss: 0.44, rand prob: 0.16780479798379194

Epoch: 943, rwd total: -100.66, avg loss: 0.38, rand prob: 0.1783233358755852

Epoch: 944, rwd total: -87.24, avg loss: 0.28, rand prob: 0.10983890875059735

Epoch: 945, rwd total: -125.63, avg loss: 0.37, rand prob: 0.11549620300646117

Epoch: 946, rwd total: -123.06, avg loss: 0.34, rand prob: 0.09946667208127799

Epoch: 947, rwd total: -124.91, avg loss: 0.31, rand prob: 0.0698224590328024

Epoch: 948, rwd total: -122.27, avg loss: 0.34, rand prob: 0.1335925240927291

Epoch: 949, rwd total: -106.49, avg loss: 0.39, rand prob: 0.12715130140676315

Saving model at epoch 950: ./models/bipedal
Epoch: 950, rwd tota

Epoch: 1042, rwd total: -99.06, avg loss: 0.38, rand prob: 0.17662927505852338

Epoch: 1043, rwd total: -120.93, avg loss: 0.43, rand prob: 0.0606712375957373

Epoch: 1044, rwd total: -123.56, avg loss: 0.45, rand prob: 0.0542632150267411

Epoch: 1045, rwd total: -107.66, avg loss: 0.42, rand prob: 0.05637773973153872

Epoch: 1046, rwd total: -93.8, avg loss: 0.28, rand prob: 0.058072346229923126

Epoch: 1047, rwd total: -111.14, avg loss: 0.35, rand prob: 0.16571250927956027

Epoch: 1048, rwd total: -11.65, avg loss: 0.92, rand prob: 0.12834853814728744

Epoch: 1049, rwd total: -122.97, avg loss: 0.33, rand prob: 0.09391759586625303

Saving model at epoch 1050: ./models/bipedal
Epoch: 1050, rwd total: -98.33, avg loss: 0.37, rand prob: 0.0

Epoch: 1051, rwd total: -100.13, avg loss: 0.35, rand prob: 0.051041521993333505

Epoch: 1052, rwd total: -108.4, avg loss: 0.5, rand prob: 0.07204984372962425

Epoch: 1053, rwd total: -116.28, avg loss: 0.41, rand prob: 0.12391847124925454

Epoch:

Epoch: 1145, rwd total: -110.93, avg loss: 0.41, rand prob: 0.1332231533359277

Epoch: 1146, rwd total: -98.78, avg loss: 0.38, rand prob: 0.14884260849652703

Epoch: 1147, rwd total: -105.34, avg loss: 0.37, rand prob: 0.06010287092860997

Epoch: 1148, rwd total: -109.16, avg loss: 0.55, rand prob: 0.19232854161612312

Epoch: 1149, rwd total: -108.22, avg loss: 0.4, rand prob: 0.14520286923716058

Saving model at epoch 1150: ./models/bipedal
Epoch: 1150, rwd total: -127.62, avg loss: 0.25, rand prob: 0.0

Epoch: 1151, rwd total: -109.39, avg loss: 0.32, rand prob: 0.10016918479198683

Epoch: 1152, rwd total: -105.87, avg loss: 0.4, rand prob: 0.18981548339494891

Epoch: 1153, rwd total: -121.26, avg loss: 0.4, rand prob: 0.10067402671116878

Epoch: 1154, rwd total: -13.14, avg loss: 1.05, rand prob: 0.17143281861811177

Epoch: 1155, rwd total: -123.79, avg loss: 0.39, rand prob: 0.17480827565628326

Epoch: 1156, rwd total: -97.16, avg loss: 0.36, rand prob: 0.1352701565239003

Epoch: 

Epoch: 1248, rwd total: -101.75, avg loss: 0.37, rand prob: 0.18279461382657713

Epoch: 1249, rwd total: -100.98, avg loss: 0.39, rand prob: 0.11826130015187139

Saving model at epoch 1250: ./models/bipedal
Epoch: 1250, rwd total: -98.45, avg loss: 0.37, rand prob: 0.0

Epoch: 1251, rwd total: -101.54, avg loss: 0.38, rand prob: 0.1309722545272327

Epoch: 1252, rwd total: -101.18, avg loss: 0.41, rand prob: 0.199213883685264

Epoch: 1253, rwd total: -103.38, avg loss: 0.46, rand prob: 0.08695102652289996

Epoch: 1254, rwd total: -103.9, avg loss: 0.41, rand prob: 0.08195060825200931

Epoch: 1255, rwd total: -129.05, avg loss: 0.28, rand prob: 0.06985748134091717

Epoch: 1256, rwd total: -108.27, avg loss: 0.39, rand prob: 0.17331269364249116

Epoch: 1257, rwd total: -107.8, avg loss: 0.4, rand prob: 0.13771177252307418

Epoch: 1258, rwd total: -113.59, avg loss: 0.49, rand prob: 0.11872722830048218

Epoch: 1259, rwd total: -104.68, avg loss: 0.41, rand prob: 0.142766884208879

Epoch: 1

Epoch: 1351, rwd total: -104.2, avg loss: 0.4, rand prob: 0.10177202913868515

Epoch: 1352, rwd total: -106.52, avg loss: 0.44, rand prob: 0.07455872016958463

Epoch: 1353, rwd total: -112.78, avg loss: 0.4, rand prob: 0.08295964094096145

Epoch: 1354, rwd total: -99.57, avg loss: 0.42, rand prob: 0.10585238165335142

Epoch: 1355, rwd total: -114.01, avg loss: 0.42, rand prob: 0.05153361367650452

Epoch: 1356, rwd total: -110.19, avg loss: 0.3, rand prob: 0.08666771057173028

Epoch: 1357, rwd total: -119.96, avg loss: 0.29, rand prob: 0.06541691570154479

Epoch: 1358, rwd total: -101.37, avg loss: 0.34, rand prob: 0.13458921886259045

Epoch: 1359, rwd total: -123.69, avg loss: 0.37, rand prob: 0.12275780150434687

Epoch: 1360, rwd total: -97.73, avg loss: 0.26, rand prob: 0.0

Epoch: 1361, rwd total: -103.89, avg loss: 0.46, rand prob: 0.12433008514422689

Epoch: 1362, rwd total: -105.15, avg loss: 0.42, rand prob: 0.07871036035749632

Epoch: 1363, rwd total: -98.55, avg loss: 0.37, ra

Epoch: 1454, rwd total: -101.3, avg loss: 0.4, rand prob: 0.1839495337113648

Epoch: 1455, rwd total: -118.1, avg loss: 0.28, rand prob: 0.11678119589975076

Epoch: 1456, rwd total: -127.19, avg loss: 0.39, rand prob: 0.1392238598411839

Epoch: 1457, rwd total: -106.11, avg loss: 0.46, rand prob: 0.08773782221628137

Epoch: 1458, rwd total: -107.41, avg loss: 0.45, rand prob: 0.05438681227427527

Epoch: 1459, rwd total: -110.85, avg loss: 0.49, rand prob: 0.13535872089814832

Epoch: 1460, rwd total: -99.51, avg loss: 0.41, rand prob: 0.0

Epoch: 1461, rwd total: -101.44, avg loss: 0.41, rand prob: 0.1701677525193182

Epoch: 1462, rwd total: -104.7, avg loss: 0.44, rand prob: 0.16156632316948427

Epoch: 1463, rwd total: -106.04, avg loss: 0.41, rand prob: 0.05802702915193052

Epoch: 1464, rwd total: -110.18, avg loss: 0.27, rand prob: 0.11446163509348373

Epoch: 1465, rwd total: -109.52, avg loss: 0.49, rand prob: 0.15956497107036643

Epoch: 1466, rwd total: -98.8, avg loss: 0.35, rand 

Epoch: 1557, rwd total: -96.85, avg loss: 0.33, rand prob: 0.1127558349438115

Epoch: 1558, rwd total: -120.79, avg loss: 0.4, rand prob: 0.14880808976647375

Epoch: 1559, rwd total: -104.36, avg loss: 0.41, rand prob: 0.09735291994662804

Epoch: 1560, rwd total: -24.75, avg loss: 0.42, rand prob: 0.0

Epoch: 1561, rwd total: -99.23, avg loss: 0.38, rand prob: 0.12724050962910188

Epoch: 1562, rwd total: -98.61, avg loss: 0.32, rand prob: 0.14144122285263466

Epoch: 1563, rwd total: -104.66, avg loss: 0.44, rand prob: 0.18066198983197723

Epoch: 1564, rwd total: -106.36, avg loss: 0.4, rand prob: 0.05610969736528254

Epoch: 1565, rwd total: -98.86, avg loss: 0.39, rand prob: 0.1688157276081212

Epoch: 1566, rwd total: -106.91, avg loss: 0.42, rand prob: 0.14620626222131372

Epoch: 1567, rwd total: -105.27, avg loss: 0.47, rand prob: 0.1745288197627906

Epoch: 1568, rwd total: -112.42, avg loss: 0.53, rand prob: 0.19930056188893686

Epoch: 1569, rwd total: -101.16, avg loss: 0.42, rand 

Epoch: 1660, rwd total: -99.53, avg loss: 0.28, rand prob: 0.0

Epoch: 1661, rwd total: -120.94, avg loss: 0.39, rand prob: 0.1157114281877444

Epoch: 1662, rwd total: -113.19, avg loss: 0.4, rand prob: 0.102916587335973

Epoch: 1663, rwd total: -120.44, avg loss: 0.43, rand prob: 0.173494036269811

Epoch: 1664, rwd total: -121.79, avg loss: 0.28, rand prob: 0.12260578682571137

Epoch: 1665, rwd total: -123.49, avg loss: 0.49, rand prob: 0.12863875963711036

Epoch: 1666, rwd total: -108.92, avg loss: 0.32, rand prob: 0.14058667493320223

Epoch: 1667, rwd total: -119.73, avg loss: 0.28, rand prob: 0.10868513138712105

Epoch: 1668, rwd total: -101.47, avg loss: 0.38, rand prob: 0.10320847844124312

Epoch: 1669, rwd total: -122.99, avg loss: 0.35, rand prob: 0.12160557249263983

Epoch: 1670, rwd total: -128.35, avg loss: 0.28, rand prob: 0.0

Epoch: 1671, rwd total: -124.07, avg loss: 0.31, rand prob: 0.14237351066382467

Epoch: 1672, rwd total: -123.31, avg loss: 0.33, rand prob: 0.16016

Epoch: 1763, rwd total: -138.72, avg loss: 0.35, rand prob: 0.13262148658243555

Epoch: 1764, rwd total: -97.89, avg loss: 0.39, rand prob: 0.1459940699337922

Epoch: 1765, rwd total: -105.53, avg loss: 0.31, rand prob: 0.05105922943631446

Epoch: 1766, rwd total: -112.65, avg loss: 0.42, rand prob: 0.1140897056082647

Epoch: 1767, rwd total: -101.24, avg loss: 0.31, rand prob: 0.05976142999332672

Epoch: 1768, rwd total: -99.82, avg loss: 0.38, rand prob: 0.07559115376265509

Epoch: 1769, rwd total: -103.4, avg loss: 0.43, rand prob: 0.10205835114621802

Epoch: 1770, rwd total: -97.92, avg loss: 0.27, rand prob: 0.0

Epoch: 1771, rwd total: -117.51, avg loss: 0.41, rand prob: 0.08165216753933763

Epoch: 1772, rwd total: -126.24, avg loss: 0.34, rand prob: 0.14148886852546272

Epoch: 1773, rwd total: -123.92, avg loss: 0.43, rand prob: 0.12054735952595352

Epoch: 1774, rwd total: -118.37, avg loss: 0.4, rand prob: 0.13017850728825597

Epoch: 1775, rwd total: -100.25, avg loss: 0.31, ra

Epoch: 1866, rwd total: -121.43, avg loss: 0.34, rand prob: 0.18941595267241423

Epoch: 1867, rwd total: -113.51, avg loss: 0.42, rand prob: 0.08942786904175204

Epoch: 1868, rwd total: -106.86, avg loss: 0.43, rand prob: 0.18155353957538717

Epoch: 1869, rwd total: -116.18, avg loss: 47698.9, rand prob: 0.0888081144959888

Epoch: 1870, rwd total: -98.3, avg loss: 0.29, rand prob: 0.0

Epoch: 1871, rwd total: -117.38, avg loss: 0.48, rand prob: 0.14380895473874514

Epoch: 1872, rwd total: -120.57, avg loss: 0.37, rand prob: 0.12881329931490149

Epoch: 1873, rwd total: -134.1, avg loss: 0.33, rand prob: 0.17594537130592874

Epoch: 1874, rwd total: -111.43, avg loss: 0.32, rand prob: 0.1424723323251296

Epoch: 1875, rwd total: -123.09, avg loss: 0.3, rand prob: 0.08611627617801973

Epoch: 1876, rwd total: -125.07, avg loss: 0.33, rand prob: 0.14996854407496035

Epoch: 1877, rwd total: -97.73, avg loss: 0.28, rand prob: 0.11416527850400257

Epoch: 1878, rwd total: -106.17, avg loss: 0.47,

Epoch: 1969, rwd total: -130.6, avg loss: 0.42, rand prob: 0.11282991825851046

Epoch: 1970, rwd total: -97.86, avg loss: 0.28, rand prob: 0.0

Epoch: 1971, rwd total: -107.54, avg loss: 0.43, rand prob: 0.09238158193647003

Epoch: 1972, rwd total: -108.41, avg loss: 0.32, rand prob: 0.050627551897770434

Epoch: 1973, rwd total: -113.34, avg loss: 0.43, rand prob: 0.07466187077493006

Epoch: 1974, rwd total: -22.73, avg loss: 0.88, rand prob: 0.12015699305133666

Epoch: 1975, rwd total: -107.29, avg loss: 0.44, rand prob: 0.1523001925298418

Epoch: 1976, rwd total: -25.39, avg loss: 0.82, rand prob: 0.054235902267412685

Epoch: 1977, rwd total: -96.63, avg loss: 0.33, rand prob: 0.06832798842883839

Epoch: 1978, rwd total: -99.47, avg loss: 0.35, rand prob: 0.15463583256416796

Epoch: 1979, rwd total: -108.58, avg loss: 0.39, rand prob: 0.11846445602347502

Epoch: 1980, rwd total: -107.14, avg loss: 0.26, rand prob: 0.0

Epoch: 1981, rwd total: -113.49, avg loss: 0.36, rand prob: 0.082

Epoch: 2072, rwd total: -130.75, avg loss: 0.42, rand prob: 0.1568091249954185

Epoch: 2073, rwd total: -109.26, avg loss: 0.41, rand prob: 0.11131625576867166

Epoch: 2074, rwd total: -125.65, avg loss: 0.49, rand prob: 0.10660708807911096

Epoch: 2075, rwd total: -106.6, avg loss: 0.37, rand prob: 0.1476694838903134

Epoch: 2076, rwd total: -92.98, avg loss: 0.29, rand prob: 0.08134253186438406

Epoch: 2077, rwd total: -16.42, avg loss: 0.82, rand prob: 0.15997183519773348

Epoch: 2078, rwd total: -110.38, avg loss: 0.36, rand prob: 0.10409543944282335

Epoch: 2079, rwd total: -129.69, avg loss: 0.44, rand prob: 0.06426487631161164

Epoch: 2080, rwd total: -127.18, avg loss: 0.28, rand prob: 0.0

Epoch: 2081, rwd total: -95.1, avg loss: 0.32, rand prob: 0.08568610880500355

Epoch: 2082, rwd total: -107.78, avg loss: 0.38, rand prob: 0.17720533806860644

Epoch: 2083, rwd total: -134.01, avg loss: 0.26, rand prob: 0.15814008401938395

Epoch: 2084, rwd total: -30.28, avg loss: 0.87, ran

Epoch: 2175, rwd total: -119.95, avg loss: 0.32, rand prob: 0.14172162628625581

Epoch: 2176, rwd total: -105.95, avg loss: 0.37, rand prob: 0.1823657922982993

Epoch: 2177, rwd total: -112.88, avg loss: 0.45, rand prob: 0.17486038684348826

Epoch: 2178, rwd total: -111.58, avg loss: 0.47, rand prob: 0.13784220833285293

Epoch: 2179, rwd total: -110.99, avg loss: 0.51, rand prob: 0.14533162648543338

Epoch: 2180, rwd total: -97.09, avg loss: 0.25, rand prob: 0.0

Epoch: 2181, rwd total: -40.05, avg loss: 0.66, rand prob: 0.13068162874112077

Epoch: 2182, rwd total: -115.79, avg loss: 0.34, rand prob: 0.10996782296415282

Epoch: 2183, rwd total: -116.07, avg loss: 0.4, rand prob: 0.19602277370734944

Epoch: 2184, rwd total: -118.35, avg loss: 0.4, rand prob: 0.07467363463342498

Epoch: 2185, rwd total: -119.18, avg loss: 0.36, rand prob: 0.0992726249095061

Epoch: 2186, rwd total: -114.06, avg loss: 0.37, rand prob: 0.10264371081463888

Epoch: 2187, rwd total: -113.14, avg loss: 0.47, r

Epoch: 2278, rwd total: -114.6, avg loss: 0.49, rand prob: 0.11971587759596368

Epoch: 2279, rwd total: -105.25, avg loss: 0.41, rand prob: 0.15095646472578417

Epoch: 2280, rwd total: -129.4, avg loss: 0.24, rand prob: 0.0

Epoch: 2281, rwd total: -106.62, avg loss: 0.42, rand prob: 0.16794889169051824

Epoch: 2282, rwd total: -113.39, avg loss: 0.46, rand prob: 0.19904152706916212

Epoch: 2283, rwd total: -121.26, avg loss: 0.54, rand prob: 0.0748577234045765

Epoch: 2284, rwd total: -106.64, avg loss: 0.47, rand prob: 0.1293369723919816

Epoch: 2285, rwd total: -118.87, avg loss: 0.51, rand prob: 0.0962496672935295

Epoch: 2286, rwd total: -108.56, avg loss: 0.41, rand prob: 0.14379829640898056

Epoch: 2287, rwd total: -102.49, avg loss: 0.35, rand prob: 0.0657067700412109

Epoch: 2288, rwd total: -101.86, avg loss: 0.41, rand prob: 0.05984943723764684

Epoch: 2289, rwd total: -101.86, avg loss: 0.4, rand prob: 0.13783092526942425

Epoch: 2290, rwd total: -100.78, avg loss: 0.39, ra

Epoch: 2381, rwd total: -104.09, avg loss: 0.32, rand prob: 0.12885919854051536

Epoch: 2382, rwd total: -130.56, avg loss: 0.35, rand prob: 0.05684108193014739

Epoch: 2383, rwd total: -108.95, avg loss: 0.31, rand prob: 0.08680966108009588

Epoch: 2384, rwd total: -134.72, avg loss: 0.28, rand prob: 0.10343619344703256

Epoch: 2385, rwd total: -109.23, avg loss: 0.41, rand prob: 0.08440077344587288

Epoch: 2386, rwd total: -121.28, avg loss: 0.52, rand prob: 0.10834542999103788

Epoch: 2387, rwd total: -132.85, avg loss: 0.33, rand prob: 0.13922485827036957

Epoch: 2388, rwd total: -104.99, avg loss: 0.35, rand prob: 0.10780956258586032

Epoch: 2389, rwd total: -112.32, avg loss: 0.41, rand prob: 0.16837723767243618

Epoch: 2390, rwd total: -126.67, avg loss: 0.27, rand prob: 0.0

Epoch: 2391, rwd total: -110.12, avg loss: 0.35, rand prob: 0.13376356687203628

Epoch: 2392, rwd total: -120.94, avg loss: 0.29, rand prob: 0.13791247822540753

Epoch: 2393, rwd total: -108.28, avg loss: 0

Epoch: 2484, rwd total: -101.54, avg loss: 0.39, rand prob: 0.1964554464976267

Epoch: 2485, rwd total: -103.95, avg loss: 0.43, rand prob: 0.08439511510295224

Epoch: 2486, rwd total: -104.29, avg loss: 0.4, rand prob: 0.18437623989663038

Epoch: 2487, rwd total: -104.36, avg loss: 0.35, rand prob: 0.09861998569446767

Epoch: 2488, rwd total: -141.19, avg loss: 0.33, rand prob: 0.16615368614508796

Epoch: 2489, rwd total: -111.59, avg loss: 0.39, rand prob: 0.19652008868782955

Epoch: 2490, rwd total: -98.61, avg loss: 0.37, rand prob: 0.0

Epoch: 2491, rwd total: -100.66, avg loss: 0.35, rand prob: 0.17097588592123786

Epoch: 2492, rwd total: -103.58, avg loss: 0.37, rand prob: 0.1389815341958946

Epoch: 2493, rwd total: -96.09, avg loss: 0.35, rand prob: 0.06710887130429019

Epoch: 2494, rwd total: -105.99, avg loss: 0.39, rand prob: 0.08400970439063093

Epoch: 2495, rwd total: -103.18, avg loss: 0.41, rand prob: 0.1518929952007048

Epoch: 2496, rwd total: -12.83, avg loss: 1.1, ran

Epoch: 2587, rwd total: -113.33, avg loss: 0.48, rand prob: 0.09782713628201581

Epoch: 2588, rwd total: -118.22, avg loss: 0.32, rand prob: 0.12904533120651848

Epoch: 2589, rwd total: -101.36, avg loss: 0.3, rand prob: 0.12632771515457594

Epoch: 2590, rwd total: -107.55, avg loss: 0.26, rand prob: 0.0

Epoch: 2591, rwd total: -116.92, avg loss: 0.31, rand prob: 0.12076862643087527

Epoch: 2592, rwd total: -100.04, avg loss: 0.39, rand prob: 0.05087502239591123

Epoch: 2593, rwd total: -113.4, avg loss: 0.3, rand prob: 0.08236003369454908

Epoch: 2594, rwd total: -106.78, avg loss: 0.43, rand prob: 0.1502444008088056

Epoch: 2595, rwd total: -108.1, avg loss: 0.45, rand prob: 0.07638574492772954

Epoch: 2596, rwd total: -134.84, avg loss: 0.46, rand prob: 0.14694670864172832

Epoch: 2597, rwd total: -135.86, avg loss: 0.27, rand prob: 0.09940709633220873

Epoch: 2598, rwd total: -117.87, avg loss: 0.43, rand prob: 0.15435508325881178

Epoch: 2599, rwd total: -116.62, avg loss: 0.44, 

Epoch: 2690, rwd total: -98.58, avg loss: 0.37, rand prob: 0.0

Epoch: 2691, rwd total: -104.49, avg loss: 0.38, rand prob: 0.15058878098364886

Epoch: 2692, rwd total: -102.16, avg loss: 0.38, rand prob: 0.07974562389125164

Epoch: 2693, rwd total: -16.33, avg loss: 0.9, rand prob: 0.15146215833771576

Epoch: 2694, rwd total: -108.37, avg loss: 0.48, rand prob: 0.1648337888339163

Epoch: 2695, rwd total: -103.9, avg loss: 0.42, rand prob: 0.1295852920960809

Epoch: 2696, rwd total: -106.71, avg loss: 0.45, rand prob: 0.1435484250237019

Epoch: 2697, rwd total: -100.72, avg loss: 0.39, rand prob: 0.10605963665658132

Epoch: 2698, rwd total: -109.84, avg loss: 0.29, rand prob: 0.11593442220150386

Epoch: 2699, rwd total: -109.38, avg loss: 0.45, rand prob: 0.15665956646259707

Saving model at epoch 2700: ./models/bipedal
Epoch: 2700, rwd total: -98.65, avg loss: 0.37, rand prob: 0.0

Epoch: 2701, rwd total: -105.97, avg loss: 0.37, rand prob: 0.07679795861383576

Epoch: 2702, rwd total:

Epoch: 2793, rwd total: -118.88, avg loss: 0.34, rand prob: 0.17013073573576293

Epoch: 2794, rwd total: -139.53, avg loss: 0.27, rand prob: 0.07845363928535194

Epoch: 2795, rwd total: -96.3, avg loss: 0.34, rand prob: 0.17934454106898412

Epoch: 2796, rwd total: -109.12, avg loss: 0.36, rand prob: 0.07267154506579523

Epoch: 2797, rwd total: -108.18, avg loss: 0.27, rand prob: 0.06967658809197017

Epoch: 2798, rwd total: -21.22, avg loss: 0.81, rand prob: 0.11101945263625923

Epoch: 2799, rwd total: -101.61, avg loss: 0.45, rand prob: 0.1687717603768435

Saving model at epoch 2800: ./models/bipedal
Epoch: 2800, rwd total: -109.14, avg loss: 0.22, rand prob: 0.0

Epoch: 2801, rwd total: -108.32, avg loss: 0.38, rand prob: 0.16450295933133435

Epoch: 2802, rwd total: -105.75, avg loss: 0.42, rand prob: 0.05974499495436046

Epoch: 2803, rwd total: -99.68, avg loss: 0.37, rand prob: 0.09485291815985421

Epoch: 2804, rwd total: -120.05, avg loss: 0.49, rand prob: 0.12409517297747749

Epoc

Epoch: 2896, rwd total: -120.37, avg loss: 0.26, rand prob: 0.11069264700223597

Epoch: 2897, rwd total: -101.82, avg loss: 0.37, rand prob: 0.12070156942775197

Epoch: 2898, rwd total: -113.86, avg loss: 0.46, rand prob: 0.0769605725078294

Epoch: 2899, rwd total: -109.83, avg loss: 0.53, rand prob: 0.1393773576188955

Saving model at epoch 2900: ./models/bipedal
Epoch: 2900, rwd total: -98.83, avg loss: 0.37, rand prob: 0.0

Epoch: 2901, rwd total: -96.32, avg loss: 0.31, rand prob: 0.06007286546460479

Epoch: 2902, rwd total: -101.17, avg loss: 0.39, rand prob: 0.09678810169925557

Epoch: 2903, rwd total: -95.63, avg loss: 0.35, rand prob: 0.1503382120245924

Epoch: 2904, rwd total: -102.75, avg loss: 0.37, rand prob: 0.19100323197287955

Epoch: 2905, rwd total: -109.31, avg loss: 0.38, rand prob: 0.11810329680912131

Epoch: 2906, rwd total: -124.42, avg loss: 0.32, rand prob: 0.18366198241579768

Epoch: 2907, rwd total: -115.74, avg loss: 0.37, rand prob: 0.19000588846782696

Epoch

Epoch: 2999, rwd total: -105.24, avg loss: 0.33, rand prob: 0.10452577862809839

Saving model at epoch 3000: ./models/bipedal
Epoch: 3000, rwd total: -127.53, avg loss: 0.26, rand prob: 0.0

Epoch: 3001, rwd total: -105.48, avg loss: 0.41, rand prob: 0.16653765868370082

Epoch: 3002, rwd total: -98.72, avg loss: 0.37, rand prob: 0.18276869477701613

Epoch: 3003, rwd total: -107.42, avg loss: 0.39, rand prob: 0.08350997217236965

Epoch: 3004, rwd total: -125.06, avg loss: 0.39, rand prob: 0.10613589126737792

Epoch: 3005, rwd total: -107.68, avg loss: 0.5, rand prob: 0.058828825905456926

Epoch: 3006, rwd total: -113.08, avg loss: 0.5, rand prob: 0.12477008652940416

Epoch: 3007, rwd total: -100.26, avg loss: 0.29, rand prob: 0.15999217679594324

Epoch: 3008, rwd total: -110.94, avg loss: 0.39, rand prob: 0.15310739298234438

Epoch: 3009, rwd total: -103.27, avg loss: 0.47, rand prob: 0.08708895477562825

Epoch: 3010, rwd total: -100.65, avg loss: 0.39, rand prob: 0.0

Epoch: 3011, rwd 

Epoch: 3102, rwd total: -106.5, avg loss: 0.29, rand prob: 0.19848815253227098

Epoch: 3103, rwd total: -102.25, avg loss: 0.37, rand prob: 0.10670019387146103

Epoch: 3104, rwd total: -118.32, avg loss: 0.51, rand prob: 0.11537881256488092

Epoch: 3105, rwd total: -99.48, avg loss: 0.38, rand prob: 0.13252028348646744

Epoch: 3106, rwd total: -105.99, avg loss: 0.44, rand prob: 0.17586556835028594

Epoch: 3107, rwd total: -102.47, avg loss: 0.42, rand prob: 0.15924333129932955

Epoch: 3108, rwd total: -106.24, avg loss: 0.38, rand prob: 0.1930869567827302

Epoch: 3109, rwd total: -95.49, avg loss: 0.35, rand prob: 0.15324137751686945

Epoch: 3110, rwd total: -100.86, avg loss: 0.4, rand prob: 0.0

Epoch: 3111, rwd total: -100.17, avg loss: 0.35, rand prob: 0.09662880250118591

Epoch: 3112, rwd total: -99.56, avg loss: 0.41, rand prob: 0.1352582699531673

Epoch: 3113, rwd total: -112.78, avg loss: 0.45, rand prob: 0.16241541385839853

Epoch: 3114, rwd total: -119.52, avg loss: 0.36, ra

Epoch: 3205, rwd total: -108.35, avg loss: 0.44, rand prob: 0.1088984462129394

Epoch: 3206, rwd total: -103.24, avg loss: 0.4, rand prob: 0.15880117259103593

Epoch: 3207, rwd total: -100.13, avg loss: 0.38, rand prob: 0.09675609685328956

Epoch: 3208, rwd total: -123.69, avg loss: 0.36, rand prob: 0.13899128529969035

Epoch: 3209, rwd total: -101.67, avg loss: 0.39, rand prob: 0.07137564682165488

Epoch: 3210, rwd total: -98.56, avg loss: 0.37, rand prob: 0.0

Epoch: 3211, rwd total: -112.55, avg loss: 0.53, rand prob: 0.12445470370097826

Epoch: 3212, rwd total: -97.68, avg loss: 0.39, rand prob: 0.13362249974248938

Epoch: 3213, rwd total: -97.34, avg loss: 0.33, rand prob: 0.15553357065317397

Epoch: 3214, rwd total: -101.79, avg loss: 0.42, rand prob: 0.05272687758305082

Epoch: 3215, rwd total: -135.5, avg loss: 0.3, rand prob: 0.12298985502919747

Epoch: 3216, rwd total: -131.21, avg loss: 0.33, rand prob: 0.14565508269244282

Epoch: 3217, rwd total: -107.29, avg loss: 0.44, ra

Epoch: 3308, rwd total: -121.84, avg loss: 0.31, rand prob: 0.06069518878903488

Epoch: 3309, rwd total: -113.39, avg loss: 0.41, rand prob: 0.05612627965057916

Epoch: 3310, rwd total: -127.25, avg loss: 0.27, rand prob: 0.0

Epoch: 3311, rwd total: -111.21, avg loss: 0.37, rand prob: 0.08334873517675025

Epoch: 3312, rwd total: -127.02, avg loss: 0.45, rand prob: 0.06299636770884934

Epoch: 3313, rwd total: -132.73, avg loss: 0.34, rand prob: 0.09841511873115397

Epoch: 3314, rwd total: -106.4, avg loss: 0.33, rand prob: 0.14027707377971393

Epoch: 3315, rwd total: -101.76, avg loss: 0.27, rand prob: 0.06530519146501243

Epoch: 3316, rwd total: -122.28, avg loss: 0.46, rand prob: 0.13780299856973394

Epoch: 3317, rwd total: -123.41, avg loss: 0.39, rand prob: 0.19295760782586852

Epoch: 3318, rwd total: -126.63, avg loss: 0.34, rand prob: 0.1570192186566386

Epoch: 3319, rwd total: -103.01, avg loss: 0.3, rand prob: 0.18488644239201962

Epoch: 3320, rwd total: -127.73, avg loss: 0.26

Epoch: 3411, rwd total: -125.01, avg loss: 0.3, rand prob: 0.15131642524867375

Epoch: 3412, rwd total: -111.26, avg loss: 0.26, rand prob: 0.054868836631367636

Epoch: 3413, rwd total: -126.58, avg loss: 0.28, rand prob: 0.13347625553804737

Epoch: 3414, rwd total: -134.86, avg loss: 0.39, rand prob: 0.06833359521354385

Epoch: 3415, rwd total: -123.42, avg loss: 0.47, rand prob: 0.07124174311060355

Epoch: 3416, rwd total: -121.0, avg loss: 0.36, rand prob: 0.16761435075769554

Epoch: 3417, rwd total: -127.05, avg loss: 0.3, rand prob: 0.1434142661905124

Epoch: 3418, rwd total: -117.09, avg loss: 0.29, rand prob: 0.1924622482272325

Epoch: 3419, rwd total: -111.6, avg loss: 0.36, rand prob: 0.09277198565685782

Epoch: 3420, rwd total: -25.36, avg loss: 0.36, rand prob: 0.0

Epoch: 3421, rwd total: -117.78, avg loss: 0.34, rand prob: 0.19745926167872274

Epoch: 3422, rwd total: -95.46, avg loss: 0.31, rand prob: 0.15273086054243548

Epoch: 3423, rwd total: -101.67, avg loss: 0.33, ra

Epoch: 3514, rwd total: -110.52, avg loss: 0.3, rand prob: 0.08198528159139998

Epoch: 3515, rwd total: -108.38, avg loss: 0.42, rand prob: 0.12256053638290719

Epoch: 3516, rwd total: -120.51, avg loss: 0.54, rand prob: 0.06019974471870086

Epoch: 3517, rwd total: -140.06, avg loss: 0.36, rand prob: 0.15731098697413506

Epoch: 3518, rwd total: -124.55, avg loss: 0.41, rand prob: 0.06117500010703324

Epoch: 3519, rwd total: -102.63, avg loss: 0.5, rand prob: 0.14574642121653775

Epoch: 3520, rwd total: -128.84, avg loss: 0.27, rand prob: 0.0

Epoch: 3521, rwd total: -100.68, avg loss: 0.41, rand prob: 0.06172196638583849

Epoch: 3522, rwd total: -106.85, avg loss: 0.5, rand prob: 0.117003629182458

Epoch: 3523, rwd total: -105.39, avg loss: 0.43, rand prob: 0.12005087577924813

Epoch: 3524, rwd total: -103.23, avg loss: 0.43, rand prob: 0.1114629913574342

Epoch: 3525, rwd total: -106.43, avg loss: 0.47, rand prob: 0.19640617079944633

Epoch: 3526, rwd total: -103.27, avg loss: 0.44, r

Epoch: 3617, rwd total: -131.67, avg loss: 0.38, rand prob: 0.12561283580185595

Epoch: 3618, rwd total: -129.13, avg loss: 0.25, rand prob: 0.09931401518358123

Epoch: 3619, rwd total: -125.51, avg loss: 0.34, rand prob: 0.15806795467370313

Epoch: 3620, rwd total: -131.45, avg loss: 0.23, rand prob: 0.0

Epoch: 3621, rwd total: -113.36, avg loss: 0.39, rand prob: 0.1819154747415519

Epoch: 3622, rwd total: -120.04, avg loss: 0.32, rand prob: 0.15974670248979395

Epoch: 3623, rwd total: -113.34, avg loss: 0.43, rand prob: 0.1666395804837271

Epoch: 3624, rwd total: -123.01, avg loss: 0.33, rand prob: 0.08618383861221518

Epoch: 3625, rwd total: -123.94, avg loss: 0.44, rand prob: 0.11607979836765663

Epoch: 3626, rwd total: -112.19, avg loss: 0.41, rand prob: 0.16109154855496932

Epoch: 3627, rwd total: -104.34, avg loss: 0.28, rand prob: 0.0689584853970127

Epoch: 3628, rwd total: -109.52, avg loss: 0.31, rand prob: 0.09848866212852342

Epoch: 3629, rwd total: -123.74, avg loss: 0.42

Epoch: 3720, rwd total: -127.88, avg loss: 0.26, rand prob: 0.0

Epoch: 3721, rwd total: -106.44, avg loss: 0.39, rand prob: 0.05396316046795814

Epoch: 3722, rwd total: -101.01, avg loss: 0.32, rand prob: 0.09071025030713749

Epoch: 3723, rwd total: -99.3, avg loss: 0.35, rand prob: 0.1527768893631649

Epoch: 3724, rwd total: -105.75, avg loss: 0.49, rand prob: 0.12918313747093307

Epoch: 3725, rwd total: -128.83, avg loss: 0.26, rand prob: 0.09266309347821863

Epoch: 3726, rwd total: -132.15, avg loss: 0.27, rand prob: 0.115752497307194

Epoch: 3727, rwd total: -125.83, avg loss: 0.28, rand prob: 0.074673307123692

Epoch: 3728, rwd total: -113.46, avg loss: 0.32, rand prob: 0.19838689208730115

Epoch: 3729, rwd total: -107.21, avg loss: 0.47, rand prob: 0.07637326842535097

Epoch: 3730, rwd total: -98.55, avg loss: 0.37, rand prob: 0.0

Epoch: 3731, rwd total: -127.49, avg loss: 0.38, rand prob: 0.05717764254795327

Epoch: 3732, rwd total: -114.63, avg loss: 0.39, rand prob: 0.194565

Epoch: 3824, rwd total: -109.13, avg loss: 0.3, rand prob: 0.11347939965080922

Epoch: 3825, rwd total: -118.54, avg loss: 0.44, rand prob: 0.08955857293593683

Epoch: 3826, rwd total: -139.73, avg loss: 0.35, rand prob: 0.08607470472906048

Epoch: 3827, rwd total: -106.59, avg loss: 0.38, rand prob: 0.06909313363064162

Epoch: 3828, rwd total: -121.29, avg loss: 0.36, rand prob: 0.17001747186916788

Epoch: 3829, rwd total: -107.73, avg loss: 0.38, rand prob: 0.17035787849246234

Epoch: 3830, rwd total: -109.43, avg loss: 0.22, rand prob: 0.0

Epoch: 3831, rwd total: -118.04, avg loss: 0.26, rand prob: 0.15985730323194208

Epoch: 3832, rwd total: -113.46, avg loss: 0.3, rand prob: 0.05074384463708631

Epoch: 3833, rwd total: -110.27, avg loss: 0.52, rand prob: 0.14252081169709158

Epoch: 3834, rwd total: -116.66, avg loss: 0.43, rand prob: 0.09690664038362819

Epoch: 3835, rwd total: -103.67, avg loss: 0.26, rand prob: 0.09192817692776525

Epoch: 3836, rwd total: -114.45, avg loss: 0.4

Epoch: 3927, rwd total: -134.7, avg loss: 0.23, rand prob: 0.13882266468885618

Epoch: 3928, rwd total: -103.34, avg loss: 0.38, rand prob: 0.16524229460954412

Epoch: 3929, rwd total: -34.35, avg loss: 0.61, rand prob: 0.1160610552393106

Epoch: 3930, rwd total: -98.41, avg loss: 0.37, rand prob: 0.0

Epoch: 3931, rwd total: -118.98, avg loss: 0.41, rand prob: 0.166800773683085

Epoch: 3932, rwd total: -103.72, avg loss: 0.39, rand prob: 0.07127823511748305

Epoch: 3933, rwd total: -119.21, avg loss: 0.33, rand prob: 0.16841879868453355

Epoch: 3934, rwd total: -121.69, avg loss: 0.41, rand prob: 0.18048643409639542

Epoch: 3935, rwd total: -120.83, avg loss: 0.28, rand prob: 0.18208740465323597

Epoch: 3936, rwd total: -113.58, avg loss: 0.48, rand prob: 0.12050974031717965

Epoch: 3937, rwd total: -103.65, avg loss: 0.44, rand prob: 0.167435498985752

Epoch: 3938, rwd total: -133.74, avg loss: 0.37, rand prob: 0.15356561427774268

Epoch: 3939, rwd total: -109.43, avg loss: 0.4, rand

Epoch: 4030, rwd total: -129.02, avg loss: 0.26, rand prob: 0.0

Epoch: 4031, rwd total: -111.31, avg loss: 0.42, rand prob: 0.18504535291641216

Epoch: 4032, rwd total: -108.49, avg loss: 0.41, rand prob: 0.15873427824573233

Epoch: 4033, rwd total: -115.84, avg loss: 0.39, rand prob: 0.06557224988221835

Epoch: 4034, rwd total: -140.9, avg loss: 0.29, rand prob: 0.07858463147168028

Epoch: 4035, rwd total: -126.25, avg loss: 0.25, rand prob: 0.0961499352376359

Epoch: 4036, rwd total: -117.77, avg loss: 0.32, rand prob: 0.08999636038011517

Epoch: 4037, rwd total: -107.84, avg loss: 0.37, rand prob: 0.1306233848541225

Epoch: 4038, rwd total: -115.77, avg loss: 0.44, rand prob: 0.13967153016431563

Epoch: 4039, rwd total: -115.36, avg loss: 0.42, rand prob: 0.16153492299184832

Epoch: 4040, rwd total: -99.08, avg loss: 0.27, rand prob: 0.0

Epoch: 4041, rwd total: -105.43, avg loss: 0.37, rand prob: 0.050745062594291804

Epoch: 4042, rwd total: -130.85, avg loss: 0.35, rand prob: 0.0

Epoch: 4133, rwd total: -113.23, avg loss: 0.48, rand prob: 0.18127157282612066

Epoch: 4134, rwd total: -106.89, avg loss: 0.4, rand prob: 0.12547131659338465

Epoch: 4135, rwd total: -112.34, avg loss: 0.54, rand prob: 0.1376812896720881

Epoch: 4136, rwd total: -103.22, avg loss: 0.33, rand prob: 0.1385139203710293

Epoch: 4137, rwd total: -145.85, avg loss: 0.28, rand prob: 0.1792208969356049

Epoch: 4138, rwd total: -125.28, avg loss: 0.4, rand prob: 0.08160194764918234

Epoch: 4139, rwd total: -106.7, avg loss: 0.4, rand prob: 0.10679424412014676

Epoch: 4140, rwd total: -99.13, avg loss: 0.38, rand prob: 0.0

Epoch: 4141, rwd total: -106.62, avg loss: 0.45, rand prob: 0.1469894081374282

Epoch: 4142, rwd total: -128.47, avg loss: 0.34, rand prob: 0.18105915856384625

Epoch: 4143, rwd total: -110.94, avg loss: 0.27, rand prob: 0.16704576575892527

Epoch: 4144, rwd total: -101.15, avg loss: 0.3, rand prob: 0.18206785791998692

Epoch: 4145, rwd total: -136.16, avg loss: 0.25, rand 

Epoch: 4236, rwd total: -100.83, avg loss: 0.29, rand prob: 0.12057811995765876

Epoch: 4237, rwd total: -128.1, avg loss: 0.27, rand prob: 0.1382561878954488

Epoch: 4238, rwd total: -122.64, avg loss: 0.37, rand prob: 0.10601952562721519

Epoch: 4239, rwd total: -96.72, avg loss: 0.28, rand prob: 0.15207504486385615

Epoch: 4240, rwd total: -105.12, avg loss: 0.37, rand prob: 0.0

Epoch: 4241, rwd total: -105.23, avg loss: 0.41, rand prob: 0.07453674975341684

Epoch: 4242, rwd total: -107.1, avg loss: 0.41, rand prob: 0.17943741340862268

Epoch: 4243, rwd total: -110.76, avg loss: 0.48, rand prob: 0.07473931352435914

Epoch: 4244, rwd total: -103.76, avg loss: 0.37, rand prob: 0.1244755540059555

Epoch: 4245, rwd total: -109.04, avg loss: 0.5, rand prob: 0.1960036438759366

Epoch: 4246, rwd total: -102.02, avg loss: 0.41, rand prob: 0.09220784213564026

Epoch: 4247, rwd total: -114.74, avg loss: 0.52, rand prob: 0.12186627658374284

Epoch: 4248, rwd total: -109.09, avg loss: 0.44, ra

Epoch: 4339, rwd total: -102.68, avg loss: 0.44, rand prob: 0.10120592903904954

Epoch: 4340, rwd total: -98.52, avg loss: 0.37, rand prob: 0.0

Epoch: 4341, rwd total: -116.54, avg loss: 0.36, rand prob: 0.17245101591124612

Epoch: 4342, rwd total: -99.07, avg loss: 0.39, rand prob: 0.19613287500794135

Epoch: 4343, rwd total: -106.56, avg loss: 0.39, rand prob: 0.18116732255073886

Epoch: 4344, rwd total: -111.27, avg loss: 0.34, rand prob: 0.162622710542713

Epoch: 4345, rwd total: -107.86, avg loss: 0.55, rand prob: 0.12660088886619017

Epoch: 4346, rwd total: -118.4, avg loss: 0.48, rand prob: 0.07096393188061378

Epoch: 4347, rwd total: -106.33, avg loss: 0.34, rand prob: 0.08056313211578561

Epoch: 4348, rwd total: -129.99, avg loss: 0.29, rand prob: 0.17952911361110518

Epoch: 4349, rwd total: -94.93, avg loss: 0.32, rand prob: 0.06663815388046566

Saving model at epoch 4350: ./models/bipedal
Epoch: 4350, rwd total: -97.72, avg loss: 0.28, rand prob: 0.0

Epoch: 4351, rwd total

Epoch: 4442, rwd total: -126.57, avg loss: 0.3, rand prob: 0.13093599040711082

Epoch: 4443, rwd total: -127.37, avg loss: 0.48, rand prob: 0.16553726773516006

Epoch: 4444, rwd total: -115.68, avg loss: 0.44, rand prob: 0.1848239022179573

Epoch: 4445, rwd total: -102.66, avg loss: 0.4, rand prob: 0.1720370391027008

Epoch: 4446, rwd total: -98.7, avg loss: 0.34, rand prob: 0.0627349304514394

Epoch: 4447, rwd total: -120.01, avg loss: 0.34, rand prob: 0.11364210909912198

Epoch: 4448, rwd total: -101.39, avg loss: 0.4, rand prob: 0.09224912179646277

Epoch: 4449, rwd total: -101.53, avg loss: 0.4, rand prob: 0.19992810072554829

Saving model at epoch 4450: ./models/bipedal
Epoch: 4450, rwd total: -99.59, avg loss: 0.38, rand prob: 0.0

Epoch: 4451, rwd total: -108.11, avg loss: 0.33, rand prob: 0.10577672014823485

Epoch: 4452, rwd total: -119.08, avg loss: 0.36, rand prob: 0.12794947178136692

Epoch: 4453, rwd total: -105.42, avg loss: 0.42, rand prob: 0.16448283523291457

Epoch: 44

Epoch: 4545, rwd total: -110.93, avg loss: 0.39, rand prob: 0.064433004991388

Epoch: 4546, rwd total: -111.71, avg loss: 0.31, rand prob: 0.17390158733151656

Epoch: 4547, rwd total: -99.28, avg loss: 0.42, rand prob: 0.11378830689351499

Epoch: 4548, rwd total: -101.37, avg loss: 0.38, rand prob: 0.15536044372356267

Epoch: 4549, rwd total: -118.94, avg loss: 0.28, rand prob: 0.08910820759122012

Saving model at epoch 4550: ./models/bipedal
Epoch: 4550, rwd total: -100.03, avg loss: 0.38, rand prob: 0.0

Epoch: 4551, rwd total: -97.69, avg loss: 0.35, rand prob: 0.1821895811030006

Epoch: 4552, rwd total: -117.08, avg loss: 0.41, rand prob: 0.05358756134735226

Epoch: 4553, rwd total: -10.99, avg loss: 1.38, rand prob: 0.08275210541480763

Epoch: 4554, rwd total: -113.89, avg loss: 0.49, rand prob: 0.16529423828491244

Epoch: 4555, rwd total: -122.07, avg loss: 0.35, rand prob: 0.15517970410071602

Epoch: 4556, rwd total: -119.89, avg loss: 0.53, rand prob: 0.06660440103160506

Epoch

Epoch: 4648, rwd total: -100.08, avg loss: 0.41, rand prob: 0.13646041325778524

Epoch: 4649, rwd total: -115.71, avg loss: 0.49, rand prob: 0.1377421245888547

Saving model at epoch 4650: ./models/bipedal
Epoch: 4650, rwd total: -101.03, avg loss: 0.4, rand prob: 0.0

Epoch: 4651, rwd total: -112.93, avg loss: 0.53, rand prob: 0.07415393389219488

Epoch: 4652, rwd total: -106.32, avg loss: 0.46, rand prob: 0.1712456628533167

Epoch: 4653, rwd total: -98.15, avg loss: 0.35, rand prob: 0.15086187083996722

Epoch: 4654, rwd total: -107.16, avg loss: 0.39, rand prob: 0.1584423162085075

Epoch: 4655, rwd total: -99.32, avg loss: 0.32, rand prob: 0.16478914847208032

Epoch: 4656, rwd total: -106.01, avg loss: 0.4, rand prob: 0.10799691067782864

Epoch: 4657, rwd total: -114.64, avg loss: 0.43, rand prob: 0.09589609428117539

Epoch: 4658, rwd total: -104.94, avg loss: 0.37, rand prob: 0.14714716444924625

Epoch: 4659, rwd total: -102.46, avg loss: 0.43, rand prob: 0.08491212053398572

Epoch:

Epoch: 4750, rwd total: -100.82, avg loss: 0.4, rand prob: 0.0

Epoch: 4751, rwd total: -98.99, avg loss: 0.36, rand prob: 0.07654718355013225

Epoch: 4752, rwd total: -109.21, avg loss: 0.44, rand prob: 0.1557409189783503

Epoch: 4753, rwd total: -107.99, avg loss: 0.34, rand prob: 0.09754147844811242

Epoch: 4754, rwd total: -112.3, avg loss: 0.44, rand prob: 0.09669737542342675

Epoch: 4755, rwd total: -124.84, avg loss: 0.27, rand prob: 0.13148873409915052

Epoch: 4756, rwd total: -110.14, avg loss: 0.5, rand prob: 0.09600568127924083

Epoch: 4757, rwd total: -98.79, avg loss: 0.35, rand prob: 0.06069751976417302

Epoch: 4758, rwd total: -119.15, avg loss: 0.48, rand prob: 0.19803249638364107

Epoch: 4759, rwd total: -119.0, avg loss: 0.37, rand prob: 0.1617436727733088

Epoch: 4760, rwd total: -130.55, avg loss: 0.24, rand prob: 0.0

Epoch: 4761, rwd total: -112.85, avg loss: 0.32, rand prob: 0.07401033675258159

Epoch: 4762, rwd total: -127.29, avg loss: 0.4, rand prob: 0.0583711

Epoch: 4853, rwd total: -103.94, avg loss: 0.4, rand prob: 0.1704808831726362

Epoch: 4854, rwd total: -109.5, avg loss: 0.48, rand prob: 0.16551501348307962

Epoch: 4855, rwd total: -30.96, avg loss: 0.8, rand prob: 0.19628779114466233

Epoch: 4856, rwd total: -104.53, avg loss: 0.37, rand prob: 0.14857125293117723

Epoch: 4857, rwd total: -109.4, avg loss: 0.44, rand prob: 0.08287760581305825

Epoch: 4858, rwd total: -113.03, avg loss: 0.57, rand prob: 0.19832056827581634

Epoch: 4859, rwd total: -100.54, avg loss: 0.38, rand prob: 0.12260581926548529

Epoch: 4860, rwd total: -100.86, avg loss: 0.39, rand prob: 0.0

Epoch: 4861, rwd total: -97.58, avg loss: 0.36, rand prob: 0.11697234938358102

Epoch: 4862, rwd total: -106.04, avg loss: 0.38, rand prob: 0.10108536929266523

Epoch: 4863, rwd total: -125.1, avg loss: 0.31, rand prob: 0.0725915986687461

Epoch: 4864, rwd total: -127.14, avg loss: 0.3, rand prob: 0.06268677876470709

Epoch: 4865, rwd total: -19.78, avg loss: 0.84, rand p

Epoch: 4956, rwd total: -103.34, avg loss: 0.36, rand prob: 0.12403771497085608

Epoch: 4957, rwd total: -20.52, avg loss: 0.73, rand prob: 0.11008623006646667

Epoch: 4958, rwd total: -106.36, avg loss: 0.47, rand prob: 0.192694966871309

Epoch: 4959, rwd total: -107.5, avg loss: 0.41, rand prob: 0.0720005609504949

Epoch: 4960, rwd total: -99.44, avg loss: 0.3, rand prob: 0.0

Epoch: 4961, rwd total: -110.34, avg loss: 0.46, rand prob: 0.06869313162532044

Epoch: 4962, rwd total: -115.65, avg loss: 0.37, rand prob: 0.17548348137316833

Epoch: 4963, rwd total: -99.84, avg loss: 0.29, rand prob: 0.1798647440941536

Epoch: 4964, rwd total: -114.98, avg loss: 0.35, rand prob: 0.13953572260787522

Epoch: 4965, rwd total: -124.82, avg loss: 0.29, rand prob: 0.12881580179530894

Epoch: 4966, rwd total: -130.72, avg loss: 0.29, rand prob: 0.16691053476540751

Epoch: 4967, rwd total: -106.65, avg loss: 0.48, rand prob: 0.06038314915892683

Epoch: 4968, rwd total: -111.54, avg loss: 0.38, rand

In [7]:
print(num_frames)
print(len(all_observations))
print(len(all_actions))
print(len(all_rewards))
print(len(all_logits))
print(len(discounted_rewards))
print(len(all_loss_adders))


values = discounted_rewards

for this_value1, this_value2, this_value_3 in zip(all_logits, all_loss_adders, all_rewards):
    print("{}, {}, {}".format(this_value1, this_value2, this_value_3))


68
68
68
68
68
68
68
[ 0.33399928  0.49272266 -0.23483634  0.44599265], 0, -0.05175616260617972
[ 0.32071358  0.4921382  -0.21540529  0.4469517 ], 0, -0.04350652573506156
[ 0.37528372  0.49938813 -0.35571864  0.44664004], 0, -0.037606954952079864
[ 0.3450482   0.5041796  -0.38113162  0.45256233], 0, -0.044626839242875575
[ 0.30360532  0.50542474 -0.43276712  0.4560808 ], 0, -0.05070750096440315
[ 0.23425584  0.521073   -0.3965598   0.4615007 ], 0, -0.05592019775261718
[ 0.15533109  0.5333569  -0.38124308  0.47286797], 0, -0.07413928828636568
[ 0.03977159  0.5487703  -0.35491726  0.48616666], 0, -0.0896236710300072
[ 0.02831535  0.5501761  -0.36589625  0.48787957], 0, -0.08857485595345498
[ 0.04353684  0.54838294 -0.36761585  0.4861381 ], 0, -0.08969329470396042
[ 0.08899294  0.53946984 -0.37121043  0.4829104 ], 0, -0.08659402571121616
[ 0.09789293  0.5390068  -0.37961385  0.48225576], 0, -0.08282382069031517
[ 0.10794453  0.5381584  -0.3880082   0.48146835], 0, -0.0786551425655671
[ 0.

In [3]:

env = gym.make(environment_name)
observation = env.reset()

print("obs.shape: {}".format(observation.shape)) #obs.shape: (210, 160, 3)
print("env.action_space: {}".format(env.action_space)) #env.action_space: Discrete(9)


for _ in range(14):
    env.render()
    action = env.action_space.sample()
    print("action: {}".format(action))
    print(type(action))
    observation, reward_float, done_bool, info_dict = env.step(action) # take a random action
    
    print("observation: {}".format(observation))
    print("reward_float: {}".format(reward_float))
    print("done_bool: {}".format(done_bool))
    print("info_dict: {}".format(info_dict))

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
obs.shape: (24,)
env.action_space: Box(4,)
action: [0.09762701 0.43037874 0.20552675 0.08976637]
<class 'numpy.ndarray'>
observation: [ 0.00245932 -0.00691658  0.00590502  0.01961307 -0.29061297 -0.71013689
  1.4722293   0.99358988  1.          0.30079773 -0.01692069  0.1633203
  0.33325795  1.          0.45289648  0.45803979  0.4740701   0.50296849
  0.54874223  0.61897415  0.72858626  0.91021472  1.          1.        ]
reward_float: -0.01773284123155096
done_bool: False
info_dict: {}
action: [-0.1526904   0.29178822 -0.12482557  0.78354603]
<class 'numpy.ndarray'>
observation: [ 0.00255965  0.00256462  0.00806203  0.00679228 -0.01382717 -0.39574927
  1.08375649  0.          1.          0.26067981 -0.41715193  0.26995051
  0.24234966  1.          0.45600694  0.46118557  0