In [8]:
%pip install gymnasium
%pip install gymnasium[box2d]

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Collecting box2d-py==2.3.5
  Downloading box2d-py-2.3.5.tar.gz (374 kB)
[K     |████████████████████████████████| 374 kB 5.5 MB/s eta 0:00:01
[?25hCollecting swig==4.*
  Downloading swig-4.1.1-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 25.7 MB/s eta 0:00:01
[?25hCollecting pygame==2.1.3
  Downloading pygame-2.1.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.7 MB)
[K     |████████████████████████████████| 13.7 MB 33.0 MB/s eta 0:00:01
Building wheels for collected packages: box2d-py
  Building wheel for box2d-py (setup.py) ... [?25ldone
[?25h  Created wheel for box2d-py: filename=box2d_py-2.3.5-cp39-cp39-linux_x86_64.whl size=494480 sha256=6ffdd2ed61519ffd43b69a836c22283f07bdc

In [1]:
import gymnasium as gym
import numpy as np
import numba as nb
import tensorflow as tf
from matplotlib import pyplot as plt
import os
from collections import deque
import random
tf.compat.v1.enable_eager_execution()

2023-05-05 15:36:49.213460: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
def crop(img):
    """
    Return the window of ``img`` spanning rows 0-83 and columns 6-89.

    Only the first two axes are sliced; any trailing axes are kept whole.
    The result is 84x84 when ``img`` is at least 84 rows by 90 columns.
    """
    row_window = slice(None, 84)
    col_window = slice(6, 90)
    return img[row_window, col_window]

@nb.njit(fastmath=True)
def rgb_to_grey(img):
    """
    Collapse a 3-axis colour image into a single greyscale channel.

    Every output pixel is the weighted channel sum
    ``0.2989 * c0 + 0.5870 * c1 + 0.1140 * c2`` of the matching input pixel,
    written into a ``uint32`` array of shape ``(rows, cols)``.
    """
    height, width, _ = img.shape
    grey = np.empty((height, width), dtype=np.uint32)
    for r in range(height):
        for c in range(width):
            pixel = img[r, c]
            # Weighted sum of the three colour channels
            grey[r, c] = 0.2989 * pixel[0] + 0.5870 * pixel[1] + 0.1140 * pixel[2]

    return grey

In [3]:
def get_actor():
    """
    Build the convolutional actor network.

    The model maps an (84, 84, 1) input to 3 linear outputs through three
    Conv2D -> BatchNormalization -> Dropout stages and one 1024-unit dense
    stage.

    Only the output layer is zero-initialised, so a freshly built actor still
    outputs exactly zero for every input. The hidden layers use the Keras
    default initializer: if every kernel were zero, all hidden activations
    and the output kernel would be zero, every kernel gradient would vanish,
    and only the output bias could ever be trained.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    actor_model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(64, kernel_size=8, strides=4, activation="relu", input_shape=(84,84,1,)),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Conv2D(128, kernel_size=4, strides=2, activation="relu"),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Conv2D(128, kernel_size=3, strides=1, activation="relu"),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(1024, activation="relu"),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.5),
        # Zero kernel (and default zero bias) => initial output is all zeros.
        tf.keras.layers.Dense(3, kernel_initializer='zeros', activation="linear")
    ])
    return actor_model

def get_critic():
    """
    Build the critic network taking a state image and an action vector.

    The (84, 84, 1) state goes through three
    Conv2D -> BatchNormalization -> Dropout stages and is flattened; the
    3-element action goes through a 32-unit dense layer. Both are
    concatenated and reduced to a single linear output.

    Returns:
        An uncompiled ``tf.keras.Model`` with inputs ``[state, action]``.
    """
    layers = tf.keras.layers

    # State branch: (filters, kernel_size, strides) for each conv stage
    state_in = layers.Input(shape=(84,84,1))
    features = state_in
    for filters, kernel, stride in ((64, 8, 4), (128, 4, 2), (128, 3, 1)):
        features = layers.Conv2D(filters, kernel_size=kernel, strides=stride, activation="relu")(features)
        features = layers.BatchNormalization()(features)
        features = layers.Dropout(0.5)(features)
    features = layers.Flatten()(features)

    # Action branch
    action_in = layers.Input(shape=(3,))
    action_features = layers.Dense(32, activation="relu")(action_in)

    # Joint head over both branches
    merged = layers.Concatenate()([features, action_features])
    hidden = layers.Dense(1024, activation="relu")(merged)
    hidden = layers.BatchNormalization()(hidden)
    hidden = layers.Dropout(0.5)(hidden)
    value = layers.Dense(1, activation="linear")(hidden)

    return tf.keras.Model([state_in, action_in], value)

In [4]:
def get_actions(state_batch, gaus_mean, gaus_std):
    """
    Run the module-level ``residual_model`` on a batch of states.

    The model is called in training mode. ``gaus_mean`` and ``gaus_std`` are
    accepted but unused: no noise is added to the returned actions.
    """
    return residual_model(state_batch, training=True)

In [5]:
def get_action(state, residual_action, gaus_mean, gaus_std):
    """
    Combine the fixed policy with a residual action, add noise and clip.

    Args:
        state: single preprocessed state, passed to the module-level
            ``fixed_model`` as a batch of one.
        residual_action: residual action vector added to the fixed action.
        gaus_mean: mean of the Gaussian exploration noise.
        gaus_std: standard deviation of the Gaussian exploration noise.

    Returns:
        Array with the same shape and dtype as ``fixed + residual``. The
        first component is clipped to [-1, 1], all others to [0, 1].
    """
    # Get suggested action from the fixed actor
    fixed_action = fixed_model.predict(np.array([state,]), verbose=0)[0]
    action = fixed_action + residual_action

    # One independent noise draw per component, applied before clipping so
    # the returned action is guaranteed to stay inside the bounds.
    noise = np.random.normal(gaus_mean, gaus_std, size=action.shape)
    noisy_action = (action + noise).astype(action.dtype)

    # Lower bound is -1 for the first component and 0 for the rest
    lower = np.array([-1.0] + [0.0] * (noisy_action.shape[0] - 1), dtype=noisy_action.dtype)
    return np.clip(noisy_action, lower, 1)

In [6]:
class Buffer:
    """
    Experience replay buffer that also drives the actor/critic update.

    Transitions are kept as ``(state, action, reward, next_state)`` tuples in
    a bounded deque; once ``buffer_capacity`` tuples are stored, each new
    record evicts the oldest one.

    ``update`` relies on module-level names: ``get_actions``,
    ``critic_model``, ``residual_model``, ``critic_optimizer``,
    ``actor_optimizer``, ``gaus_mean``, ``gaus_std`` and ``γ``.
    """

    def __init__(self, buffer_capacity, batch_size):
        # Number of "experiences" to store at max
        self.buffer_capacity = buffer_capacity
        # Num of tuples to train on.
        self.batch_size = batch_size

        # Number of times record() has been called.
        self.buffer_counter = 0

        # Transitions are stored as whole tuples; maxlen drops the oldest.
        self.replay_buffer = deque(maxlen=buffer_capacity)

    def record(self, obs_tuple):
        """Store one (s, a, r, s') observation tuple and count the call."""
        self.replay_buffer.append(obs_tuple)
        # Callers gate training on this counter, so it must advance here.
        self.buffer_counter += 1

    def update(
        self, state_batch, action_batch, reward_batch, next_state_batch
    ):
        """
        Apply one gradient step to the critic and one to the residual actor.

        The critic is regressed onto ``reward + γ * Q(s', actor(s'))``; the
        actor is then updated to maximise the critic's value of its own
        actions. Runs eagerly (not wrapped in ``tf.function``).
        """
        # The TD target is a regression label: compute it outside the tape
        # and stop gradients so the critic is not also trained through it.
        target_actions = get_actions(next_state_batch, gaus_mean, gaus_std)
        y = tf.stop_gradient(
            reward_batch + γ * critic_model(
                [next_state_batch, target_actions], training=True
            )
        )

        with tf.GradientTape() as tape:
            critic_value = critic_model([state_batch, action_batch], training=True)
            critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))

        critic_grad = tape.gradient(critic_loss, critic_model.trainable_variables)
        critic_optimizer.apply_gradients(
            zip(critic_grad, critic_model.trainable_variables)
        )

        with tf.GradientTape() as tape:
            actions = get_actions(state_batch, gaus_mean, gaus_std)
            critic_value = critic_model([state_batch, actions], training=True)
            # Used `-value` as we want to maximize the value given
            # by the critic for our actions
            actor_loss = -tf.math.reduce_mean(critic_value)

        actor_grad = tape.gradient(actor_loss, residual_model.trainable_variables)
        actor_optimizer.apply_gradients(
            zip(actor_grad, residual_model.trainable_variables)
        )

    def learn(self):
        """
        Sample ``batch_size`` stored transitions and run one ``update``.

        Sampling is uniform without replacement. Does nothing while fewer
        than ``batch_size`` transitions are stored.
        """
        if len(self.replay_buffer) < self.batch_size:
            # random.sample would raise ValueError on an under-filled buffer.
            return

        batch = random.sample(self.replay_buffer, self.batch_size)

        state_batch = tf.convert_to_tensor(
            np.array([step[0] for step in batch]).reshape((self.batch_size, 84, 84, 1))
        )
        action_batch = tf.convert_to_tensor(
            np.array([step[1] for step in batch]).reshape((self.batch_size, 3))
        )
        reward_batch = tf.convert_to_tensor(
            np.array([step[2] for step in batch]).reshape((self.batch_size, 1))
        )
        reward_batch = tf.cast(reward_batch, dtype=tf.float32)
        next_state_batch = tf.convert_to_tensor(
            np.array([step[3] for step in batch]).reshape((self.batch_size, 84, 84, 1))
        )

        self.update(state_batch, action_batch, reward_batch, next_state_batch)

In [7]:
fixed_model = get_actor()
residual_model = get_actor()
critic_model = get_critic()

# Only the fixed policy is restored from disk; the residual actor and the
# critic start from their initial weights and are the only models saved below.
fixed_model.load_weights("./Downloads/my_checkpoint")

# Learning rate for actor-critic models
critic_lr = 0.0006
actor_lr = 0.0005

# Gauss parameters
gaus_mean = 0
gaus_std = 0.005

critic_optimizer = tf.keras.optimizers.Adam(critic_lr)
actor_optimizer = tf.keras.optimizers.Adam(actor_lr)

num_episodes = 500
# Discount factor for future rewards
γ = 0.975

buffer = Buffer(20000, 32)

env = gym.make("CarRacing-v2", domain_randomize=False, autoreset=False)
# Zero-filled so episodes that never ran (e.g. after an interrupt) read as 0
# instead of uninitialised memory.
episode_rewards = np.zeros(num_episodes)

try:
    for episode in range(num_episodes):
        observation, info = env.reset()
        # Stay still during initial zoom in
        for _ in range(50):
            action = [0.0, 0.0, 0.0]
            observation, reward, terminated, truncated, info = env.step(action)

        steps = 0
        reward_sum = 0
        reward_size = 0
        reward_exp = 936
        episode_reward = 0

        # Preprocess the first frame once. After that, each step's next-state
        # image is reused as the following step's state.
        prev_state_img = rgb_to_grey(crop(np.copy(observation))).reshape((84,84,1))

        while True:
            steps += 1
            if (steps % 100 == 0):
                print(f"step {steps}")

            residual_action = residual_model.predict(np.array([prev_state_img,]), verbose=0)[0]
            action = get_action(prev_state_img, residual_action, gaus_mean, gaus_std)
            observation, reward, terminated, truncated, info = env.step(action)

            # Preprocess the resulting frame.
            state_img = rgb_to_grey(crop(np.copy(observation))).reshape((84,84,1))

            # The residual action (not the combined, noisy action) is stored.
            buffer.record((prev_state_img, residual_action, reward, state_img))

            # Count steps and measure reward because terminated does not work
            episode_reward += reward
            if reward > 0:
                reward_size = reward
                reward_sum += reward
                # Expected total positive reward, as a quadratic in the latest
                # positive reward. NOTE(review): constants look empirical —
                # confirm their origin.
                reward_exp = -2.128*(reward_size**2) + 20.65*reward_size + 919.0

            # Train when the record counter is a multiple of 5 and at least
            # 32 steps have elapsed in this episode.
            if (buffer.buffer_counter % 5 == 0) and (steps >= 32):
                buffer.learn()

            # Check for termination
            if (truncated) or (terminated) or (steps >= 1500) or (round(reward_sum) == round(reward_exp)):
                episode_rewards[episode] = episode_reward
                print(f"episode {episode} rewards: {episode_reward}")
                residual_model.save_weights('./weights/my_actor4')
                critic_model.save_weights('./weights/my_critic4')
                break

            prev_state_img = state_img
finally:
    # Release the environment even when training is interrupted.
    env.close()

2023-05-05 09:29:40.531065: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-05 09:29:40.551518: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-05 09:29:40.551653: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-05 09:29:40.552318: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the approp

step 100
step 200
step 300
step 400
step 500
step 600
step 700
step 800
episode 0 rewards: 905.1678714859338
step 100
step 200
step 300
step 400
step 500
step 600
step 700
step 800
step 900
episode 1 rewards: 812.8014184397051
step 100
step 200
step 300
step 400
step 500
step 600
step 700
step 800
step 900
episode 2 rewards: 829.9999999999847
step 100
step 200
step 300
step 400
step 500
step 600
step 700
step 800
step 900
episode 3 rewards: 756.3513513513361
step 100
step 200
step 300
step 400
step 500
step 600
step 700
step 800
step 900
episode 4 rewards: 619.2857142857044
step 100
step 200
step 300
step 400
step 500
step 600
step 700
step 800
step 900
episode 5 rewards: 79.75728155340275
step 100
step 200
step 300
step 400
step 500
step 600
step 700
step 800
step 900
episode 6 rewards: 787.7586206896402
step 100
step 200
step 300
step 400
step 500
step 600
step 700
step 800
step 900
episode 7 rewards: 834.3286219081137
step 100
step 200
step 300
step 400
step 500
step 600
step 700
st

step 300
step 400
step 500
step 600
step 700
step 800
step 900
episode 69 rewards: 870.1567944250738
step 100
step 200
step 300
step 400
step 500
step 600
step 700
step 800
step 900
episode 70 rewards: 648.0340557275455
step 100
step 200
step 300
step 400
step 500
step 600
step 700
step 800
step 900
episode 71 rewards: 711.5573770491684
step 100
step 200
step 300
step 400
step 500
step 600
step 700
step 800
step 900
episode 72 rewards: 748.003412969268
step 100
step 200
step 300
step 400
step 500
step 600
step 700
step 800
step 900
episode 73 rewards: 887.2064056939381
step 100
step 200
step 300
step 400
step 500
step 600
step 700
step 800
step 900
episode 74 rewards: 769.2384105960176
step 100
step 200
step 300
step 400
step 500
step 600
step 700
step 800
step 900
episode 75 rewards: 727.3938223938095
step 100
step 200
step 300
step 400
step 500
step 600
step 700
step 800
step 900
episode 76 rewards: 650.283018867909
step 100
step 200
step 300
step 400
step 500
step 600
step 700
step 

step 100
step 200
step 300
step 400
step 500
step 600
step 700
step 800
step 900
episode 138 rewards: 785.3986710963362
step 100
step 200
step 300
step 400
step 500
step 600
step 700
step 800
step 900
episode 139 rewards: 726.874999999988
step 100
step 200
step 300
step 400
step 500
step 600
step 700
step 800
step 900
episode 140 rewards: 751.4163822525434
step 100
step 200
step 300
step 400
step 500
step 600
step 700
step 800
step 900
episode 141 rewards: 801.9072164948301
step 100
step 200
step 300
step 400
step 500
step 600
step 700
step 800
step 900
episode 142 rewards: 768.9455782312839
step 100
step 200
step 300
step 400
step 500
step 600
step 700
step 800
step 900
episode 143 rewards: 839.4827586206744
step 100
step 200
step 300
step 400
step 500
step 600
step 700
step 800
step 900
episode 144 rewards: 826.7081850533691
step 100
step 200
step 300
step 400
step 500
step 600
step 700
step 800
step 900
episode 145 rewards: 660.5555555555428
step 100
step 200
step 300
step 400
step 

KeyboardInterrupt: 

In [7]:
# Build two fresh actors (fixed first, then residual) and restore each one's
# weights from its checkpoint.
fixed_model, residual_model = get_actor(), get_actor()

fixed_model.load_weights("./Downloads/my_checkpoint")
residual_model.load_weights("./weights/my_actor3")

2023-05-05 15:37:03.452149: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-05 15:37:03.474071: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-05 15:37:03.474206: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-05 15:37:03.474527: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the approp

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fcdcc758ee0>

In [8]:


# Evaluate the combined policy (fixed + residual, no exploration noise) for
# 50 episodes without rendering and record each episode's total reward.
env = gym.make("CarRacing-v2", domain_randomize=False, autoreset=False)
episode_rewards = np.zeros(50)
try:
    for i in range(50):
        observation, info = env.reset()
        # Stay still during the initial frames
        for _ in range(50):
            action = [0.0, 0.0, 0.0]
            observation, reward, terminated, truncated, info = env.step(action)
        steps = 0
        reward_sum = 0
        reward_size = 0
        total_reward = 0
        while True:
            state_img = crop(observation)
            state_img = rgb_to_grey(state_img).reshape((84,84,1))
            action = fixed_model.predict(np.array([state_img,]), verbose=0)[0] + residual_model.predict(np.array([state_img,]), verbose=0)[0]
            observation, reward, terminated, truncated, info = env.step(action)
            steps += 1
            total_reward += reward
            if reward > 0:
                reward_size = reward
                reward_sum += reward
            # Expected total positive reward, as a quadratic in the latest
            # positive reward. NOTE(review): constants look empirical —
            # confirm their origin.
            reward_target = -2.128*(reward_size**2) + 20.65*reward_size + 919.0
            if terminated or truncated or (steps >= 1500) or (round(reward_sum) == round(reward_target)):
                episode_rewards[i] = total_reward
                print(f"episode {i} reward: {total_reward}")
                break
finally:
    # Release the environment even if evaluation is interrupted.
    env.close()



2023-05-05 15:37:06.671674: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8401
2023-05-05 15:37:07.507238: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


episode 0 reward: 71.66666666667138
episode 1 reward: 645.9638554216764
episode 2 reward: 890.5595667869959
episode 3 reward: 721.3934426229391
episode 4 reward: 798.9929328621771
episode 5 reward: 851.8085106382867
episode 6 reward: 861.8345323740848
episode 7 reward: 871.1654135338176
episode 8 reward: 886.3432835820739
episode 9 reward: 800.9107806691321
episode 10 reward: 855.5300353356746
episode 11 reward: 720.8844765342858
episode 12 reward: 750.878136200704
episode 13 reward: 897.7272727272574
episode 14 reward: 47.85714285714665
episode 15 reward: 897.2480620154956
episode 16 reward: 682.4193548387005
episode 17 reward: 828.0769230769126
episode 18 reward: 792.7551020408074
episode 19 reward: 701.7213114753979
episode 20 reward: 899.4812030075024
episode 21 reward: 848.0604982206286
episode 22 reward: 687.0069204152154
episode 23 reward: 685.1857585139228
episode 24 reward: 866.403508771914
episode 25 reward: 685.6451612903136
episode 26 reward: 712.9470198675401
episode 27 re

In [9]:
# Display the per-episode total rewards stored in `episode_rewards`.
episode_rewards

array([ 71.66666667, 645.96385542, 890.55956679, 721.39344262,
       798.99293286, 851.80851064, 861.83453237, 871.16541353,
       886.34328358, 800.91078067, 855.53003534, 720.88447653,
       750.8781362 , 897.72727273,  47.85714286, 897.24806202,
       682.41935484, 828.07692308, 792.75510204, 701.72131148,
       899.48120301, 848.06049822, 687.00692042, 685.18575851,
       866.40350877, 685.64516129, 712.94701987, 871.16541353,
       724.44444444, 849.6366782 , 850.05494505, 674.23076923,
       774.25795053, 797.85714286, 632.27272727, 727.36842105,
        79.90494297, 650.58303887, 671.87116564, 767.06896552,
       833.05755396, 661.09756098, 599.80519481, 741.06557377,
       809.05904059, 693.19444444, 841.17021277, 871.29213483,
       867.96296296, 868.76811594])

In [None]:
# Rebuild both actors and restore their weights, then run 50 rendered
# evaluation episodes of the combined policy (fixed + residual, no noise).
fixed_model = get_actor()
residual_model = get_actor()

# NOTE(review): other cells load the fixed policy from
# "./Downloads/my_checkpoint" — confirm which path is current.
fixed_model.load_weights("./checkpoints/my_checkpoint")
residual_model.load_weights("./weights/my_actor3")

env = gym.make("CarRacing-v2", domain_randomize=False, autoreset=False, render_mode="human")
episode_rewards = np.zeros(50)
try:
    for i in range(50):
        observation, info = env.reset()
        # Stay still during the initial frames
        for _ in range(50):
            action = [0.0, 0.0, 0.0]
            observation, reward, terminated, truncated, info = env.step(action)
        steps = 0
        reward_sum = 0
        reward_size = 0
        total_reward = 0
        while True:
            state_img = crop(observation)
            state_img = rgb_to_grey(state_img).reshape((84,84,1))
            action = fixed_model.predict(np.array([state_img,]), verbose=0)[0] + residual_model.predict(np.array([state_img,]), verbose=0)[0]
            observation, reward, terminated, truncated, info = env.step(action)
            steps += 1
            total_reward += reward
            if reward > 0:
                reward_size = reward
                reward_sum += reward
            # Expected total positive reward, as a quadratic in the latest
            # positive reward. NOTE(review): constants look empirical —
            # confirm their origin.
            reward_target = -2.128*(reward_size**2) + 20.65*reward_size + 919.0
            if terminated or truncated or (steps >= 1500) or (round(reward_sum) == round(reward_target)):
                episode_rewards[i] = total_reward
                print(f"episode {i} reward: {total_reward}")
                break
finally:
    # Close the render window and release the environment, even on interrupt.
    env.close()

In [None]:
# Rebuild both actors and restore their weights (residual actor from the
# "my_actor2" checkpoint), then run 50 rendered evaluation episodes of the
# combined policy (fixed + residual, no noise).
fixed_model = get_actor()
residual_model = get_actor()

# NOTE(review): other cells load the fixed policy from
# "./Downloads/my_checkpoint" — confirm which path is current.
fixed_model.load_weights("./checkpoints/my_checkpoint")
residual_model.load_weights("./weights/my_actor2")

env = gym.make("CarRacing-v2", domain_randomize=False, autoreset=False, render_mode="human")
episode_rewards = np.zeros(50)
try:
    for i in range(50):
        observation, info = env.reset()
        # Stay still during the initial frames
        for _ in range(50):
            action = [0.0, 0.0, 0.0]
            observation, reward, terminated, truncated, info = env.step(action)
        steps = 0
        reward_sum = 0
        reward_size = 0
        total_reward = 0
        while True:
            state_img = crop(observation)
            state_img = rgb_to_grey(state_img).reshape((84,84,1))
            action = fixed_model.predict(np.array([state_img,]), verbose=0)[0] + residual_model.predict(np.array([state_img,]), verbose=0)[0]
            observation, reward, terminated, truncated, info = env.step(action)
            steps += 1
            total_reward += reward
            if reward > 0:
                reward_size = reward
                reward_sum += reward
            # Expected total positive reward, as a quadratic in the latest
            # positive reward. NOTE(review): constants look empirical —
            # confirm their origin.
            reward_target = -2.128*(reward_size**2) + 20.65*reward_size + 919.0
            if terminated or truncated or (steps >= 1500) or (round(reward_sum) == round(reward_target)):
                episode_rewards[i] = total_reward
                print(f"episode {i} reward: {total_reward}")
                break
finally:
    # Close the render window and release the environment, even on interrupt.
    env.close()