In [1]:
#https://colab.research.google.com/drive/1ExE__T9e2dMDKbxrJfgp8jP0So8umC-A#sandboxMode=true&scrollTo=OvvBAoQVJsuU

from collections import deque
import random
import math
import time
from tqdm import tqdm

import numpy as np
import tensorflow as tf
import gym
import gym_examples


In [2]:
# Disabled CartPole setup, kept as a bare string literal rather than as comments.
# Nothing inside the quotes is executed; because the string is the cell's last
# expression, the notebook simply echoes it back as the cell output.
'''
env = gym.make('CartPole-v1')
num_features = env.observation_space.shape[0]
num_actions = env.action_space.n
print('Number of state features: {}'.format(num_features))
print('Number of possible actions: {}'.format(num_actions))
'''

"\nenv = gym.make('CartPole-v1')\nnum_features = env.observation_space.shape[0]\nnum_actions = env.action_space.n\nprint('Number of state features: {}'.format(num_features))\nprint('Number of possible actions: {}'.format(num_actions))\n"

In [3]:

# Build the PuckWorld environment used for training (no rendering window).
env = gym.make(
    'gym_examples/PuckWorld-v0',
    reward1=True,
    reward2=True,
    render_mode=None,  # 'human'
    fps=60,
)
# NOTE(review): presumably caps an episode at 600 steps through gym's time-limit
# wrapper -- confirm that this private attribute is actually read by that wrapper.
env._max_episode_steps = 600

#num_actions = 5 #PuckWorld: 0,1,2,3,4 represent left, right, up, down, -, five moves.
# Problem dimensions read from the environment's spaces; later cells use them
# as module-level globals.
num_actions = env.action_space.n
num_features = env.observation_space.shape[0]
print(f'Number of state features: {num_features}')
print(f'Number of possible actions: {num_actions}')

Number of state features: 8
Number of possible actions: 5


  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


In [4]:
class DQN(tf.keras.Model):
    """Fully connected Q-network: two 32-unit ReLU layers and a linear output head.

    The output layer has one unit per action; its width is taken from the
    module-level ``num_actions`` at construction time.
    """
    def __init__(self):
        super().__init__()
        self.dense1 = tf.keras.layers.Dense(32, activation="relu")
        self.dense2 = tf.keras.layers.Dense(32, activation="relu")
        # Linear head (no activation) so the outputs are unconstrained values.
        self.dense3 = tf.keras.layers.Dense(num_actions, dtype=tf.float32)

    def call(self, x):
        """Run ``x`` through the three dense layers and return the head's output."""
        hidden = self.dense2(self.dense1(x))
        return self.dense3(hidden)

# Two networks with the same architecture: `main_nn` is the one updated by the
# optimizer, `target_nn` is used to compute targets and is overwritten with
# `main_nn`'s weights periodically by the training loop.
main_nn = DQN()
target_nn = DQN()

# Loss and optimizer shared by every training step.
mse = tf.keras.losses.MeanSquaredError()
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)

In [5]:
class ReplayBuffer(object):
    """Experience replay buffer that samples uniformly.

    Transitions are kept as ``(state, action, reward, next_state, done)`` tuples
    in a ``deque`` created with ``maxlen=size``, so once the buffer is full each
    new transition pushes out the oldest one.
    """
    def __init__(self, size):
        """Create an empty buffer holding at most ``size`` transitions."""
        self.buffer = deque(maxlen=size)

    def add(self, state, action, reward, next_state, done):
        """Append one transition to the buffer."""
        self.buffer.append((state, action, reward, next_state, done))

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

    def sample(self, num_samples):
        """Draw ``num_samples`` stored transitions uniformly at random.

        Indices come from ``np.random.choice`` with its default settings, i.e.
        sampling is done with replacement, so a transition may appear more than
        once in a batch.

        Returns:
            A tuple ``(states, actions, rewards, next_states, dones)`` of NumPy
            arrays whose first dimension is ``num_samples``. ``rewards`` and
            ``dones`` are float32; the other arrays keep the dtype NumPy infers
            from the stored values.
        """
        idx = np.random.choice(len(self.buffer), num_samples)
        batch = [self.buffer[i] for i in idx]
        # np.asarray avoids a copy when the element is already an ndarray and
        # copies otherwise. The former np.array(..., copy=False) raises on
        # NumPy >= 2.0 whenever a copy is unavoidable (e.g. a Python int action).
        states = np.array([np.asarray(t[0]) for t in batch])
        actions = np.array([np.asarray(t[1]) for t in batch])
        rewards = np.array([t[2] for t in batch], dtype=np.float32)
        next_states = np.array([np.asarray(t[3]) for t in batch])
        dones = np.array([t[4] for t in batch], dtype=np.float32)
        return states, actions, rewards, next_states, dones

In [6]:
def select_epsilon_greedy_action(state, epsilon):
    """Choose an action epsilon-greedily using ``main_nn``.

    Args:
        state: Input passed directly to ``main_nn``; only row 0 of the network
            output is used, so callers pass a batch containing one observation.
        epsilon: Threshold compared against a uniform random draw; when the
            draw is below it, a random action is taken.

    Returns:
        Either a sample from ``env.action_space`` or the index of the largest
        output of ``main_nn`` for the given state.
    """
    explore = tf.random.uniform((1,)) < epsilon
    if explore:
        # Exploration: let the environment's action space pick any action.
        return env.action_space.sample()
    # Exploitation: highest-valued action according to the main network.
    q_values = main_nn(state)
    return tf.argmax(q_values[0]).numpy()

In [7]:
@tf.function
def train_step(states, actions, rewards, next_states, dones):
    """Run one gradient update of ``main_nn`` on a sampled batch.

    The regression target for each transition is
    ``reward + (1 - done) * discount * max(target_nn(next_state))``; the loss is
    the mean squared error between that target and ``main_nn``'s output for the
    action that was actually taken.

    Returns:
        The loss value computed for this batch.
    """
    # Targets are built outside the gradient tape, so no gradient is taken
    # through the target network.
    best_next_q = tf.reduce_max(target_nn(next_states), axis=-1)
    td_target = rewards + (1. - dones) * discount * best_next_q

    with tf.GradientTape() as tape:
        q_all = main_nn(states)
        # A one-hot mask zeroes every output except the taken action's, and the
        # sum then collapses each row to that single value.
        taken_mask = tf.one_hot(actions, num_actions)
        q_taken = tf.reduce_sum(taken_mask * q_all, axis=-1)
        loss = mse(td_target, q_taken)

    params = main_nn.trainable_variables
    optimizer.apply_gradients(zip(tape.gradient(loss, params), params))
    return loss

In [8]:
# Hyperparameters.
discount = 0.99        # Multiplier on the bootstrapped next-state value in train_step.
batch_size = 32        # Number of transitions sampled per training step.
num_episodes = 1000

# Mutable training state. The training loop updates `epsilon` and `cur_frame`
# in place, so this cell must be re-run before starting a fresh training run.
epsilon = 1            # Starting exploration threshold.
cur_frame = 0          # Count of environment steps taken so far.
buffer = ReplayBuffer(100000)

In [10]:


# Start training. Play game once and then train with a batch.
#
# NOTE: this cell mutates the globals `epsilon` and `cur_frame` (and keeps
# filling `buffer`). Re-running it without first re-running the hyperparameter
# cell resumes from the previous values instead of starting over.
TARGET_SYNC_FRAMES = 2000      # Copy main_nn -> target_nn every this many env steps.
EPSILON_DECAY = 0.001          # Amount subtracted from epsilon after each episode.
EPSILON_DECAY_EPISODES = 950   # Decay is applied only to episodes below this index.
EPSILON_MIN = 0.005            # Floor for epsilon.

# Sliding window of the most recent episode returns; the deque drops the oldest
# entry automatically once it holds 10.
last_10_ep_rewards = deque(maxlen=10)
for episode in tqdm(range(num_episodes+1)):
    state, info = env.reset()
    ep_reward, done, truncated = 0, False, False
    while not (done or truncated):
        state_in = tf.expand_dims(state, axis=0)
        action = select_epsilon_greedy_action(state_in, epsilon)
        next_state, reward, done, truncated, info = env.step(action)
        ep_reward += reward
        # Save to experience replay. Only a true termination is stored as
        # `done`: a truncated episode stops for an external reason, so the next
        # state still has value and train_step must keep bootstrapping from it.
        buffer.add(state, action, reward, next_state, done)
        state = next_state
        cur_frame += 1
        # Copy main_nn weights to target_nn.
        if cur_frame % TARGET_SYNC_FRAMES == 0:
            target_nn.set_weights(main_nn.get_weights())

        # Train neural network.
        if len(buffer) >= batch_size:
            states, actions, rewards, next_states, dones = buffer.sample(batch_size)
            loss = train_step(states, actions, rewards, next_states, dones)

    if episode < EPSILON_DECAY_EPISODES:
        epsilon -= EPSILON_DECAY
    # The floor is applied every episode, whether or not decay ran.
    epsilon = max(epsilon, EPSILON_MIN)

    last_10_ep_rewards.append(ep_reward)

    if episode % 10 == 0:
        print(f'Episode {episode}/{num_episodes}. Epsilon: {epsilon:.3f}. '
            f'Reward in last 10 episodes: {np.mean(last_10_ep_rewards):.3f}')
env.close()


  0%|                                                                                 | 1/1001 [00:02<36:24,  2.18s/it]

Episode 0/1000. Epsilon: 0.049. Reward in last 10 episodes: -253.596


  1%|▉                                                                               | 11/1001 [00:23<35:46,  2.17s/it]

Episode 10/1000. Epsilon: 0.039. Reward in last 10 episodes: -226.114


  2%|█▋                                                                              | 21/1001 [00:45<35:33,  2.18s/it]

Episode 20/1000. Epsilon: 0.029. Reward in last 10 episodes: -216.446


  3%|██▍                                                                             | 31/1001 [01:08<37:38,  2.33s/it]

Episode 30/1000. Epsilon: 0.019. Reward in last 10 episodes: -223.167


  4%|███▎                                                                            | 41/1001 [01:31<35:58,  2.25s/it]

Episode 40/1000. Epsilon: 0.009. Reward in last 10 episodes: -213.318


  5%|████                                                                            | 51/1001 [01:53<36:46,  2.32s/it]

Episode 50/1000. Epsilon: 0.005. Reward in last 10 episodes: -217.808


  6%|████▉                                                                           | 61/1001 [02:16<35:27,  2.26s/it]

Episode 60/1000. Epsilon: 0.005. Reward in last 10 episodes: -231.994


  7%|█████▋                                                                          | 71/1001 [02:39<35:25,  2.28s/it]

Episode 70/1000. Epsilon: 0.005. Reward in last 10 episodes: -246.298


  8%|██████▍                                                                         | 81/1001 [03:02<35:46,  2.33s/it]

Episode 80/1000. Epsilon: 0.005. Reward in last 10 episodes: -201.879


  9%|███████▎                                                                        | 91/1001 [03:28<39:01,  2.57s/it]

Episode 90/1000. Epsilon: 0.005. Reward in last 10 episodes: -217.667


 10%|███████▉                                                                       | 101/1001 [03:54<39:12,  2.61s/it]

Episode 100/1000. Epsilon: 0.005. Reward in last 10 episodes: -225.635


 11%|████████▊                                                                      | 111/1001 [04:19<37:55,  2.56s/it]

Episode 110/1000. Epsilon: 0.005. Reward in last 10 episodes: -209.797


 12%|█████████▌                                                                     | 121/1001 [04:46<38:56,  2.65s/it]

Episode 120/1000. Epsilon: 0.005. Reward in last 10 episodes: -223.920


 13%|██████████▎                                                                    | 131/1001 [05:14<40:34,  2.80s/it]

Episode 130/1000. Epsilon: 0.005. Reward in last 10 episodes: -237.035


 14%|███████████▏                                                                   | 141/1001 [05:42<40:33,  2.83s/it]

Episode 140/1000. Epsilon: 0.005. Reward in last 10 episodes: -208.405


 15%|███████████▉                                                                   | 151/1001 [06:08<36:34,  2.58s/it]

Episode 150/1000. Epsilon: 0.005. Reward in last 10 episodes: -215.326


 16%|████████████▋                                                                  | 161/1001 [06:33<35:24,  2.53s/it]

Episode 160/1000. Epsilon: 0.005. Reward in last 10 episodes: -233.598


 17%|█████████████▍                                                                 | 171/1001 [06:59<35:11,  2.54s/it]

Episode 170/1000. Epsilon: 0.005. Reward in last 10 episodes: -214.902


 18%|██████████████▎                                                                | 181/1001 [07:25<35:10,  2.57s/it]

Episode 180/1000. Epsilon: 0.005. Reward in last 10 episodes: -224.382


 19%|███████████████                                                                | 191/1001 [07:52<36:26,  2.70s/it]

Episode 190/1000. Epsilon: 0.005. Reward in last 10 episodes: -203.697


 20%|███████████████▊                                                               | 201/1001 [08:18<33:42,  2.53s/it]

Episode 200/1000. Epsilon: 0.005. Reward in last 10 episodes: -232.291


 21%|████████████████▋                                                              | 211/1001 [08:43<32:56,  2.50s/it]

Episode 210/1000. Epsilon: 0.005. Reward in last 10 episodes: -236.277


 22%|█████████████████▍                                                             | 221/1001 [09:08<34:06,  2.62s/it]

Episode 220/1000. Epsilon: 0.005. Reward in last 10 episodes: -224.537


 23%|██████████████████▏                                                            | 231/1001 [09:34<32:25,  2.53s/it]

Episode 230/1000. Epsilon: 0.005. Reward in last 10 episodes: -209.070


 24%|███████████████████                                                            | 241/1001 [10:01<35:07,  2.77s/it]

Episode 240/1000. Epsilon: 0.005. Reward in last 10 episodes: -207.176


 25%|███████████████████▊                                                           | 251/1001 [10:27<31:39,  2.53s/it]

Episode 250/1000. Epsilon: 0.005. Reward in last 10 episodes: -228.138


 26%|████████████████████▌                                                          | 261/1001 [10:52<30:58,  2.51s/it]

Episode 260/1000. Epsilon: 0.005. Reward in last 10 episodes: -257.369


 27%|█████████████████████▍                                                         | 271/1001 [11:18<31:20,  2.58s/it]

Episode 270/1000. Epsilon: 0.005. Reward in last 10 episodes: -260.641


 28%|██████████████████████▏                                                        | 281/1001 [11:43<30:01,  2.50s/it]

Episode 280/1000. Epsilon: 0.005. Reward in last 10 episodes: -294.269


 29%|██████████████████████▉                                                        | 291/1001 [12:09<30:57,  2.62s/it]

Episode 290/1000. Epsilon: 0.005. Reward in last 10 episodes: -203.816


 30%|███████████████████████▊                                                       | 301/1001 [12:35<29:51,  2.56s/it]

Episode 300/1000. Epsilon: 0.005. Reward in last 10 episodes: -235.632


 31%|████████████████████████▌                                                      | 311/1001 [13:00<29:48,  2.59s/it]

Episode 310/1000. Epsilon: 0.005. Reward in last 10 episodes: -235.271


 32%|█████████████████████████▎                                                     | 321/1001 [13:27<29:56,  2.64s/it]

Episode 320/1000. Epsilon: 0.005. Reward in last 10 episodes: -231.299


 33%|██████████████████████████                                                     | 331/1001 [13:54<29:36,  2.65s/it]

Episode 330/1000. Epsilon: 0.005. Reward in last 10 episodes: -252.067


 34%|██████████████████████████▉                                                    | 341/1001 [14:19<27:56,  2.54s/it]

Episode 340/1000. Epsilon: 0.005. Reward in last 10 episodes: -239.685


 35%|███████████████████████████▋                                                   | 351/1001 [14:46<28:23,  2.62s/it]

Episode 350/1000. Epsilon: 0.005. Reward in last 10 episodes: -253.288


 36%|████████████████████████████▍                                                  | 361/1001 [15:12<29:19,  2.75s/it]

Episode 360/1000. Epsilon: 0.005. Reward in last 10 episodes: -216.241


 37%|█████████████████████████████▎                                                 | 371/1001 [15:38<27:34,  2.63s/it]

Episode 370/1000. Epsilon: 0.005. Reward in last 10 episodes: -211.770


 38%|██████████████████████████████                                                 | 381/1001 [16:06<29:25,  2.85s/it]

Episode 380/1000. Epsilon: 0.005. Reward in last 10 episodes: -196.464


 39%|██████████████████████████████▊                                                | 391/1001 [16:31<25:58,  2.56s/it]

Episode 390/1000. Epsilon: 0.005. Reward in last 10 episodes: -203.380


 40%|███████████████████████████████▋                                               | 401/1001 [16:57<25:48,  2.58s/it]

Episode 400/1000. Epsilon: 0.005. Reward in last 10 episodes: -226.338


 41%|████████████████████████████████▍                                              | 411/1001 [17:24<26:02,  2.65s/it]

Episode 410/1000. Epsilon: 0.005. Reward in last 10 episodes: -236.924


 42%|█████████████████████████████████▏                                             | 421/1001 [17:50<25:00,  2.59s/it]

Episode 420/1000. Epsilon: 0.005. Reward in last 10 episodes: -244.233


 43%|██████████████████████████████████                                             | 431/1001 [18:17<24:42,  2.60s/it]

Episode 430/1000. Epsilon: 0.005. Reward in last 10 episodes: -236.408


 44%|██████████████████████████████████▊                                            | 441/1001 [18:42<23:44,  2.54s/it]

Episode 440/1000. Epsilon: 0.005. Reward in last 10 episodes: -228.661


 45%|███████████████████████████████████▌                                           | 451/1001 [19:08<24:29,  2.67s/it]

Episode 450/1000. Epsilon: 0.005. Reward in last 10 episodes: -219.563


 46%|████████████████████████████████████▍                                          | 461/1001 [19:34<23:32,  2.62s/it]

Episode 460/1000. Epsilon: 0.005. Reward in last 10 episodes: -244.258


 47%|█████████████████████████████████████▏                                         | 471/1001 [20:01<23:56,  2.71s/it]

Episode 470/1000. Epsilon: 0.005. Reward in last 10 episodes: -205.515


 48%|█████████████████████████████████████▉                                         | 481/1001 [20:26<21:56,  2.53s/it]

Episode 480/1000. Epsilon: 0.005. Reward in last 10 episodes: -229.753


 49%|██████████████████████████████████████▊                                        | 491/1001 [20:51<21:28,  2.53s/it]

Episode 490/1000. Epsilon: 0.005. Reward in last 10 episodes: -240.863


 50%|███████████████████████████████████████▌                                       | 501/1001 [21:17<21:30,  2.58s/it]

Episode 500/1000. Epsilon: 0.005. Reward in last 10 episodes: -253.320


 51%|████████████████████████████████████████▎                                      | 511/1001 [21:43<20:45,  2.54s/it]

Episode 510/1000. Epsilon: 0.005. Reward in last 10 episodes: -214.252


 52%|█████████████████████████████████████████                                      | 521/1001 [22:10<21:53,  2.74s/it]

Episode 520/1000. Epsilon: 0.005. Reward in last 10 episodes: -213.462


 53%|█████████████████████████████████████████▉                                     | 531/1001 [22:36<21:15,  2.71s/it]

Episode 530/1000. Epsilon: 0.005. Reward in last 10 episodes: -180.484


 54%|██████████████████████████████████████████▋                                    | 541/1001 [23:02<19:36,  2.56s/it]

Episode 540/1000. Epsilon: 0.005. Reward in last 10 episodes: -235.356


 55%|███████████████████████████████████████████▍                                   | 551/1001 [23:28<19:21,  2.58s/it]

Episode 550/1000. Epsilon: 0.005. Reward in last 10 episodes: -222.721


 56%|████████████████████████████████████████████▎                                  | 561/1001 [23:54<19:15,  2.63s/it]

Episode 560/1000. Epsilon: 0.005. Reward in last 10 episodes: -236.781


 57%|█████████████████████████████████████████████                                  | 571/1001 [24:20<18:13,  2.54s/it]

Episode 570/1000. Epsilon: 0.005. Reward in last 10 episodes: -258.370


 58%|█████████████████████████████████████████████▊                                 | 581/1001 [24:47<18:18,  2.62s/it]

Episode 580/1000. Epsilon: 0.005. Reward in last 10 episodes: -222.591


 59%|██████████████████████████████████████████████▋                                | 591/1001 [25:13<18:35,  2.72s/it]

Episode 590/1000. Epsilon: 0.005. Reward in last 10 episodes: -241.668


 60%|███████████████████████████████████████████████▍                               | 601/1001 [25:39<16:58,  2.55s/it]

Episode 600/1000. Epsilon: 0.005. Reward in last 10 episodes: -228.477


 61%|████████████████████████████████████████████████▏                              | 611/1001 [26:05<17:11,  2.64s/it]

Episode 610/1000. Epsilon: 0.005. Reward in last 10 episodes: -261.771


 62%|█████████████████████████████████████████████████                              | 621/1001 [26:31<16:21,  2.58s/it]

Episode 620/1000. Epsilon: 0.005. Reward in last 10 episodes: -226.870


 63%|█████████████████████████████████████████████████▊                             | 631/1001 [26:57<15:52,  2.58s/it]

Episode 630/1000. Epsilon: 0.005. Reward in last 10 episodes: -278.700


 64%|██████████████████████████████████████████████████▌                            | 641/1001 [27:25<15:49,  2.64s/it]

Episode 640/1000. Epsilon: 0.005. Reward in last 10 episodes: -219.318


 65%|███████████████████████████████████████████████████▍                           | 651/1001 [27:50<14:42,  2.52s/it]

Episode 650/1000. Epsilon: 0.005. Reward in last 10 episodes: -205.381


 66%|████████████████████████████████████████████████████▏                          | 661/1001 [28:17<15:55,  2.81s/it]

Episode 660/1000. Epsilon: 0.005. Reward in last 10 episodes: -226.700


 67%|████████████████████████████████████████████████████▉                          | 671/1001 [28:43<14:23,  2.62s/it]

Episode 670/1000. Epsilon: 0.005. Reward in last 10 episodes: -241.313


 68%|█████████████████████████████████████████████████████▋                         | 681/1001 [29:11<14:51,  2.79s/it]

Episode 680/1000. Epsilon: 0.005. Reward in last 10 episodes: -221.665


 69%|██████████████████████████████████████████████████████▌                        | 691/1001 [29:37<13:35,  2.63s/it]

Episode 690/1000. Epsilon: 0.005. Reward in last 10 episodes: -258.963


 70%|███████████████████████████████████████████████████████▎                       | 701/1001 [30:04<13:36,  2.72s/it]

Episode 700/1000. Epsilon: 0.005. Reward in last 10 episodes: -255.330


 71%|████████████████████████████████████████████████████████                       | 711/1001 [30:30<12:47,  2.65s/it]

Episode 710/1000. Epsilon: 0.005. Reward in last 10 episodes: -240.647


 72%|████████████████████████████████████████████████████████▉                      | 721/1001 [30:57<12:08,  2.60s/it]

Episode 720/1000. Epsilon: 0.005. Reward in last 10 episodes: -232.999


 73%|█████████████████████████████████████████████████████████▋                     | 731/1001 [31:23<11:33,  2.57s/it]

Episode 730/1000. Epsilon: 0.005. Reward in last 10 episodes: -237.144


 74%|██████████████████████████████████████████████████████████▍                    | 741/1001 [31:48<11:03,  2.55s/it]

Episode 740/1000. Epsilon: 0.005. Reward in last 10 episodes: -261.573


 75%|███████████████████████████████████████████████████████████▎                   | 751/1001 [32:14<10:47,  2.59s/it]

Episode 750/1000. Epsilon: 0.005. Reward in last 10 episodes: -261.130


 76%|████████████████████████████████████████████████████████████                   | 761/1001 [32:41<10:43,  2.68s/it]

Episode 760/1000. Epsilon: 0.005. Reward in last 10 episodes: -232.296


 77%|████████████████████████████████████████████████████████████▊                  | 771/1001 [33:07<10:05,  2.63s/it]

Episode 770/1000. Epsilon: 0.005. Reward in last 10 episodes: -286.964


 78%|█████████████████████████████████████████████████████████████▋                 | 781/1001 [33:33<09:35,  2.62s/it]

Episode 780/1000. Epsilon: 0.005. Reward in last 10 episodes: -256.818


 79%|██████████████████████████████████████████████████████████████▍                | 791/1001 [34:00<09:32,  2.73s/it]

Episode 790/1000. Epsilon: 0.005. Reward in last 10 episodes: -221.850


 80%|███████████████████████████████████████████████████████████████▏               | 801/1001 [34:26<08:33,  2.57s/it]

Episode 800/1000. Epsilon: 0.005. Reward in last 10 episodes: -243.609


 81%|████████████████████████████████████████████████████████████████               | 811/1001 [34:51<08:09,  2.58s/it]

Episode 810/1000. Epsilon: 0.005. Reward in last 10 episodes: -252.997


 82%|████████████████████████████████████████████████████████████████▊              | 821/1001 [35:18<08:06,  2.70s/it]

Episode 820/1000. Epsilon: 0.005. Reward in last 10 episodes: -295.604


 83%|█████████████████████████████████████████████████████████████████▌             | 831/1001 [35:45<07:46,  2.74s/it]

Episode 830/1000. Epsilon: 0.005. Reward in last 10 episodes: -267.526


 84%|██████████████████████████████████████████████████████████████████▎            | 841/1001 [36:13<07:23,  2.77s/it]

Episode 840/1000. Epsilon: 0.005. Reward in last 10 episodes: -255.066


 85%|███████████████████████████████████████████████████████████████████▏           | 851/1001 [36:41<06:49,  2.73s/it]

Episode 850/1000. Epsilon: 0.005. Reward in last 10 episodes: -305.986


 86%|███████████████████████████████████████████████████████████████████▉           | 861/1001 [37:08<06:34,  2.82s/it]

Episode 860/1000. Epsilon: 0.005. Reward in last 10 episodes: -245.942


 87%|████████████████████████████████████████████████████████████████████▋          | 871/1001 [37:36<05:52,  2.71s/it]

Episode 870/1000. Epsilon: 0.005. Reward in last 10 episodes: -288.919


 88%|█████████████████████████████████████████████████████████████████████▌         | 881/1001 [38:04<05:48,  2.90s/it]

Episode 880/1000. Epsilon: 0.005. Reward in last 10 episodes: -254.845


 89%|██████████████████████████████████████████████████████████████████████▎        | 891/1001 [38:31<04:57,  2.70s/it]

Episode 890/1000. Epsilon: 0.005. Reward in last 10 episodes: -260.896


 90%|███████████████████████████████████████████████████████████████████████        | 901/1001 [38:57<04:27,  2.68s/it]

Episode 900/1000. Epsilon: 0.005. Reward in last 10 episodes: -262.505


 91%|███████████████████████████████████████████████████████████████████████▉       | 911/1001 [39:25<04:07,  2.75s/it]

Episode 910/1000. Epsilon: 0.005. Reward in last 10 episodes: -273.915


 92%|████████████████████████████████████████████████████████████████████████▋      | 921/1001 [39:52<03:33,  2.67s/it]

Episode 920/1000. Epsilon: 0.005. Reward in last 10 episodes: -206.025


 93%|█████████████████████████████████████████████████████████████████████████▍     | 931/1001 [40:20<03:08,  2.70s/it]

Episode 930/1000. Epsilon: 0.005. Reward in last 10 episodes: -299.851


 94%|██████████████████████████████████████████████████████████████████████████▎    | 941/1001 [40:47<02:41,  2.70s/it]

Episode 940/1000. Epsilon: 0.005. Reward in last 10 episodes: -248.468


 95%|███████████████████████████████████████████████████████████████████████████    | 951/1001 [41:15<02:25,  2.91s/it]

Episode 950/1000. Epsilon: 0.005. Reward in last 10 episodes: -223.458


 96%|███████████████████████████████████████████████████████████████████████████▊   | 961/1001 [41:42<01:48,  2.72s/it]

Episode 960/1000. Epsilon: 0.005. Reward in last 10 episodes: -261.914


 97%|████████████████████████████████████████████████████████████████████████████▋  | 971/1001 [42:10<01:23,  2.79s/it]

Episode 970/1000. Epsilon: 0.005. Reward in last 10 episodes: -241.559


 98%|█████████████████████████████████████████████████████████████████████████████▍ | 981/1001 [42:37<00:53,  2.68s/it]

Episode 980/1000. Epsilon: 0.005. Reward in last 10 episodes: -263.766


 99%|██████████████████████████████████████████████████████████████████████████████▏| 991/1001 [43:04<00:26,  2.68s/it]

Episode 990/1000. Epsilon: 0.005. Reward in last 10 episodes: -242.826


100%|██████████████████████████████████████████████████████████████████████████████| 1001/1001 [43:32<00:00,  2.61s/it]

Episode 1000/1000. Epsilon: 0.005. Reward in last 10 episodes: -240.968





In [None]:
# Play one rendered episode with the trained network.
envh = gym.make('gym_examples/PuckWorld-v0', reward1=True, reward2=True, render_mode='human', fps=60)  #'human'
# Set the step cap on the evaluation env itself (previously it was set on the
# training env `env`, which has already been closed by the training cell).
# NOTE(review): presumably read by gym's time-limit wrapper -- confirm.
envh._max_episode_steps = 600
#envh = env = gym.make('CartPole-v1', render_mode='human')
state, info = envh.reset()
done, truncated = False, False
ep_rew = 0
try:
    # Parenthesised on purpose: `not done or truncated` parses as
    # `(not done) or truncated`, which keeps looping after a truncation.
    while not (done or truncated):
        envh.render()
        state_in = tf.expand_dims(state, axis=0)
        action = select_epsilon_greedy_action(state_in, epsilon=0.01)
        state, reward, done, truncated, info = envh.step(action)
        ep_rew += reward
finally:
    # Close the environment that was opened here, even if the cell is interrupted.
    envh.close()
print('Episode reward was {}'.format(ep_rew))


In [None]:
# Debug inspection: display whatever `state` currently holds in the kernel
# (it is reassigned by both the training and the evaluation cells).
state
