In [1]:
import gymnasium as gym
import numpy as np

In [2]:
gym.pprint_registry()

===== classic_control =====
Acrobot-v1             CartPole-v0            CartPole-v1
MountainCar-v0         MountainCarContinuous-v0 Pendulum-v1
===== phys2d =====
phys2d/CartPole-v0     phys2d/CartPole-v1     phys2d/Pendulum-v0
===== box2d =====
BipedalWalker-v3       BipedalWalkerHardcore-v3 CarRacing-v3
LunarLander-v3         LunarLanderContinuous-v3
===== toy_text =====
Blackjack-v1           CliffWalking-v0        FrozenLake-v1
FrozenLake8x8-v1       Taxi-v3
===== tabular =====
tabular/Blackjack-v0   tabular/CliffWalking-v0
===== mujoco =====
Ant-v2                 Ant-v3                 Ant-v4
Ant-v5                 HalfCheetah-v2         HalfCheetah-v3
HalfCheetah-v4         HalfCheetah-v5         Hopper-v2
Hopper-v3              Hopper-v4              Hopper-v5
Humanoid-v2            Humanoid-v3            Humanoid-v4
Humanoid-v5            HumanoidStandup-v2     HumanoidStandup-v4
HumanoidStandup-v5     InvertedDoublePendulum-v2 InvertedDoublePendulum-v4
InvertedDoublePendulu

In [3]:
env = gym.make('CliffWalking-v0', render_mode='human')

In [9]:
class QLearningAgent:
    
    def __init__(
        self,
        environ, 
        n_episodes,
        epsilon=0.6,
        learning_rate=0.9,
        discount_factor=0.95,
        random=False,
        max_epsilon=0.1
        ) -> None:
        
        self.env = environ
        self.nr_of_actions = environ.action_space.n
        self.nr_of_states = environ.observation_space.n
        
        self.epsilon = epsilon
        self.epsilon_decay = self.epsilon / (n_episodes / 2)
        self.max_epsilon = max_epsilon
        self.lr = learning_rate
        self.gamma = discount_factor
        
        # in this case [0, 1] indicates state 0 with action 1
        if random:
            self.q_function = np.random.rand(self.nr_of_states, self.nr_of_actions)
        else:
            self.q_function = np.zeros((self.nr_of_states, self.nr_of_actions))
        
    def update(
        self,
        state,
        action,
        reward, 
        terminated,
        next_state
        ) -> None:
        # get the max_a q(s, a)
        greedy_action = np.max(self.q_function[next_state]) if not terminated else 0
        self.q_function[state, action] += self.lr * (reward + self.gamma * greedy_action - self.q_function[state, action])
    
    def epsilon_greedy(
        self,
        state
    ) -> np.int64:
        if np.random.random() < self.epsilon:
            action = self.env.action_space.sample()
        else:
            action = int(np.argmax(self.q_function[state]))
        return action 
    
    def decay_epsilon(self) -> None:
        self.epsilon = max(self.max_epsilon, self.epsilon - self.epsilon_decay)
        
    def predict(
        self,
        state
        ) -> np.int64:
        return int(np.argmax(self.q_function[state]))

In [10]:
from tqdm import tqdm

n_eps = 100
agent = QLearningAgent(environ=env, n_episodes=n_eps)

env = gym.wrappers.RecordEpisodeStatistics(env=env, buffer_length=n_eps)

for episode in tqdm(range(n_eps)):
    curr_state, info = env.reset()
    done = False
    
    print(f'Episode: {episode+1} | epsilon: {agent.epsilon}')
    
    while not done:
        action = agent.epsilon_greedy(curr_state)
        next_state, reward, terminated, truncated, info = env.step(action)
        
        agent.update(curr_state, action, reward, terminated, next_state)
        
        done = terminated or truncated
        curr_state = next_state
    
    agent.decay_epsilon()

  0%|          | 0/100 [00:00<?, ?it/s]

Episode: 1 | epsilon: 0.6


  1%|          | 1/100 [04:24<7:17:14, 265.00s/it]

Episode: 2 | epsilon: 0.588


  2%|▏         | 2/100 [05:09<3:41:01, 135.32s/it]

Episode: 3 | epsilon: 0.576


  3%|▎         | 3/100 [05:42<2:23:15, 88.61s/it] 

Episode: 4 | epsilon: 0.564


  4%|▍         | 4/100 [06:00<1:37:11, 60.75s/it]

Episode: 5 | epsilon: 0.5519999999999999


  5%|▌         | 5/100 [06:48<1:28:47, 56.08s/it]

Episode: 6 | epsilon: 0.5399999999999999


  6%|▌         | 6/100 [07:07<1:08:23, 43.65s/it]

Episode: 7 | epsilon: 0.5279999999999999


  7%|▋         | 7/100 [07:24<53:55, 34.79s/it]  

Episode: 8 | epsilon: 0.5159999999999999


  8%|▊         | 8/100 [07:50<48:56, 31.92s/it]

Episode: 9 | epsilon: 0.5039999999999999


  9%|▉         | 9/100 [08:27<50:57, 33.60s/it]

Episode: 10 | epsilon: 0.4919999999999999


 10%|█         | 10/100 [08:48<44:20, 29.56s/it]

Episode: 11 | epsilon: 0.47999999999999987


 11%|█         | 11/100 [08:55<33:50, 22.81s/it]

Episode: 12 | epsilon: 0.46799999999999986


 12%|█▏        | 12/100 [09:05<27:44, 18.92s/it]

Episode: 13 | epsilon: 0.45599999999999985


 13%|█▎        | 13/100 [09:12<22:05, 15.23s/it]

Episode: 14 | epsilon: 0.44399999999999984


 14%|█▍        | 14/100 [09:21<19:08, 13.35s/it]

Episode: 15 | epsilon: 0.43199999999999983


 15%|█▌        | 15/100 [09:45<23:28, 16.57s/it]

Episode: 16 | epsilon: 0.4199999999999998


 16%|█▌        | 16/100 [10:00<22:39, 16.18s/it]

Episode: 17 | epsilon: 0.4079999999999998


 17%|█▋        | 17/100 [10:13<20:51, 15.08s/it]

Episode: 18 | epsilon: 0.3959999999999998


 18%|█▊        | 18/100 [10:23<18:31, 13.55s/it]

Episode: 19 | epsilon: 0.3839999999999998


 19%|█▉        | 19/100 [10:30<15:57, 11.82s/it]

Episode: 20 | epsilon: 0.3719999999999998


 20%|██        | 20/100 [10:38<14:01, 10.52s/it]

Episode: 21 | epsilon: 0.35999999999999976


 21%|██        | 21/100 [10:48<13:44, 10.44s/it]

Episode: 22 | epsilon: 0.34799999999999975


 22%|██▏       | 22/100 [11:02<14:46, 11.36s/it]

Episode: 23 | epsilon: 0.33599999999999974


 23%|██▎       | 23/100 [11:08<12:31,  9.76s/it]

Episode: 24 | epsilon: 0.32399999999999973


 24%|██▍       | 24/100 [11:13<10:44,  8.48s/it]

Episode: 25 | epsilon: 0.3119999999999997


 25%|██▌       | 25/100 [11:20<09:51,  7.89s/it]

Episode: 26 | epsilon: 0.2999999999999997


 26%|██▌       | 26/100 [11:24<08:17,  6.72s/it]

Episode: 27 | epsilon: 0.2879999999999997


 27%|██▋       | 27/100 [11:40<11:34,  9.51s/it]

Episode: 28 | epsilon: 0.2759999999999997


 28%|██▊       | 28/100 [11:51<12:02, 10.04s/it]

Episode: 29 | epsilon: 0.2639999999999997


 29%|██▉       | 29/100 [11:55<09:49,  8.30s/it]

Episode: 30 | epsilon: 0.25199999999999967


 30%|███       | 30/100 [12:08<11:09,  9.57s/it]

Episode: 31 | epsilon: 0.23999999999999966


 31%|███       | 31/100 [12:15<10:01,  8.72s/it]

Episode: 32 | epsilon: 0.22799999999999965


 32%|███▏      | 32/100 [12:20<08:52,  7.84s/it]

Episode: 33 | epsilon: 0.21599999999999964


 33%|███▎      | 33/100 [12:24<07:27,  6.69s/it]

Episode: 34 | epsilon: 0.20399999999999963


 34%|███▍      | 34/100 [12:28<06:28,  5.88s/it]

Episode: 35 | epsilon: 0.19199999999999962


 35%|███▌      | 35/100 [12:32<05:36,  5.17s/it]

Episode: 36 | epsilon: 0.1799999999999996


 36%|███▌      | 36/100 [12:41<06:54,  6.47s/it]

Episode: 37 | epsilon: 0.1679999999999996


 37%|███▋      | 37/100 [12:46<06:10,  5.88s/it]

Episode: 38 | epsilon: 0.15599999999999958


 38%|███▊      | 38/100 [12:51<05:53,  5.69s/it]

Episode: 39 | epsilon: 0.14399999999999957


 39%|███▉      | 39/100 [12:57<05:43,  5.64s/it]

Episode: 40 | epsilon: 0.13199999999999956


 40%|████      | 40/100 [13:01<05:17,  5.30s/it]

Episode: 41 | epsilon: 0.11999999999999957


 41%|████      | 41/100 [13:05<04:49,  4.91s/it]

Episode: 42 | epsilon: 0.10799999999999957


 42%|████▏     | 42/100 [13:09<04:20,  4.49s/it]

Episode: 43 | epsilon: 0.1


 43%|████▎     | 43/100 [13:13<04:07,  4.34s/it]

Episode: 44 | epsilon: 0.1


 44%|████▍     | 44/100 [13:16<03:49,  4.09s/it]

Episode: 45 | epsilon: 0.1


 45%|████▌     | 45/100 [13:20<03:35,  3.91s/it]

Episode: 46 | epsilon: 0.1


 46%|████▌     | 46/100 [13:23<03:24,  3.79s/it]

Episode: 47 | epsilon: 0.1


 47%|████▋     | 47/100 [13:28<03:32,  4.00s/it]

Episode: 48 | epsilon: 0.1


 48%|████▊     | 48/100 [13:31<03:20,  3.85s/it]

Episode: 49 | epsilon: 0.1


 49%|████▉     | 49/100 [13:35<03:18,  3.90s/it]

Episode: 50 | epsilon: 0.1


 50%|█████     | 50/100 [13:39<03:09,  3.78s/it]

Episode: 51 | epsilon: 0.1


 51%|█████     | 51/100 [13:42<03:01,  3.70s/it]

Episode: 52 | epsilon: 0.1


 52%|█████▏    | 52/100 [13:46<02:54,  3.64s/it]

Episode: 53 | epsilon: 0.1


 53%|█████▎    | 53/100 [13:50<02:56,  3.75s/it]

Episode: 54 | epsilon: 0.1


 54%|█████▍    | 54/100 [13:53<02:49,  3.68s/it]

Episode: 55 | epsilon: 0.1


 55%|█████▌    | 55/100 [13:59<03:10,  4.23s/it]

Episode: 56 | epsilon: 0.1


 56%|█████▌    | 56/100 [14:05<03:29,  4.76s/it]

Episode: 57 | epsilon: 0.1


 57%|█████▋    | 57/100 [14:10<03:31,  4.91s/it]

Episode: 58 | epsilon: 0.1


 58%|█████▊    | 58/100 [14:13<03:08,  4.48s/it]

Episode: 59 | epsilon: 0.1


 59%|█████▉    | 59/100 [14:18<03:10,  4.64s/it]

Episode: 60 | epsilon: 0.1


 60%|██████    | 60/100 [14:22<02:52,  4.30s/it]

Episode: 61 | epsilon: 0.1


 61%|██████    | 61/100 [14:30<03:34,  5.49s/it]

Episode: 62 | epsilon: 0.1


 62%|██████▏   | 62/100 [14:34<03:08,  4.97s/it]

Episode: 63 | epsilon: 0.1


 63%|██████▎   | 63/100 [14:37<02:47,  4.53s/it]

Episode: 64 | epsilon: 0.1


 64%|██████▍   | 64/100 [14:41<02:37,  4.37s/it]

Episode: 65 | epsilon: 0.1


 65%|██████▌   | 65/100 [14:45<02:29,  4.26s/it]

Episode: 66 | epsilon: 0.1


 66%|██████▌   | 66/100 [14:51<02:40,  4.71s/it]

Episode: 67 | epsilon: 0.1


 67%|██████▋   | 67/100 [14:56<02:33,  4.65s/it]

Episode: 68 | epsilon: 0.1


 68%|██████▊   | 68/100 [15:00<02:22,  4.46s/it]

Episode: 69 | epsilon: 0.1


 69%|██████▉   | 69/100 [15:03<02:09,  4.17s/it]

Episode: 70 | epsilon: 0.1


 70%|███████   | 70/100 [15:10<02:26,  4.87s/it]

Episode: 71 | epsilon: 0.1


 71%|███████   | 71/100 [15:14<02:13,  4.61s/it]

Episode: 72 | epsilon: 0.1


 72%|███████▏  | 72/100 [15:17<01:59,  4.28s/it]

Episode: 73 | epsilon: 0.1


 73%|███████▎  | 73/100 [15:21<01:49,  4.04s/it]

Episode: 74 | epsilon: 0.1


 74%|███████▍  | 74/100 [15:25<01:42,  3.96s/it]

Episode: 75 | epsilon: 0.1


 75%|███████▌  | 75/100 [15:32<02:07,  5.10s/it]

Episode: 76 | epsilon: 0.1


 76%|███████▌  | 76/100 [15:36<01:50,  4.62s/it]

Episode: 77 | epsilon: 0.1


 77%|███████▋  | 77/100 [15:39<01:38,  4.29s/it]

Episode: 78 | epsilon: 0.1


 78%|███████▊  | 78/100 [15:44<01:35,  4.35s/it]

Episode: 79 | epsilon: 0.1


 79%|███████▉  | 79/100 [15:47<01:26,  4.10s/it]

Episode: 80 | epsilon: 0.1


 80%|████████  | 80/100 [15:51<01:18,  3.92s/it]

Episode: 81 | epsilon: 0.1


 81%|████████  | 81/100 [15:55<01:14,  3.95s/it]

Episode: 82 | epsilon: 0.1


 82%|████████▏ | 82/100 [16:02<01:30,  5.01s/it]

Episode: 83 | epsilon: 0.1


 83%|████████▎ | 83/100 [16:07<01:22,  4.86s/it]

Episode: 84 | epsilon: 0.1


 84%|████████▍ | 84/100 [16:11<01:13,  4.61s/it]

Episode: 85 | epsilon: 0.1


 85%|████████▌ | 85/100 [16:15<01:06,  4.43s/it]

Episode: 86 | epsilon: 0.1


 86%|████████▌ | 86/100 [16:22<01:13,  5.28s/it]

Episode: 87 | epsilon: 0.1


 87%|████████▋ | 87/100 [16:26<01:01,  4.75s/it]

Episode: 88 | epsilon: 0.1


 88%|████████▊ | 88/100 [16:31<01:00,  5.05s/it]

Episode: 89 | epsilon: 0.1


 89%|████████▉ | 89/100 [16:36<00:53,  4.89s/it]

Episode: 90 | epsilon: 0.1


 90%|█████████ | 90/100 [16:40<00:47,  4.77s/it]

Episode: 91 | epsilon: 0.1


 91%|█████████ | 91/100 [16:44<00:39,  4.39s/it]

Episode: 92 | epsilon: 0.1


 92%|█████████▏| 92/100 [16:49<00:37,  4.65s/it]

Episode: 93 | epsilon: 0.1


 93%|█████████▎| 93/100 [16:55<00:34,  4.99s/it]

Episode: 94 | epsilon: 0.1


 94%|█████████▍| 94/100 [16:58<00:27,  4.54s/it]

Episode: 95 | epsilon: 0.1


 95%|█████████▌| 95/100 [17:02<00:21,  4.23s/it]

Episode: 96 | epsilon: 0.1


 96%|█████████▌| 96/100 [17:05<00:16,  4.01s/it]

Episode: 97 | epsilon: 0.1


 97%|█████████▋| 97/100 [17:10<00:12,  4.24s/it]

Episode: 98 | epsilon: 0.1


 98%|█████████▊| 98/100 [17:14<00:08,  4.17s/it]

Episode: 99 | epsilon: 0.1


 99%|█████████▉| 99/100 [17:18<00:04,  4.19s/it]

Episode: 100 | epsilon: 0.1


100%|██████████| 100/100 [17:25<00:00, 10.45s/it]


In [11]:
for iteration in range(15):
    curr_state, info = env.reset()
    done = False
    
    reward_total = 0
    while not done:
        action = agent.predict(curr_state)
        next_state, reward, terminated, truncated, info = env.step(action)
        
        reward_total += reward
        done = terminated or truncated
        curr_state = next_state
        
    print(f'Episode: {iteration+1} | total reward at this episode: {reward_total}')

Episode: 1 | total reward at this episode: -13
Episode: 2 | total reward at this episode: -13
Episode: 3 | total reward at this episode: -13
Episode: 4 | total reward at this episode: -13
Episode: 5 | total reward at this episode: -13
Episode: 6 | total reward at this episode: -13
Episode: 7 | total reward at this episode: -13
Episode: 8 | total reward at this episode: -13
Episode: 9 | total reward at this episode: -13
Episode: 10 | total reward at this episode: -13
Episode: 11 | total reward at this episode: -13
Episode: 12 | total reward at this episode: -13
Episode: 13 | total reward at this episode: -13
Episode: 14 | total reward at this episode: -13
Episode: 15 | total reward at this episode: -13
