In [2]:
import gymnasium as gym
import numpy as np

In [24]:
gym.pprint_registry()

===== classic_control =====
Acrobot-v1             CartPole-v0            CartPole-v1
MountainCar-v0         MountainCarContinuous-v0 Pendulum-v1
===== phys2d =====
phys2d/CartPole-v0     phys2d/CartPole-v1     phys2d/Pendulum-v0
===== box2d =====
BipedalWalker-v3       BipedalWalkerHardcore-v3 CarRacing-v3
LunarLander-v3         LunarLanderContinuous-v3
===== toy_text =====
Blackjack-v1           CliffWalking-v0        FrozenLake-v1
FrozenLake8x8-v1       Taxi-v3
===== tabular =====
tabular/Blackjack-v0   tabular/CliffWalking-v0
===== mujoco =====
Ant-v2                 Ant-v3                 Ant-v4
Ant-v5                 HalfCheetah-v2         HalfCheetah-v3
HalfCheetah-v4         HalfCheetah-v5         Hopper-v2
Hopper-v3              Hopper-v4              Hopper-v5
Humanoid-v2            Humanoid-v3            Humanoid-v4
Humanoid-v5            HumanoidStandup-v2     HumanoidStandup-v4
HumanoidStandup-v5     InvertedDoublePendulum-v2 InvertedDoublePendulum-v4
InvertedDoublePendulu

In [25]:
env = gym.make('CliffWalking-v0', render_mode='human')

In [51]:
class QLearningAgent:
    
    def __init__(
        self,
        environ, 
        n_episodes,
        epsilon=1,
        learning_rate=0.9,
        discount_factor=1,
        random=False,
        max_epsilon=0.1
        ) -> None:
        
        self.env = environ
        self.nr_of_actions = environ.action_space.n
        self.nr_of_states = environ.observation_space.n
        
        self.epsilon = epsilon
        self.epsilon_decay = self.epsilon / (n_episodes / 2)
        self.max_epsilon = max_epsilon
        self.lr = learning_rate
        self.gamma = discount_factor
        
        # in this case [0, 1] indicates state 0 with action 1
        if random:
            self.q_function = np.random.rand(self.nr_of_states, self.nr_of_actions)
        else:
            self.q_function = np.zeros((self.nr_of_states, self.nr_of_actions))
        
    def update(
        self,
        state,
        action,
        reward, 
        terminated,
        next_state
        ) -> None:
        # get the max_a q(s, a)
        greedy_action = np.max(self.q_function[next_state]) if not terminated else 0
        self.q_function[state, action] += self.lr * (reward + self.gamma * greedy_action - self.q_function[state, action])
    
    def epsilon_greedy(
        self,
        state
    ) -> np.int64:
        if np.random.random() < self.epsilon:
            action = self.env.action_space.sample()
        else:
            action = int(np.argmax(self.q_function[state]))
        return action 
    
    def decay_epsilon(self):
        return max(self.max_epsilon, self.epsilon - self.epsilon_decay)
        
    def predict(
        self,
        state
        ) -> np.int64:
        return int(np.argmax(self.q_function[state]))

In [54]:
from tqdm import tqdm

n_eps = 100
agent = QLearningAgent(environ=env, n_episodes=n_eps)

env = gym.wrappers.RecordEpisodeStatistics(env=env, buffer_length=n_eps)

for episode in tqdm(range(n_eps)):
    curr_state, info = env.reset()
    done = False
    
    while not done:
        action = agent.epsilon_greedy(curr_state)
        next_state, reward, terminated, truncated, info = env.step(action)
        
        agent.update(curr_state, action, reward, terminated, next_state)
        
        done = terminated or truncated
        curr_state = next_state
    
    agent.decay_epsilon()

  0%|          | 0/100000 [00:00<?, ?it/s]2024-10-19 00:13:17.819 python[54815:6647547] +[IMKClient subclass]: chose IMKClient_Legacy
2024-10-19 00:13:17.819 python[54815:6647547] +[IMKInputSession subclass]: chose IMKInputSession_Legacy
  0%|          | 0/100000 [00:37<?, ?it/s]


KeyboardInterrupt: 

: 