In [2]:
# !pip install -U catalyst-rl wandb 

### Import dependencies

In [3]:
import numpy as np
import catalyst_rl
from catalyst_rl.rl.environment import gym
import random
from tqdm import tqdm

### Create environment

In [7]:
env = gym.GymEnvWrapper("FrozenLake-v0").env

In [10]:
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [8]:
num_actions = env.action_space.n
print("Number of actions :", num_actions)

Number of actions : 4


In [9]:
num_observations = env.observation_space.n
print("Number of observations :", num_observations)

Number of observations : 16


In [12]:
Q_table = np.zeros((num_observations, num_actions))
print(Q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


### Set parameters

In [75]:
num_episodes = 10000
epsilon = 1
lr = 0.9
gamma = 0.85
max_steps = 100

In [76]:
min_epsilon = 0.01
max_epsilon = 1
decay_rate = 0.005

### Learning process

In [77]:
for episode in tqdm(range(num_episodes)):

  state = env.reset()
  done = False

  for step in range(max_steps):

    epsilon_tradeoff = random.uniform(0, 1)
    if epsilon_tradeoff > epsilon: # explotation
      action = np.argmax(Q_table[state, :])
    else: # exploration
      action = env.action_space.sample()

    new_state, reward, done, info = env.step(action)

    Q_table[state, action] = Q_table[state, action] + lr * (reward + gamma * np.max(Q_table[new_state, :]) - Q_table[state, action])

    state = new_state

    if done:
      break

  epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)

100%|██████████| 10000/10000 [00:11<00:00, 888.81it/s]


In [52]:
Q_table

array([[2.53828649e-01, 1.39817414e-01, 1.80486150e-02, 1.12989107e-01],
       [2.39467631e-03, 1.94417179e-01, 2.45314479e-03, 7.90615945e-02],
       [5.22720395e-03, 5.05298376e-03, 2.72534444e-02, 1.21464822e-01],
       [2.06817812e-03, 1.83216947e-03, 4.46669255e-04, 3.43730190e-02],
       [4.43692717e-01, 1.08002209e-02, 7.44432318e-02, 8.30921324e-02],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [7.22133646e-02, 6.22283586e-08, 1.18288341e-05, 2.16353003e-06],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [5.10443544e-02, 7.90887816e-03, 6.28819395e-04, 5.16120107e-01],
       [4.38395934e-03, 7.62438795e-01, 2.84514977e-03, 1.80739219e-02],
       [8.96259757e-01, 8.14725802e-04, 1.51988677e-03, 3.11063513e-02],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.77472720e-02, 1.57007369e-01, 9.30792001e

### Demo 

In [78]:
env.reset()

for episode in range(5):

  state = env.reset()
  done = False
  total_reward = 0

  for step in range(max_steps):

    action = np.argmax(Q_table[state, :])
    new_state, reward, done, info = env.step(action)
    total_reward += reward
    new_state = state

    if done:
      env.render()
      print(f"Total reward is {total_reward}")
      break

env.close()

  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Total reward is 1.0
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
Total reward is 0.0
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
Total reward is 0.0
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
Total reward is 0.0
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
Total reward is 0.0
