In [None]:
!pip install gymnasium

Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/953.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/953.9 kB[0m [31m2.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m389.1/953.9 kB[0m [31m5.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m716.8/953.9 kB[0m [31m6.9 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m952.3/953.9 kB[0m [31m7.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing colle

In [None]:
import numpy as np
import gymnasium as gym

In [95]:
# Slippery 4x4 FrozenLake rendered as text ('ansi').
# Switch render_mode to 'rgb_array' when recording video.
env = gym.make(
    'FrozenLake-v1',
    desc=None,
    map_name="4x4",
    is_slippery=True,
    render_mode='ansi',
)

# Sizes of the discrete state space and action space
n_observations, n_actions = env.observation_space.n, env.action_space.n

In [None]:
# Report how many states and actions the environment exposes
for label, value in (
    ('Number of States', n_observations),
    ('Number of possible actions', n_actions),
):
    print(label, value)

Number of States 16
Number of possible actions 4


In [None]:
# Start from an all-zero Q-table: one row per state, one column per action
Q_table = np.zeros(shape=(n_observations, n_actions))
print(Q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [None]:
np.shape(Q_table)  # (states, actions): 16 states with 4 actions each

(16, 4)

In [None]:
Q_table[9]  # Q-values of every action in state 9

array([0., 0., 0., 0.])

In [54]:
# --- Q-learning hyperparameters ---

# How many training episodes to run
n_episodes = 10_000

# Upper bound on the number of steps taken within a single episode
steps_allowed = 100

# Epsilon-greedy exploration probability; training starts fully exploratory
epsilon = 1

# Rate of the exponential decay applied to the exploration probability
decay_rate = 0.001

# Lower bound of the exploration probability
min_epsilon = 0.01

# Upper bound of the exploration probability
max_epsilon = 1

# Discount factor applied to the best next-state Q-value
gamma = 0.99

# Learning rate (step size) of the Q-table update
lr = 0.1

In [60]:
# Collects the total reward obtained in each training episode
rewards_per_episode = []

In [61]:
# Tabular Q-learning: run n_episodes episodes, updating Q_table in place
# and recording the total reward of every episode.
for e in range(n_episodes):
  # Start a new episode; the first element of reset()'s result is the state
  state = env.reset()[0]
  done = False

  # Sum of the rewards the agent collects during this episode
  total_reward = 0

  for i in range(steps_allowed):
    # Epsilon-greedy strategy: draw a random number between 0 and 1;
    # below epsilon the agent explores with a random action,
    # otherwise it exploits the best action currently known for this state.
    if np.random.uniform(0,1) < epsilon:
        action = env.action_space.sample()
    else:
        action = np.argmax(Q_table[state,:])

    # The environment runs the chosen action and returns the next state,
    # a reward, and two separate end-of-episode flags:
    # `done` (terminated) and `truncated`.
    next_state, reward, done, truncated , info = env.step(action)

    # Q-learning update. Rows of terminal states are never written to
    # (the loop breaks before acting from them), so they stay at 0 and
    # add no bootstrap value when next_state is terminal.
    Q_table[state, action] = (1-lr) * Q_table[state, action] \
                       + lr*(reward + gamma* np.max(Q_table[next_state,:]))
    total_reward = total_reward + reward

    state = next_state
    # Leave the step loop when the episode has ended for either reason.
    # Stepping again after truncation without a reset is not supported.
    if done or truncated:
        break

  # Exponentially decay the exploration probability from max_epsilon
  # towards min_epsilon as the episode index grows.
  # NOTE(review): the `exploration_rate` alias is kept only so the set of
  # notebook globals is unchanged; nothing visible here reads it.
  epsilon = exploration_rate = min_epsilon + \
    (max_epsilon - min_epsilon) * np.exp(-decay_rate*e)
  rewards_per_episode.append(total_reward)

In [57]:
# Sanity check: the training loop appends one total-reward entry per episode
len(rewards_per_episode)

10000

In [62]:
# Group the per-episode rewards into consecutive, equally sized chunks
# (1000 episodes each when n_episodes rewards were recorded).
# The section count must be an integer, so use floor division; `/` gives a float.
rewards_per_thousand_episodes = np.split(
    np.array(rewards_per_episode),
    n_episodes // 1000,
)

In [63]:
# Print the mean episode reward of each chunk, labelled by the cumulative
# number of episodes (1000, 2000, ...).
count = 1000
print('-----Average reward per thousand episodes-------')
for r in rewards_per_thousand_episodes:
  # np.mean divides by the real chunk length and avoids the float noise
  # of summing pre-divided values (e.g. 0.04200000000000003).
  print(count, ':', np.mean(r))
  count += 1000

-----Average reward per thousand episodes-------
1000 : 0.04200000000000003
2000 : 0.20300000000000015
3000 : 0.3780000000000003
4000 : 0.5750000000000004
5000 : 0.6300000000000004
6000 : 0.6730000000000005
7000 : 0.6610000000000005
8000 : 0.7090000000000005
9000 : 0.6870000000000005
10000 : 0.6570000000000005


In [65]:
# Show the learned Q-values under a banner line
print('-----------Updated Q Table-----------', Q_table, sep='\n')

-----------Updated Q Table-----------
[[0.55816017 0.51326364 0.5028423  0.50604008]
 [0.27820367 0.1831768  0.39411253 0.48237345]
 [0.38661055 0.3967128  0.40376709 0.44268877]
 [0.22959311 0.34135189 0.25068877 0.43061473]
 [0.58067693 0.34620329 0.41046693 0.27101601]
 [0.         0.         0.         0.        ]
 [0.39993663 0.18053348 0.17510166 0.1748732 ]
 [0.         0.         0.         0.        ]
 [0.44370072 0.28516859 0.33825792 0.6198048 ]
 [0.38185915 0.67847138 0.41207317 0.46163916]
 [0.72810727 0.33538246 0.39559581 0.2932375 ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.52375492 0.52901824 0.75142331 0.49779007]
 [0.698204   0.90818927 0.768915   0.75351646]
 [0.         0.         0.         0.        ]]


In [67]:
import time
from IPython.display import clear_output

In [92]:
#For recording video
%pip install gymnasium[classic_control] comet_ml
import comet_ml
comet_ml.init(project_name="frozen_lake")
env = gym.wrappers.RecordVideo(env, 'gameplay video') #fdiox3CS3j2WHY7yzjJclVgbt

Collecting comet_ml
  Downloading comet_ml-3.33.9-py3-none-any.whl (559 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m559.7/559.7 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Collecting python-box<7.0.0 (from comet_ml)
  Downloading python_box-6.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m48.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting requests-toolbelt>=0.8.0 (from comet_ml)
  Downloading requests_toolbelt-1.0.0-py2.py3-none-any.whl (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.5/54.5 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Collecting semantic-version>=2.8.0 (from comet_ml)
  Downloading semantic_version-2.10.0-py2.py3-none-any.whl (15 kB)
Collecting sentry-sdk>=1.1.0 (from comet_ml)
  Downloading sentry_sdk-1.29.2-py2.py3-none-any.whl (215 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m215.6/215

In [97]:
# Replay three episodes with the greedy policy read from the learned Q-table,
# redrawing the board after every step.
for episode in range(3):
  state = env.reset()[0]
  done = False
  print('----------EPISODE:',episode+1,'---------\n\n\n\n')
  time.sleep(1)

  for step in range(steps_allowed):
    # Redraw the current board in place
    clear_output(wait=True)
    print(env.render())
    time.sleep(0.4)

    # Greedy policy: always take the highest-valued action for this state
    action = np.argmax(Q_table[state,:])
    new_state, reward, done, truncated, info = env.step(action)

    if not done:
      # Episode still running: move on to the next state
      state = new_state
      continue

    # Episode finished: show the final board and the outcome, then pause
    clear_output(wait=True)
    print(env.render())
    if reward == 1:
      print("****You reached the goal!****")
    else:
      print("****You fell through a hole!****")
    time.sleep(3)
    clear_output(wait=True)
    break

env.close()

[[[180 200 230]
  [180 200 230]
  [180 200 230]
  ...
  [180 200 230]
  [180 200 230]
  [180 200 230]]

 [[180 200 230]
  [204 230 255]
  [204 230 255]
  ...
  [204 230 255]
  [204 230 255]
  [180 200 230]]

 [[180 200 230]
  [235 245 249]
  [204 230 255]
  ...
  [204 230 255]
  [204 230 255]
  [180 200 230]]

 ...

 [[180 200 230]
  [235 245 249]
  [235 245 249]
  ...
  [204 230 255]
  [235 245 249]
  [180 200 230]]

 [[180 200 230]
  [235 245 249]
  [235 245 249]
  ...
  [204 230 255]
  [204 230 255]
  [180 200 230]]

 [[180 200 230]
  [180 200 230]
  [180 200 230]
  ...
  [180 200 230]
  [180 200 230]
  [180 200 230]]]
