# Q-Learning with FrozenLake-v1 ⛄ and Taxi-v3 🚕

In this notebook we're going to code the Q-Learning algorithm from scratch and experiment with different configurations.

- This notebook is part of the hands-on for Unit 2 of the Hugging Face [Deep RL course](https://huggingface.co/deep-rl-course/unit2/introduction?fw=pt)

### 🎮 Environments: 

- [FrozenLake-v1](https://www.gymlibrary.dev/environments/toy_text/frozen_lake/)
- [Taxi-v3](https://www.gymlibrary.dev/environments/toy_text/taxi/)


## Libraries

In [3]:
# Install the Python packages used by this notebook (-q keeps pip output short).
# gym, pyyaml and pyglet are pinned to exact versions; the rest install whatever is current.
!pip install -q gym==0.24
!pip install -q pygame
!pip install -q numpy

# NOTE(review): pickle5 and pyyaml are not imported by any visible cell — confirm they are still needed.
!pip install -q pickle5
!pip install -q pyyaml==6.0
!pip install -q imageio
!pip install -q imageio_ffmpeg
!pip install -q pyglet==1.5.1
!pip install -q tqdm

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/694.4 KB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m686.1/694.4 KB[0m [31m24.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m694.4/694.4 KB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for gym (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.8/21.8 MB[0m [31m85.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.1/132.1 KB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for pickle5 (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━

In [4]:
%%capture
# Refresh the apt index, install the system packages (OpenGL bindings, ffmpeg, Xvfb),
# then install pyvirtualdisplay, which the virtual-display cell imports.
# NOTE(review): the `apt install` line has neither `sudo` nor `-y`, unlike `apt-get update`
# just before it — confirm it does not prompt or fail in your runtime.
!sudo apt-get update
!apt install python-opengl ffmpeg xvfb
!pip3 install pyvirtualdisplay

In [None]:
# Send signal 9 to this kernel's own process, which terminates it immediately.
# NOTE(review): presumably this forces a runtime restart so the freshly installed
# packages are picked up — confirm. Cells after this one must be run again once the kernel is back.
import os
os.kill(os.getpid(), 9)

In [1]:
# Start an invisible 1400x900 virtual display for the rendering calls further down
from pyvirtualdisplay import Display

virtual_display = Display(
    size=(1400, 900),
    visible=0,
)
virtual_display.start()

<pyvirtualdisplay.display.Display at 0x7fbfda769a30>

In [2]:
import numpy as np
import gym
import random
import imageio
import tqdm

from tqdm.notebook import tqdm 



## Q-Learning class

In [26]:
class Q_Learning():
  """Tabular Q-Learning agent for environments with discrete states and actions.

  The agent owns a Q-table of shape (n_states, n_actions) initialised to zeros.
  ``train`` fills it using an epsilon-greedy behaviour policy and ``evaluate``
  scores the resulting greedy policy.
  """

  def __init__(self, env, learning_rate = 0.7, gamma = 0.95):
    """
    :param env: environment exposing ``observation_space.n``, ``action_space.n``,
                ``action_space.sample()``, ``reset()`` and ``step(action)``
    :param learning_rate: step size applied to every temporal-difference update
    :param gamma: discount factor applied to the value of the next state
    """
    self.env = env
    self.state_space = env.observation_space.n
    self.action_space = env.action_space.n
    self.Q_table = np.zeros((self.state_space, self.action_space))

    # Training parameters
    self.learning_rate = learning_rate       # Learning rate
    self.gamma = gamma                       # discount factor

  def greedy_policy(self, Q_table, state):
    """Return the index of the highest-valued action in ``Q_table[state]`` (first one on ties)."""
    return np.argmax(Q_table[state])

  def e_greedy_policy(self, Q_table, state, epsilon):
    """Exploit with the greedy action when a uniform draw exceeds ``epsilon``, otherwise sample a random action."""
    if np.random.uniform(0, 1) > epsilon:
      return self.greedy_policy(Q_table, state)
    return self.env.action_space.sample()

  def train(self, n_training_episodes, min_epsilon, max_epsilon, decay_rate,  max_steps):
    """
    Run Q-Learning for ``n_training_episodes`` episodes, updating ``self.Q_table`` in place.
    :param n_training_episodes: Number of episodes to train for
    :param min_epsilon: Exploration probability the schedule decays towards
    :param max_epsilon: Exploration probability used for the first episode
    :param decay_rate: Exponential decay rate of epsilon, applied per episode
    :param max_steps: Maximum number of steps taken in one episode
    :return: The agent's Q-table (the same array object as ``self.Q_table``)
    """
    print("Starting to train agent ...")
    for episode in tqdm(range(n_training_episodes)):
      # Reduce epsilon (because we need less and less exploration)
      epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)
      state = self.env.reset()

      for _ in range(max_steps):
        # Choose the action At using epsilon greedy policy
        action = self.e_greedy_policy(self.Q_table, state, epsilon)

        # Take action At and observe Rt+1 and St+1
        new_state, reward, done, info = self.env.step(action)

        # Update Q(s,a) := Q(s,a) + lr [R(s,a) + gamma * max_a' Q(s',a') - Q(s,a)]
        td_target = reward + self.gamma * np.max(self.Q_table[new_state])
        self.Q_table[state][action] += self.learning_rate * (td_target - self.Q_table[state][action])

        if done:
          break

        state = new_state

    return self.Q_table


  def evaluate(self, max_steps, n_eval_episodes, seed):
    """
    Evaluate the greedy policy for ``n_eval_episodes`` episodes and return the mean and std of the episode rewards.
    :param max_steps: Maximum number of steps taken in one episode
    :param n_eval_episodes: Number of episodes to evaluate the agent
    :param seed: Sequence with one reset seed per episode (for taxi-v3);
                 ``None`` or an empty sequence resets the environment without a seed
    :return: Tuple ``(mean_reward, std_reward)``
    :raises IndexError: If ``seed`` is non-empty but holds fewer than ``n_eval_episodes`` entries
    """
    print("Starting to evaluate agent ...")
    # Explicit length check so NumPy arrays work too (their truth value is ambiguous).
    use_seeds = seed is not None and len(seed) > 0
    episode_rewards = []
    for episode in tqdm(range(n_eval_episodes)):
      if use_seeds:
        # Cast to a built-in int so NumPy integer seeds are passed on as plain ints.
        state = self.env.reset(seed=int(seed[episode]))
      else:
        state = self.env.reset()
      total_rewards_ep = 0

      for _ in range(max_steps):
        # Take the action (index) that have the maximum expected future reward given that state
        action = self.greedy_policy(self.Q_table, state)
        new_state, reward, done, info = self.env.step(action)
        total_rewards_ep += reward

        if done:
          break
        state = new_state
      episode_rewards.append(total_rewards_ep)

    mean_reward = np.mean(episode_rewards)
    std_reward = np.std(episode_rewards)

    return mean_reward, std_reward


## Function for rendering a replay video of the trained agent

In [4]:
def record_video(env, Qtable, out_directory, fps=1):
  """
  Roll out one greedy episode and save the rendered frames as a video.
  :param env: environment to roll out; it is reset with a random seed drawn from 0..500
  :param Qtable: Q-table of our agent, indexed as Qtable[state]
  :param out_directory: output path handed to imageio.mimsave
  :param fps: frames per second of the video (with taxi-v3 and frozenlake-v1 we use 1)
  """
  state = env.reset(seed=random.randint(0,500))
  # First frame shows the start state, before any action is taken
  frames = [env.render(mode='rgb_array')]
  finished = False
  while not finished:
    # Always follow the highest-valued action for the current state
    best_action = np.argmax(Qtable[state])
    # The next state overwrites `state` directly; nothing else is kept between steps
    state, _reward, finished, _info = env.step(best_action)
    frames.append(env.render(mode='rgb_array'))
  imageio.mimsave(out_directory, [np.array(frame) for frame in frames], fps=fps)

## Q-Learning in Action

### Frozen Lake

#### Frozen Lake, 4x4, non-slippery


In [56]:
# Episode budget
n_training_episodes = 10_000   # episodes used for training
n_eval_episodes = 100          # episodes used for evaluation
max_steps = 99                 # step cap per episode

# Environment
gamma = 0.95                   # discount factor
eval_seed = []                 # evaluation seeds; empty list means none are supplied

# Epsilon schedule
max_epsilon = 1.0              # exploration probability at the start
min_epsilon = 0.05             # lowest exploration probability
decay_rate = 5e-4              # exponential decay rate of the exploration probability

In [57]:
# Deterministic 4x4 FrozenLake
env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=False)
# Pass the configured discount factor explicitly instead of relying on the
# constructor default happening to have the same value.
Q_Learning_agent = Q_Learning(env, gamma=gamma)
# Train our Agent
Q_table = Q_Learning_agent.train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, max_steps)
# Evaluate our Agent
mean_reward, std_reward = Q_Learning_agent.evaluate(max_steps, n_eval_episodes, eval_seed)
print(f"Mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")

Starting to train agent ...


  0%|          | 0/10000 [00:00<?, ?it/s]

Starting to evaluate agent ...


  0%|          | 0/100 [00:00<?, ?it/s]

Mean_reward=1.00 +/- 0.00


In [7]:
# Save one greedy rollout of the trained agent as a video
record_video(env, Q_table, out_directory="frozen_4x4_noSlippery_render.mp4", fps=1)

#### Frozen Lake, 4x4, slippery

In [110]:
# Episode budget
n_training_episodes = 2_000    # episodes used for training
n_eval_episodes = 100          # episodes used for evaluation
max_steps = 99                 # step cap per episode

# Learning
learning_rate = 0.25           # step size of each Q-table update
gamma = 0.99                   # discount factor
eval_seed = []                 # evaluation seeds; empty list means none are supplied

# Epsilon schedule
max_epsilon = 1.0              # exploration probability at the start
min_epsilon = 0.05             # lowest exploration probability
decay_rate = 1e-3              # exponential decay rate of the exploration probability



In [111]:
# Slippery 4x4 FrozenLake with an agent built from the configured hyperparameters
env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=True)
Q_Learning_agent = Q_Learning(env, learning_rate=learning_rate, gamma=gamma)

# Fit the Q-table
Q_table = Q_Learning_agent.train(
    n_training_episodes, min_epsilon, max_epsilon, decay_rate, max_steps
)

# Score the greedy policy and report mean +/- std of the episode rewards
mean_reward, std_reward = Q_Learning_agent.evaluate(
    max_steps, n_eval_episodes, eval_seed
)
print(f"Mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")

Starting to train agent ...


  0%|          | 0/2000 [00:00<?, ?it/s]

Starting to evaluate agent ...


  0%|          | 0/100 [00:00<?, ?it/s]

Mean_reward=0.78 +/- 0.41


In [113]:
# Save one greedy rollout of the trained agent as a video
record_video(env, Q_table, out_directory="frozen_4x4_Slippery_render.mp4", fps=1)

#### Frozen Lake, 8x8, non-slippery

In [114]:
# Episode budget
n_training_episodes = 10_000   # episodes used for training
n_eval_episodes = 100          # episodes used for evaluation
max_steps = 299                # step cap per episode

# Learning
learning_rate = 0.7            # step size of each Q-table update
gamma = 0.95                   # discount factor
eval_seed = []                 # evaluation seeds; empty list means none are supplied

# Epsilon schedule
max_epsilon = 1.0              # exploration probability at the start
min_epsilon = 0.05             # lowest exploration probability
decay_rate = 5e-5              # exponential decay rate of the exploration probability

In [115]:
# Deterministic 8x8 FrozenLake
env = gym.make("FrozenLake-v1", map_name="8x8", is_slippery=False)
# Pass the configured hyperparameters explicitly; previously they were ignored and the
# run only matched the configuration because the constructor defaults have the same values.
Q_Learning_agent = Q_Learning(env, learning_rate=learning_rate, gamma=gamma)
# Train our Agent
Q_table = Q_Learning_agent.train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, max_steps)
# Evaluate our Agent
mean_reward, std_reward = Q_Learning_agent.evaluate(max_steps, n_eval_episodes, eval_seed)
print(f"Mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")


Starting to train agent ...


  0%|          | 0/10000 [00:00<?, ?it/s]

Starting to evaluate agent ...


  0%|          | 0/100 [00:00<?, ?it/s]

Mean_reward=1.00 +/- 0.00


In [116]:
# Save one greedy rollout of the trained agent as a video
record_video(env, Q_table, out_directory="frozen_8x8_noSlippery_render.mp4", fps=1)

#### Frozen Lake, 8x8, slippery

In [193]:
# Episode budget
n_training_episodes = 20_000   # episodes used for training
n_eval_episodes = 100          # episodes used for evaluation
max_steps = 99                 # step cap per episode

# Learning
learning_rate = 0.35           # step size of each Q-table update
gamma = 0.99                   # discount factor
eval_seed = []                 # evaluation seeds; empty list means none are supplied

# Epsilon schedule
max_epsilon = 0.8              # exploration probability at the start
min_epsilon = 0.15             # lowest exploration probability
decay_rate = 1e-5              # exponential decay rate of the exploration probability

In [194]:
# Slippery 8x8 FrozenLake
env = gym.make("FrozenLake-v1", map_name="8x8", is_slippery=True)
# gamma must be passed explicitly: without it the agent silently falls back to the
# constructor default (0.95) instead of the discount factor configured for this run.
Q_Learning_agent = Q_Learning(env, learning_rate=learning_rate, gamma=gamma)
# Train our Agent
Q_table = Q_Learning_agent.train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, max_steps)
# Evaluate our Agent
mean_reward, std_reward = Q_Learning_agent.evaluate(max_steps, n_eval_episodes, eval_seed)
print(f"Mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")


Starting to train agent ...


  0%|          | 0/20000 [00:00<?, ?it/s]

Starting to evaluate agent ...


  0%|          | 0/100 [00:00<?, ?it/s]

Mean_reward=0.34 +/- 0.47


In [197]:
# Save one greedy rollout of the trained agent as a video
record_video(env, Q_table, out_directory="frozen_8x8_Slippery_render.mp4", fps=1)

### Taxi

In [198]:
# Environment
env_id = "Taxi-v3"             # name of the environment

# Episode budget
n_training_episodes = 10_000   # episodes used for training
n_eval_episodes = 100          # episodes used for evaluation
max_steps = 99                 # step cap per episode

# Learning
learning_rate = 0.05           # step size of each Q-table update
gamma = 0.95                   # discount factor

# Epsilon schedule
max_epsilon = 1.0              # exploration probability at the start
min_epsilon = 0.05             # lowest exploration probability
decay_rate = 1e-4              # exponential decay rate of the exploration probability

# DO NOT MODIFY EVAL_SEED
# One seed per evaluation episode. Each seed fixes a specific starting state, so every
# classmate's agent is evaluated on the same taxi starting positions.
eval_seed = [
    16, 54, 165, 177, 191, 191, 120, 80, 149, 178,
    48, 38, 6, 125, 174, 73, 50, 172, 100, 148,
    146, 6, 25, 40, 68, 148, 49, 167, 9, 97,
    164, 176, 61, 7, 54, 55, 161, 131, 184, 51,
    170, 12, 120, 113, 95, 126, 51, 98, 36, 135,
    54, 82, 45, 95, 89, 59, 95, 124, 9, 113,
    58, 85, 51, 134, 121, 169, 105, 21, 30, 11,
    50, 65, 12, 43, 82, 145, 152, 97, 106, 55,
    31, 85, 38, 112, 102, 168, 123, 97, 21, 83,
    158, 26, 80, 63, 5, 81, 32, 11, 28, 148,
]


In [199]:
# Build the environment from the configured name rather than a second hard-coded copy,
# so the configuration cell stays the single place to change it.
env = gym.make(env_id)
Q_Learning_agent = Q_Learning(env, learning_rate = learning_rate, gamma = gamma)
# Train our Agent
Q_table = Q_Learning_agent.train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, max_steps)
# Evaluate our Agent on the fixed per-episode seeds
mean_reward, std_reward = Q_Learning_agent.evaluate(max_steps, n_eval_episodes, eval_seed)
print(f"Mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")

Starting to train agent ...


  0%|          | 0/10000 [00:00<?, ?it/s]

Starting to evaluate agent ...


  0%|          | 0/100 [00:00<?, ?it/s]

Mean_reward=7.56 +/- 2.71


In [200]:
# Save one greedy rollout of the trained agent as a video
record_video(env, Q_table, out_directory="taxi_render.mp4", fps=1)

