<h1> Frozen Lake (Slippery Version) </h1>

In [None]:
# Import libraries
import gymnasium as gym
import numpy as np
import imageio
import tqdm
import pickle5 as pickle
import random
from tqdm.notebook import tqdm

In [None]:
# Build the FrozenLake environment: 4x4 map, slippery ice enabled,
# rendering requested in 'rgb_array' mode.
env = gym.make(
    'FrozenLake-v1',
    map_name='4x4',
    is_slippery=True,
    render_mode='rgb_array',
)

In [None]:
# Print the observation space object itself (its repr), not only a count.
print(env.observation_space)
# Print the action space's `n` attribute, i.e. its size.
print(env.action_space.n)

In [None]:
# Zero-initialised Q-table: one row per state, one column per action.
# The shape is read from the environment's spaces instead of hard-coding
# 16 x 4, so the table still matches if `env` is created with another map.
Qtable_frozenlake = np.zeros((env.observation_space.n, env.action_space.n))

In [None]:
def greedy_policy(Qtable, state):
    """Return the index of the highest-valued action for `state`.

    Looks up the row of `Qtable` belonging to `state` and returns the
    position of its largest entry as reported by `np.argmax`.
    """
    action_values = Qtable[state]
    best_action = np.argmax(action_values)
    return best_action

def epsilon_greedy_policy(Qtable, state, epsilon):
    """Choose an action for `state`, mixing exploitation and exploration.

    A number is drawn uniformly from [0, 1]. If it is greater than `epsilon`
    the greedy action from `greedy_policy` is returned; otherwise a random
    action is sampled.

    Note: the random action comes from the module-level `env`'s action
    space, not from an argument of this function.
    """
    # Exploit when the draw exceeds epsilon.
    if random.uniform(0, 1) > epsilon:
        return greedy_policy(Qtable, state)

    # Explore: sample from the global environment's action space.
    return env.action_space.sample()

In [None]:
# Hyperparameters

# Training
n_training_episodes = 500000  # number of episodes run by train()
learning_rate = 0.5           # step size of the Q-value update in train()
gamma = 0.999                 # discount applied to the next state's best Q-value

# Evaluation
n_eval_episodes = 1000        # number of episodes run by evaluate_agent()
eval_seed = []                # per-episode reset seeds; an empty list means unseeded resets

# Environment
env_id = 'FrozenLake-v1'      # same id as the one passed to gym.make
max_steps = 99                # cap on steps per episode in training and evaluation

# Exploration schedule: epsilon decays exponentially with the episode index,
# starting at max_epsilon and approaching min_epsilon.
max_epsilon = 1.0
min_epsilon = 0.05
decay_rate = 0.00075


In [None]:
def train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, env, max_steps, Qtable):
    """Run tabular Q-learning on `env` and return the updated table.

    Parameters
    ----------
    n_training_episodes : number of episodes to run.
    min_epsilon, max_epsilon, decay_rate : define the exploration rate, which
        decays exponentially with the episode index from max towards min.
    env : environment providing `reset()` and `step(action)`.
    max_steps : maximum number of steps taken in one episode.
    Qtable : table indexed as `Qtable[state][action]`; updated IN PLACE.

    Returns
    -------
    The same `Qtable` object that was passed in, after training.

    Note: `learning_rate` and `gamma` are not parameters; they are read from
    module-level variables when the update is computed.
    """
    for episode_idx in tqdm(range(n_training_episodes)):
        # Exploration rate for this episode.
        decay = np.exp(-decay_rate*episode_idx)
        epsilon = min_epsilon + (max_epsilon - min_epsilon)*decay

        state, info = env.reset()

        for _ in range(max_steps):
            action = epsilon_greedy_policy(Qtable, state, epsilon)
            next_state, reward, terminated, truncated, info = env.step(action)

            # Move Q(s, a) towards reward + gamma * max_a' Q(s', a').
            td_target = reward + gamma * np.max(Qtable[next_state])
            td_error = td_target - Qtable[state][action]
            Qtable[state][action] = Qtable[state][action] + learning_rate*td_error

            # Stop the episode as soon as the environment reports its end.
            if terminated or truncated:
                break

            state = next_state

    return Qtable

In [None]:
# Train on a copy of the initial table. train() updates the table it is given
# in place, so passing the original would make a re-run of this cell continue
# from the previously trained values instead of starting from the initial ones.
Qtable_frozenlaken = train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, env, max_steps, Qtable_frozenlake.copy())

In [None]:
def evaluate_agent(env, max_steps, n_eval_episodes, Q, seed):
    """Run the greedy policy for several episodes and summarise the rewards.

    Parameters
    ----------
    env : environment providing `reset()` and `step(action)`.
    max_steps : maximum number of steps taken in one episode.
    n_eval_episodes : number of evaluation episodes.
    Q : Q-table handed to `greedy_policy` to choose each action.
    seed : sequence of reset seeds indexed by episode number; if it is
        empty/falsy the environment is reset without a seed.

    Returns
    -------
    (mean_reward, std_reward) : mean and standard deviation of the total
    reward collected per episode.
    """
    returns_per_episode = []

    for episode in tqdm(range(n_eval_episodes)):
        # Seed the reset only when a seed sequence was supplied.
        state, info = env.reset(seed=seed[episode]) if seed else env.reset()

        episode_return = 0
        for _ in range(max_steps):
            action = greedy_policy(Q, state)
            next_state, reward, terminated, truncated, info = env.step(action)
            episode_return += reward

            if terminated or truncated:
                break
            state = next_state

        returns_per_episode.append(episode_return)

    return np.mean(returns_per_episode), np.std(returns_per_episode)
        

In [None]:
# Evaluate the trained Q-table with the greedy policy and report the
# mean and standard deviation of the per-episode reward.
mean_reward, std_reward = evaluate_agent(
    env=env,
    max_steps=max_steps,
    n_eval_episodes=n_eval_episodes,
    Q=Qtable_frozenlaken,
    seed=eval_seed,
)
print(mean_reward, std_reward)

<table>
  <tr>
    <th>learning rate</th>
    <th>decay_rate</th>
    <th>reward</th>
    <th>std</th>
  </tr>
  <tr>
    <td>0.7</td>
    <td>0.00075</td>
    <td>0.471</td>
    <td>0.416</td>
  </tr>
  <tr>
    <td>0.3</td>
    <td>0.00075</td>
    <td>0.223</td>
    <td>0.417</td>
  </tr>
  <tr>
    <td>0.5</td>
    <td>0.00075</td>
    <td>0.731</td>
    <td>0.443</td>
  </tr>
  <tr>
    <td>0.5</td>
    <td>0.0001</td>
    <td>0.655</td>
    <td>0.475</td>
  </tr>
  <tr>
    <td>0.5</td>
    <td>0.00005</td>
    <td>0.571</td>
    <td>0.493</td>
  </tr>
  <tr>
    <td>0.5</td>
    <td>0.0005</td>
    <td>0.108</td>
    <td>0.310</td>
  </tr>
</table>

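<h4>With tabular Q-learning on the 4x4 slippery map, the standard deviation of the evaluation reward does not drop below 0.4 (the only lower value, 0.310, belongs to the run with the lowest mean reward, 0.108)</h4>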