In [1]:
# imports
import gym
import numpy as np

In [2]:
# Task 0. Load the Environment
def load_frozen_lake(desc=None, map_name=None, is_slippery=False):
    """
    Build the pre-made FrozenLake environment from OpenAI's gym.

    Args:
        desc: None, or a list of lists giving a custom description of the
            map to load for the environment
        map_name: None, or a string naming the pre-made map to load
            (if both desc and map_name are None, the environment loads a
            randomly generated 8x8 map)
        is_slippery: boolean telling whether the ice is slippery

    Returns:
        the environment, created with the "ansi" render mode
    """
    # Gather the FrozenLake options first, then hand them over in one call.
    lake_options = {
        'desc': desc,
        'map_name': map_name,
        'is_slippery': is_slippery,
        'render_mode': "ansi",
    }
    return gym.make('FrozenLake-v1', **lake_options)

In [3]:
# 0-main: smoke-test load_frozen_lake with each supported configuration.
# Seed NumPy's global RNG first (presumably so the randomly generated maps
# below are reproducible -- the statement order therefore matters).
np.random.seed(0)
# Default arguments: random map, non-slippery ice.
env = load_frozen_lake()
print(env.desc)
# Transitions registered for state 0, action 0.
print(env.P[0][0])
# Random map again, this time with slippery ice.
env = load_frozen_lake(is_slippery=True)
print(env.desc)
print(env.P[0][0])
# Custom 3x3 map passed through desc.
desc = [['S', 'F', 'F'], ['F', 'H', 'H'], ['F', 'F', 'G']]
env = load_frozen_lake(desc=desc)
print(env.desc)
# Pre-made map selected by name.
env = load_frozen_lake(map_name='4x4')
print(env.desc)

[[b'S' b'F' b'F' b'F' b'F' b'F' b'F' b'H']
 [b'H' b'F' b'F' b'F' b'F' b'H' b'F' b'F']
 [b'F' b'H' b'F' b'H' b'H' b'F' b'F' b'F']
 [b'F' b'F' b'F' b'H' b'F' b'F' b'F' b'F']
 [b'F' b'F' b'F' b'F' b'F' b'F' b'H' b'F']
 [b'F' b'F' b'F' b'F' b'F' b'F' b'F' b'F']
 [b'F' b'F' b'F' b'F' b'H' b'F' b'F' b'F']
 [b'F' b'F' b'F' b'F' b'F' b'F' b'F' b'G']]
[(1.0, 0, 0.0, False)]
[[b'S' b'F' b'H' b'F' b'H' b'F' b'H' b'F']
 [b'H' b'F' b'F' b'F' b'F' b'F' b'F' b'F']
 [b'F' b'F' b'F' b'F' b'F' b'F' b'F' b'F']
 [b'F' b'H' b'F' b'F' b'F' b'F' b'F' b'F']
 [b'F' b'F' b'H' b'F' b'F' b'F' b'F' b'H']
 [b'F' b'F' b'F' b'F' b'F' b'H' b'F' b'H']
 [b'F' b'F' b'H' b'F' b'H' b'F' b'H' b'F']
 [b'F' b'F' b'H' b'F' b'F' b'F' b'F' b'G']]
[(0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 8, 0.0, True)]
[[b'S' b'F' b'F']
 [b'F' b'H' b'H']
 [b'F' b'F' b'G']]
[[b'S' b'F' b'F' b'F']
 [b'F' b'H' b'F' b'H']
 [b'F' b'F' b'F' b'H']
 [b'H' b'F' b'F' b'G']]


In [4]:
# Initialize Q-table
def q_init(env):
    """
    Create an all-zero Q-table sized for the given environment.

    Args:
        env: The FrozenLakeEnv instance; its observation_space.n and
            action_space.n give the table dimensions
    Returns:
        numpy.ndarray of zeros with shape (number of states,
        number of actions)
    """
    table_shape = (env.observation_space.n, env.action_space.n)
    return np.zeros(table_shape)

In [5]:
# 1-main: print the Q-table shape for each environment configuration.
desc = [['S', 'F', 'F'], ['F', 'H', 'H'], ['F', 'F', 'G']]
# Same order as before: random map, slippery random map, custom 3x3, 4x4.
lake_configs = (
    {},
    {'is_slippery': True},
    {'desc': desc},
    {'map_name': '4x4'},
)
for lake_config in lake_configs:
    env = load_frozen_lake(**lake_config)
    Q = q_init(env)
    print(Q.shape)

(64, 4)
(64, 4)
(9, 4)
(16, 4)


In [6]:
# Task 2. Epsilon Greedy
def epsilon_greedy(Q, state, epsilon):
    """
    Function that uses epsilon-greedy to determine the next action
    Args:
        Q: numpy.ndarray shape(state, action) containing the q-table
        state: current state (row index into Q)
        epsilon: epsilon to use for the calculation; the agent explores
            when the sampled probability is strictly below it

    Samples p with numpy.random.uniform to determine if the algorithm
        should explore or exploit

    If exploring, picks the next action with numpy.random.randint from
        all possible actions, i.e. every column of Q (the action count is
        read from Q rather than assumed to be 4)

    If exploiting, picks the action with the highest Q-value for state
        (numpy.argmax returns the first index on ties)

    Returns:
        the next action index
    """
    p = np.random.uniform()
    # Explore: uniform choice over however many actions the table holds
    if p < epsilon:
        nb_actions = Q.shape[1]
        return np.random.randint(0, nb_actions)
    # Exploit
    return np.argmax(Q[state])

In [7]:
# 2-main: query epsilon_greedy on the custom 3x3 map under two seeds.
desc = [['S', 'F', 'F'], ['F', 'H', 'H'], ['F', 'F', 'G']]
env = load_frozen_lake(desc=desc)
Q = q_init(env)
# Give state 7 distinct action values so the greedy pick is unambiguous.
Q[7] = np.array([0.5, 0.7, 1, -1])
for seed in (0, 1):
    np.random.seed(seed)
    print(epsilon_greedy(Q, 7, 0.5))

2
0


In [8]:
# Task 3. Q-learning
def train(env, Q, episodes=5000, max_steps=100, alpha=0.1, gamma=0.99,
          epsilon=1, min_epsilon=0.1, epsilon_decay=0.05):
    """
    Function that performs Q-learning
    Args:
        env: the FrozenLakeEnv instance
        Q: np.ndarray shape(state, action) containing the Q-table
            (updated in place)
        episodes: the total number of episodes to train over
        max_steps: maximum number of steps per episode
        alpha: learning rate
        gamma: discount rate
        epsilon: the initial threshold for epsilon greedy
        min_epsilon: the minimum value that epsilon should decay to
        epsilon_decay: decay rate for updating epsilon between episodes
    When the agent falls in a hole (episode terminated with a reward of 0),
        the reward is updated to -1
    An episode stops as soon as the environment reports it as terminated or
        truncated, or after max_steps steps
    Returns:
        Q: the updated Q-table
        total_rewards: list containing the rewards per episode
    """
    # Remember the starting value so the decay schedule honours the
    # caller's epsilon instead of assuming it is 1.
    initial_epsilon = epsilon
    total_rewards = []

    # Loop through each episode
    for episode in range(episodes):
        # Reset all of our episode values
        state, _ = env.reset()
        episode_reward = 0

        # Loop through steps (stop at max_steps if we didn't fall or succeed)
        for _ in range(max_steps):
            action = epsilon_greedy(Q, state, epsilon)

            new_state, reward, terminated, truncated, _ = env.step(action)

            if terminated:
                if reward == 1.0:
                    # Reached the goal
                    episode_reward += reward
                elif reward == 0.0:
                    # Fell in a hole: penalise it
                    reward = -1
                    episode_reward += reward

            # Bellman Equation. A terminal state has no future return, so
            # nothing is bootstrapped from it.
            future_value = 0.0 if terminated else np.max(Q[new_state])
            Q[state, action] = ((1 - alpha) * Q[state, action]
                                + alpha * (reward + gamma * future_value))

            state = new_state

            # Stepping an environment after it ended the episode is not
            # valid, whether it ended by termination or by truncation.
            if terminated or truncated:
                break

        # Exponential decay from the initial epsilon towards min_epsilon
        epsilon = (min_epsilon + (initial_epsilon - min_epsilon)
                   * np.exp(-epsilon_decay * episode))
        total_rewards.append(episode_reward)

    return Q, total_rewards


In [9]:
# 3-main: train on the custom 3x3 map and report the learning progress.
# Seed NumPy's global RNG before anything else; epsilon_greedy draws from
# it, so the statement order here affects the printed results.
np.random.seed(0)
desc = [['S', 'F', 'F'], ['F', 'H', 'H'], ['F', 'F', 'G']]
env = load_frozen_lake(desc=desc)
Q = q_init(env)

# Run Q-learning with the default hyper-parameters (5000 episodes).
Q, total_rewards  = train(env, Q)
print(Q)
# Split the per-episode rewards into 10 equal chunks (500 episodes each
# with the default episode count) and print the mean reward of each chunk.
split_rewards = np.split(np.array(total_rewards), 10)
for i, rewards in enumerate(split_rewards):
    print((i+1) * 500, ':', np.mean(rewards))

  if not isinstance(terminated, (bool, np.bool8)):


[[ 0.96059593  0.970299    0.95098488  0.96059396]
 [ 0.96059557 -0.77123208  0.0094072   0.37627228]
 [ 0.18061285 -0.1         0.          0.        ]
 [ 0.97029877  0.9801     -0.99999988  0.96059583]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.98009763  0.98009933  0.99        0.9702983 ]
 [ 0.98009922  0.98999782  1.         -0.99999952]
 [ 0.          0.          0.          0.        ]]
500 : 0.812
1000 : 0.88
1500 : 0.9
2000 : 0.9
2500 : 0.88
3000 : 0.844
3500 : 0.892
4000 : 0.896
4500 : 0.852
5000 : 0.928


In [14]:
# Task 4. Play
def play(env, Q, max_steps=100):
    """
    Function that has the trained agent play an episode
    Args:
        env: FrozenLakeEnv instance
        Q: numpy.ndarray shape (state, action) containing the trained Q-table
        max_steps: Maximum number of steps in the episode
    Each state of the board is displayed via the console
    The Q-table is always exploited (greedy action, no exploration)
    The episode stops when the environment reports it as terminated or
        truncated, or after max_steps steps
    Returns the total rewards for the episode
    """
    total_reward = 0
    state, _ = env.reset()
    # Show the starting board before any move is made
    print(env.render(), end='')
    for _ in range(max_steps):
        action = np.argmax(Q[state])
        state, reward, terminated, truncated, _ = env.step(action)
        total_reward += reward
        print(env.render(), end='')
        # Do not keep stepping an environment that has ended the episode,
        # whether by termination or by truncation.
        if terminated or truncated:
            break
    return total_reward
        

In [15]:
# 4-main: train on the custom 3x3 map, then replay one greedy episode.
# Seed NumPy's global RNG first; training draws from it, so the order of
# these statements affects the learned table.
np.random.seed(0)
desc = [['S', 'F', 'F'], ['F', 'H', 'H'], ['F', 'F', 'G']]
env = load_frozen_lake(desc=desc)
Q = q_init(env)

Q, total_rewards  = train(env, Q)
# play renders every board state and returns the episode's total reward.
print(play(env, Q))

  if not isinstance(terminated, (bool, np.bool8)):



[41mS[0mFF
FHH
FFG
  (Down)
SFF
[41mF[0mHH
FFG
  (Down)
SFF
FHH
[41mF[0mFG
  (Right)
SFF
FHH
F[41mF[0mG
  (Right)
SFF
FHH
FF[41mG[0m
1.0
