In [1]:
# Imports
import gym
import numpy as np

In [2]:
# Task 0 Monte Carlo
def run_episode(env, max_steps, policy):
    """
    Runs an episode of the environment
    Args:
        env: the openAI environment instance
        max_steps: maximum number of steps per episode

    Return:
        episode_results: np.ndarray of integers shape (state, reward)
    """
    state = env.reset()
    episode_results = []

    # Run each episode until we reach max_steps
    for step in range(max_steps):
        action = policy(state)
        next_state, reward, done, _ = env.step(action)
        episode_results.append([state, reward])
        if done:
            break

        state = next_state

    return np.array(episode_results, dtype=int)


def monte_carlo(env, V, policy, episodes=5000, max_steps=100, alpha=0.1,
                gamma=0.99):
    """
    Args:
        env: the openAI environment instance
        V: np.ndarray shape (s,) containing the value estimate
        policy: a function that takes in a state and returns the next action
        episodes: total number of episodes to train over
        max_steps: maximum number of steps per episode
        alpha: learning rate
        gamma: discount rate

    Returns:
        V: the updated value estimate
    """
    # Loop through our episodes
    for episode in range(episodes):
        cumulative_reward = 0
        episode_results = run_episode(env, max_steps, policy)
        # Perform Monte Carlo Algorithm from finish to start
        for time in reversed(range(0, len(episode_results))):
            state, reward = episode_results[time]
            cumulative_reward = gamma * cumulative_reward + reward
            if state not in episode_results[:episode, 0]:
                V[state] = V[state] + alpha * (cumulative_reward - V[state])

    return V

In [3]:
# 0-main
np.random.seed(0)

env = gym.make('FrozenLake8x8-v0')
LEFT, DOWN, RIGHT, UP = 0, 1, 2, 3

def policy(s):
    p = np.random.uniform()
    if p > 0.5:
        if s % 8 != 7 and env.desc[s // 8, s % 8 + 1] != b'H':
            return RIGHT
        elif s // 8 != 7 and env.desc[s // 8 + 1, s % 8] != b'H':
            return DOWN
        elif s // 8 != 0 and env.desc[s // 8 - 1, s % 8] != b'H':
            return UP
        else:
            return LEFT
    else:
        if s // 8 != 7 and env.desc[s // 8 + 1, s % 8] != b'H':
            return DOWN
        elif s % 8 != 7 and env.desc[s // 8, s % 8 + 1] != b'H':
            return RIGHT
        elif s % 8 != 0 and env.desc[s // 8, s % 8 - 1] != b'H':
            return LEFT
        else:
            return UP

V = np.where(env.desc == b'H', -1, 1).reshape(64).astype('float64')
np.set_printoptions(precision=4)
env.seed(0)
print(monte_carlo(env, V, policy).reshape((8, 8)))

[[ 0.81    0.9     0.4783  0.4305  0.3874  0.4305  0.6561  0.9   ]
 [ 0.9     0.729   0.5905  0.4783  0.5905  0.2824  0.2824  0.3874]
 [ 1.      0.5314  0.729  -1.      1.      0.3874  0.2824  0.4305]
 [ 1.      0.5905  0.81    0.9     1.     -1.      0.3874  0.6561]
 [ 1.      0.6561  0.81   -1.      1.      1.      0.729   0.5314]
 [ 1.     -1.     -1.      1.      1.      1.     -1.      0.9   ]
 [ 1.     -1.      1.      1.     -1.      1.     -1.      1.    ]
 [ 1.      1.      1.     -1.      1.      1.      1.      1.    ]]


In [4]:
# Task 1 - TD(λ)
def td_lambtha(env, V, policy, lambtha, episodes=5000, max_steps=100,
               alpha=0.1, gamma=0.99):
    """
    Args:
        env: the openAI environment instance
        V: numpy.ndarray of shape (s,) containing the value estimate
        policy: function that takes in state and returns  next action to take
        lambtha: eligibility trace factor
        episodes: total number of episodes to train over
        max_steps: maximum number of steps per episode
        alpha: learning rate
        gamma: discount rate
    Returns:
        V: the updated value estimate
    """
    eligibility_trace = np.zeros_like(V)
    for episode in range(episodes):
        state = env.reset()
        env.seed(0)

        for step in range(max_steps):
            action = policy(state)
            next_state, reward, done, _ = env.step(action)

            # TD error
            delta = reward + (gamma * V[next_state] - V[state])

            # Update eligibility trace
            eligibility_trace *= (gamma * lambtha)
            eligibility_trace[state] += 1

            # Update value estimate
            V += delta * alpha * eligibility_trace

            state = next_state

            if done or step > max_steps:
                break

    return V

In [10]:
# 1-main
np.random.seed(0)

env = gym.make('FrozenLake8x8-v0')
LEFT, DOWN, RIGHT, UP = 0, 1, 2, 3

def policy(s):
    p = np.random.uniform()
    if p > 0.5:
        if s % 8 != 7 and env.desc[s // 8, s % 8 + 1] != b'H':
            return RIGHT
        elif s // 8 != 7 and env.desc[s // 8 + 1, s % 8] != b'H':
            return DOWN
        elif s // 8 != 0 and env.desc[s // 8 - 1, s % 8] != b'H':
            return UP
        else:
            return LEFT
    else:
        if s // 8 != 7 and env.desc[s // 8 + 1, s % 8] != b'H':
            return DOWN
        elif s % 8 != 7 and env.desc[s // 8, s % 8 + 1] != b'H':
            return RIGHT
        elif s % 8 != 0 and env.desc[s // 8, s % 8 - 1] != b'H':
            return LEFT
        else:
            return UP

V = np.where(env.desc == b'H', -1, 1).reshape(64).astype('float64')
np.set_printoptions(precision=4)
print(td_lambtha(env, V, policy, 0.9).reshape((8, 8)))


[[-0.8455 -0.8419 -0.8327 -0.7606 -0.7426 -0.7286 -0.5796 -0.7167]
 [-0.8805 -0.8791 -0.879  -0.82   -0.777  -0.736  -0.6973 -0.6678]
 [-0.8993 -0.9341 -0.9593 -1.     -0.8285 -0.8098 -0.8266 -0.8228]
 [-0.93   -0.9498 -0.953  -0.9714 -0.8712 -1.     -0.8732 -0.86  ]
 [-0.948  -0.9685 -0.9772 -1.     -0.6707 -0.7231 -0.9086 -0.8171]
 [-0.9195 -1.     -1.      0.2853 -0.7859 -0.6773 -1.     -0.4291]
 [-0.9397 -1.     -0.4064 -0.1074 -1.     -0.2633 -1.     -0.1985]
 [-0.8736 -0.9177 -0.7974 -1.      1.      0.354   0.9571  1.    ]]


In [6]:
# Task 2 Sarsa(λ)
def sarsa_lambtha(env, Q, lambtha, episodes=5000, max_steps=100, alpha=0.1,
                  gamma=0.99, epsilon=1, min_epsilon=0.1, epsilon_decay=0.05):
    """
    Args:
        env: the openAI environment instance
        Q: numpy.ndarray of shape (s,a) containing the Q table
        lambtha: eligibility trace factor
        episodes: total number of episodes to train over
        max_steps: maximum number of steps per episode
        alpha: learning rate
        gamma: discount rate
        epsilon: initial threshold for epsilon greedy
        min_epsilon: minimum value that epsilon should decay to
        epsilon_decay: decay rate for updating epsilon between episodes

    Returns:
        Q: the updated Q table
    """
    eligibility_trace = np.zeros_like(Q)
    for episode in range(episodes):
        state = env.reset()
        episode_done = False

        action = epsilon_greedy(Q, state, epsilon)

        epsilon = max(min_epsilon, epsilon - epsilon_decay)

        for step in range(max_steps):
            n_s, reward, episode_done, _ = env.step(action)
            n_a = epsilon_greedy(Q, n_s, epsilon)

            # Calculate TD error
            td_error = reward + gamma * Q[n_s][n_a] - Q[state][action]

            # Update eligibility trace
            eligibility_trace *= lambtha * gamma
            eligibility_trace[state][action] = 1.0

            # Update Q values
            Q += alpha * td_error * eligibility_trace

            state = n_s
            action = n_a

            if episode_done:
                break

    return Q


def epsilon_greedy(Q, state, epsilon):
    if np.random.uniform(0, 1) < epsilon:
        return np.random.choice(len(Q[state]))
    else:
        return np.argmax(Q[state])

In [7]:
#2-main
np.random.seed(0)
env = gym.make('FrozenLake8x8-v0')
Q = np.random.uniform(size=(64, 4))
np.set_printoptions(precision=4)
print(sarsa_lambtha(env, Q, 0.9))

[[0.6262 0.6306 0.6409 0.6374]
 [0.6244 0.5979 0.6378 0.6168]
 [0.6409 0.6011 0.6005 0.598 ]
 [0.6266 0.6337 0.6826 0.6291]
 [0.6694 0.696  0.6841 0.6533]
 [0.6883 0.7174 0.683  0.7012]
 [0.566  0.7253 0.6443 0.6436]
 [0.7239 0.6588 0.5858 0.6408]
 [0.6305 0.6866 0.606  0.6411]
 [0.6003 0.6491 0.6356 0.6121]
 [0.6106 0.684  0.6148 0.6313]
 [0.4618 0.5625 0.5724 0.7058]
 [0.6766 0.662  0.7325 0.6508]
 [0.7068 0.7574 0.7088 0.6908]
 [0.7501 0.6216 0.7347 0.5917]
 [0.3948 0.4392 0.7269 0.452 ]
 [0.692  0.6922 0.6895 0.6698]
 [0.6548 0.6311 0.6104 0.6594]
 [0.6931 0.5376 0.5318 0.4434]
 [0.2828 0.1202 0.2961 0.1187]
 [0.5979 0.5127 0.779  0.5929]
 [0.7632 0.8006 0.7593 0.665 ]
 [0.7541 0.8298 0.7109 0.7564]
 [0.5809 0.8178 0.6943 0.68  ]
 [0.7085 0.7075 0.7249 0.6506]
 [0.6982 0.6414 0.6864 0.6936]
 [0.7303 0.6951 0.6783 0.6749]
 [0.6479 0.793  0.5564 0.6667]
 [0.7637 0.7129 0.7753 0.8138]
 [0.8811 0.5813 0.8817 0.6925]
 [0.8722 0.734  0.763  0.7765]
 [0.7623 0.8662 0.6942 0.6584]
 [0.7327