In [1]:
import numpy as np
import gym
import random

env = gym.make("FrozenLake-v0")

action_size = env.action_space.n
state_size = env.observation_space.n

In [2]:
# Feel free to play with these hyperparameters

total_episodes = 15000        # Total episodes
test_episodes = 10            # Test episodes
learning_rate = 0.8           # Learning rate
max_steps = 99                # Max steps per episode
gamma = 0.95                  # Discounting rate

# Exploration parameters
epsilon = 1.0                 # Exploration rate
max_epsilon = 1.0             # Exploration probability at start
min_epsilon = 0.01            # Minimum exploration probability 
decay_rate = 0.005            # Exponential decay rate for exploration prob

In [3]:
# Initializations
qtable = np.zeros((state_size, action_size))
rewards = []

for episode in range(total_episodes):
    state = env.reset()
    step = 0
    done = False
    total_rewards = 0
    
    for step in range(max_steps):
        # Choose an action a in the current state (greedy or explore)
        
        exp_exp_tradeoff = random.uniform(0, 1)  
        # exploitation (taking the max Q value for this state)
        if exp_exp_tradeoff > epsilon:
            # Enter code here
            ## Hint: Greedily choose an action according to Q value

            ############################################################
            # Select the maximum q-value for this action.
            action = np.argmax(qtable[state, :])
            ############################################################
        # exploration
        else:
            # Enter code here
            ## Hint: Randomly choose an action
            
            ############################################################
            # Draw random action.
            action = env.action_space.sample()
            ############################################################

        # Take this action and observe
        new_state, reward, done, info = env.step(action)

        # Do a Q update
        # Enter code here
        ## Hint: One line update equation convert to one line code, start with "qtable[state, action] = ..."
        
        ################################################################
        # Use equation 21.8 in the textbook.
        # Alternative is on slide 81 in the RL lecture slides. 
        qtable[state, action] = \
            qtable[state, action] \
            + learning_rate \
            * (
                    reward + gamma * qtable[new_state, :].max()
                    - qtable[state, action]
            )
        ################################################################
        
        total_rewards += reward
        
        state = new_state
        
        if done == True: 
            break
        
    # Decay epsilon to reduce exploration as time progresses
    
    # Enter code here to assign a decay value to "decay_parameter"
    
    ## Hint: 
    ## 1. Use inbuilt polynomial, exponential(, or whatever works) functions to decay epsilon
    ## 2. "decay_parameter" is a function of "decay_rate" and "episode"
    
    ####################################################################
    decay_parameter = np.exp(-decay_rate * episode)
    ####################################################################
    
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*decay_parameter
    rewards.append(total_rewards)

print("Score over time: " +  str(sum(rewards)/total_episodes))
print("Q values:")
print(qtable)

Score over time: 0.4784
Q values:
[[1.70960736e-01 7.84656422e-02 4.03437819e-02 4.26729338e-02]
 [8.51317642e-04 1.13299422e-02 1.33404103e-02 8.34475458e-02]
 [7.08959062e-03 8.46193600e-03 1.51810364e-03 6.34448288e-02]
 [5.37631852e-02 4.81406047e-02 1.10588828e-03 5.01230669e-02]
 [3.87158952e-01 5.87905556e-02 2.35217930e-02 3.99615984e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [5.46494239e-02 1.33979318e-04 1.79462347e-06 2.94543165e-08]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [3.97133860e-02 2.90103970e-02 2.34371097e-02 5.56889079e-01]
 [3.80910044e-05 6.90375741e-01 1.84006665e-02 6.27983723e-03]
 [5.62503321e-01 2.01162548e-04 5.29689412e-04 1.49755906e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.83682534e-02 6.46716289e-03 8.79036587e-01 2.67395026e-02]
 [1.38560907e-01 9.34660548e-01 3.85060375e-01 1.47486382e-01]
 [0.00000000e+00 0.00

Q1. In short, explain why fixed "epsilon" above isn't the best choice? (Hint: You can keep epsilon fixed and see whether your reasoning explains the behavior)


Answer:

As time goes on, the algorithm moves toward the optimal policy. If we fix epsilon, our explotation/exploration ratio will be fixed in the long run. With a fixed epsilon, the whole space will be explored, but we'll take more suboptimal actions than is necessary. By decaying epsilon, we reduce exploration as time goes on, which allows us to converge to the optimal policy faster.

In [4]:
########################################################################
#################### Final policy animation ############################
########################################################################

print("We only print the last state in each episode, to see if our agent has reached the destination or fallen into a hole")
env.reset()

for episode in range(test_episodes):
    state = env.reset()
    step = 0
    done = False
    print("****************************************************")
    print("EPISODE ", episode)

    for step in range(max_steps):
        # Taking action with Q learning
        action = np.argmax(qtable[state,:])
        
        new_state, reward, done, info = env.step(action)
        
        if done:
            env.render()
            
            print("Number of steps", step)
            break
        state = new_state
env.close()

We only print the last state in each episode, to see if our agent has reached the destination or fallen into a hole
****************************************************
EPISODE  0
****************************************************
EPISODE  1
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
Number of steps 20
****************************************************
EPISODE  2
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 65
****************************************************
EPISODE  3
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 43
****************************************************
EPISODE  4
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 56
****************************************************
EPISODE  5
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 5
****************************************************
EPISODE  6
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 21
****************************************************
EPISODE  7
  (Down)
SFFF
FHFH
FFFH
HFF[41

Q2. In some episodes above, the policy isn't reaching the goal, why?

Answer:

Unlike in the "PI" code, the lake here is "slippery." In other words, there's a chance our actions don't take us where we want to go. So while the final policy is optimal, there will be times when the agent falls in a hole due to the non-deterministic nature of the frozen lake. 