In [18]:
import numpy as np
import gym
import random

#env = gym.make("FrozenLake-v0")


from gym.envs.registration import register

register(
    id='D4x4-FrozenLake-v0',
    entry_point='gym.envs.toy_text.frozen_lake:FrozenLakeEnv',
    kwargs={'map_name': '8x8',
            'is_slippery': True})

env = gym.make('D4x4-FrozenLake-v0', desc=None, map_name=None)

action_size = env.action_space.n
state_size = env.observation_space.n


In [3]:
# Feel free to play with these hyperparameters

total_episodes = 15000        # Total episodes
test_episodes = 10            # Test episodes
learning_rate = 0.8           # Learning rate
max_steps = 99                # Max steps per episode
gamma = 0.96                  # Discounting rate

# Exploration parameters
epsilon = 1.0                 # Exploration rate
max_epsilon = 1.0             # Exploration probability at start
min_epsilon = 0.01            # Minimum exploration probability 
decay_rate = 0.005            # Exponential decay rate for exploration prob

In [19]:
# Initializations
qtable = np.zeros((state_size, action_size))
rewards = []

for episode in range(total_episodes):
    state = env.reset()
    step = 0
    done = False
    total_rewards = 0
    
    for step in range(max_steps):
        # Choose an action a in the current state (greedy or explore)
        
        exp_exp_tradeoff = random.uniform(0, 1)  
        # exploitation (taking the max Q value for this state)
        if exp_exp_tradeoff > epsilon:
            action = np.argmax(qtable[state,:])

        # exploration
        else:
            action = env.action_space.sample()

        # Take this action and observe
        new_state, reward, done, info = env.step(action)

        # Do a Q update
        qtable[state, action] = qtable[state, action] + learning_rate * (reward + gamma * np.max(qtable[new_state, :]) - qtable[state, action])
        
        total_rewards += reward
        
        state = new_state
        
        if done == True: 
            break
        
    # Decay epsilon to reduce exploration as time progresses
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode) 
    rewards.append(total_rewards)

print("Score over time: " +  str(sum(rewards)/total_episodes))
print("Q values:")
print(qtable)

Score over time: 0.0
Q values:
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [21]:
########################################################################
#################### Final policy animation ############################
########################################################################

print("We only print the last state in each episode, to see if our agent has reached the destination or fallen into a hole")
env.reset()

for episode in range(test_episodes):
    state = env.reset()
    step = 0
    done = False
    print("****************************************************")
    print("EPISODE ", episode)

    for step in range(max_steps):
        # Taking action with Q learning
        action = np.argmax(qtable[state,:])
        
        new_state, reward, done, info = env.step(action)
        
        #show the last state
        if step == max_steps - 1:
            env.render()

        if done:
            env.render()
            
            print("Number of steps", step)
            break
        state = new_state
env.close()

We only print the last state in each episode, to see if our agent has reached the destination or fallen into a hole
****************************************************
EPISODE  0
  (Left)
[41mS[0mFFFHFFF
FFFFFFHH
FFFFFFFF
FHFFFHFF
HFFFFHFF
FFFFFFFH
FFFFHFFF
FFFFFFFG
****************************************************
EPISODE  1
  (Left)
[41mS[0mFFFHFFF
FFFFFFHH
FFFFFFFF
FHFFFHFF
HFFFFHFF
FFFFFFFH
FFFFHFFF
FFFFFFFG
****************************************************
EPISODE  2
  (Left)
[41mS[0mFFFHFFF
FFFFFFHH
FFFFFFFF
FHFFFHFF
HFFFFHFF
FFFFFFFH
FFFFHFFF
FFFFFFFG
****************************************************
EPISODE  3
  (Left)
[41mS[0mFFFHFFF
FFFFFFHH
FFFFFFFF
FHFFFHFF
HFFFFHFF
FFFFFFFH
FFFFHFFF
FFFFFFFG
****************************************************
EPISODE  4
  (Left)
[41mS[0mFFFHFFF
FFFFFFHH
FFFFFFFF
FHFFFHFF
HFFFFHFF
FFFFFFFH
FFFFHFFF
FFFFFFFG
****************************************************
EPISODE  5
  (Left)
[41mS[0mFFFHFFF
FFFFFFHH
FFFFFFFF
FHFFF