In [1]:
import gym
import numpy as np
import random
import time
from IPython import display

### Initialize Environment

In [2]:
# Build the 8x8 FrozenLake environment from gym's registry.
# NOTE(review): no random seed is set in the visible cells, so training and
# evaluation results will vary from run to run — consider seeding; TODO confirm.
env = gym.make('FrozenLake8x8-v0')

In [3]:
# Smoke test: reset the environment, take one step with action 0 (rendered as
# "Left" in the cell output), and draw the resulting grid.
env.reset()
print(env.step(0))  # step() returns (new_state, reward, done, info)
env.render()

(0, 0.0, False, {'prob': 0.3333333333333333})
  (Left)
[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG


### Create Q-table

In [4]:
# Every tile of the 64-tile lake is one state, identified by its index; in each
# state the agent picks one of 4 moves: up, down, left, right.
num_states = env.observation_space.n
num_actions = env.action_space.n

# One row per state and one column per action; all value estimates start at zero.
q_table = np.zeros(shape=(num_states, num_actions))

### Initialize Hyperparameters

In [5]:
# Settings read by train_agent(). That function also rewrites 'exploration_rate'
# and 'cumulative_episodes' in place as training progresses.
hyperparameters = dict(
    max_steps_per_episode=200,      # upper bound on steps taken within one episode
    learning_rate=0.1,              # fraction of the TD error applied on each Q-table update
    discount_rate=0.99,             # weight given to the best next-state value estimate
    exploration_rate=1,             # current probability of taking a random action
    max_exploration_rate=1,         # exploration rate at the start of the decay schedule
    min_exploration_rate=0.01,      # floor the exploration rate decays towards
    exploration_rate_decay=0.0001,  # exponential decay coefficient, applied per episode
    cumulative_episodes=0,          # episodes trained so far, carried across train_agent() calls
)

### Create training and testing functions

In [6]:
def train_agent(env, q_table, episodes, parameters):
    """Train a tabular Q-learning agent using an epsilon-greedy policy.

    Both ``q_table`` and ``parameters`` are modified in place, so calling this
    function again continues training (and the exploration decay schedule)
    from where the previous call stopped.

    Args:
        env: Environment providing ``reset()`` (returns the start state),
            ``step(action)`` (returns ``(new_state, reward, done, info)``) and
            ``action_space.sample()``.
        q_table: Array of value estimates indexed as ``q_table[state, action]``.
        episodes: Number of episodes to run.
        parameters: Dict with the keys ``'max_steps_per_episode'``,
            ``'learning_rate'``, ``'discount_rate'``, ``'exploration_rate'``,
            ``'max_exploration_rate'``, ``'min_exploration_rate'``,
            ``'exploration_rate_decay'`` and ``'cumulative_episodes'``.
            ``'exploration_rate'`` and ``'cumulative_episodes'`` are updated
            after every episode.

    Returns:
        None. Progress is printed roughly every tenth of ``episodes``.
    """
    for episode in range(episodes):
        state = env.reset()

        for _step in range(parameters['max_steps_per_episode']):
            # Epsilon-greedy choice: act randomly with probability
            # `exploration_rate`, otherwise take the best known action.
            if random.uniform(0, 1) > parameters['exploration_rate']:
                # exploit
                action = np.argmax(q_table[state, :])
            else:
                # explore
                action = env.action_space.sample()

            # Do action
            new_state, reward, done, _info = env.step(action)

            # Update Q-table: move the current estimate towards the
            # temporal-difference target by a fraction `learning_rate`.
            current_value = q_table[state, action]
            td_target = reward + parameters['discount_rate'] * np.max(q_table[new_state, :])

            q_table[state, action] = current_value + parameters['learning_rate'] * (td_target - current_value)

            # Advance to the next state
            state = new_state

            # End episode if finished
            if done:
                break

        # Print updates
        if episode % (episodes / 10) == 0:
            print(episode / episodes * 100, '%')

        # Decay the exploration rate exponentially from the max towards the min,
        # based on the number of episodes completed before this one.
        parameters['exploration_rate'] = parameters['min_exploration_rate'] + (parameters['max_exploration_rate'] - parameters['min_exploration_rate']) * np.exp(-parameters['exploration_rate_decay'] * parameters['cumulative_episodes'])

        parameters['cumulative_episodes'] += 1

In [7]:
def test_agent(env, q_table, delay=0.25):
    """Play one greedy episode and animate it in the notebook output.

    At every step the highest-valued action for the current state is taken
    (no exploration); the previous frame is then cleared and the environment
    is re-rendered.

    Args:
        env: Environment providing ``reset()``, ``step(action)`` (returning
            ``(new_state, reward, done, info)``) and ``render()``.
        q_table: Array of value estimates indexed as ``q_table[state, action]``.
        delay: Seconds to pause after each step before the next frame is
            drawn. Defaults to 0.25.

    Returns:
        None. The loop only ends once the environment reports ``done``.
    """
    state = env.reset()
    done = False
    while not done:
        action = np.argmax(q_table[state, :])
        state, _reward, done, _info = env.step(action)
        time.sleep(delay)
        # wait=True defers clearing until new output arrives, avoiding flicker.
        display.clear_output(wait=True)
        env.render()

In [15]:
# Train for 10,000 episodes. q_table and hyperparameters are updated in place, so
# re-running this cell continues training rather than starting from scratch.
train_agent(env, q_table, 10000, hyperparameters)

0.0 %
10.0 %
20.0 %
30.0 %
40.0 %
50.0 %
60.0 %
70.0 %
80.0 %
90.0 %


In [9]:
# Watch a single greedy episode play out frame by frame using the current q_table.
test_agent(env, q_table)

  (Right)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFF[41mG[0m


### Create evaluation function

In [16]:
def evaluate_agent(env, q_table, test_runs=100):
    """Run greedy episodes and print the success rate and mean episode length.

    Each episode follows the highest-valued action in every state until the
    environment reports ``done``. The reward of the final step is recorded as
    the episode's score; a score of exactly 1 counts as a success.

    Args:
        env: Environment providing ``reset()`` and ``step(action)`` (returning
            ``(new_state, reward, done, info)``).
        q_table: Array of value estimates indexed as ``q_table[state, action]``.
        test_runs: Number of episodes to run. Defaults to 100.

    Returns:
        None. Prints the mean score and the mean number of steps over the
        successful episodes. Either value is ``nan`` when there is nothing to
        average (no runs, or no successful runs).
    """
    scores = []
    steps = []
    for _ in range(test_runs):
        state = env.reset()
        done = False
        step = 0
        while not done:
            action = np.argmax(q_table[state, :])
            state, reward, done, _info = env.step(action)
            step += 1
        # The loop body always runs at least once, so `reward` holds the
        # final step's reward here.
        scores.append(reward)
        steps.append(step)

    scores = np.array(scores)
    steps = np.array(steps)
    succeeded = scores == 1

    # Guard the empty cases explicitly: taking the mean of an empty array
    # emits a RuntimeWarning in addition to yielding nan.
    success_rate = scores.mean() if scores.size else float('nan')
    average_steps = steps[succeeded].mean() if succeeded.any() else float('nan')
    print('Success Rate: ', success_rate)
    print('Average Steps per Completion: ', average_steps)

In [17]:
# Estimate the greedy policy's success rate and mean steps over the default 100 episodes.
evaluate_agent(env, q_table)

Success Rate:  0.85
Average Steps per Completion:  78.76470588235294
