In [39]:
import numpy as np
import gym
import random

In [40]:
env = gym.make("FrozenLake-v0")

In [41]:
action_size = env.action_space.n
state_size = env.observation_space.n

In [42]:
qtable = np.zeros((state_size, action_size))
qtable

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [47]:
## hyperparameters
total_episodes = 15000
max_steps = 99 # max steps per episode
learning_rate = 0.8
gamma = 0.95 #discounting rate

epsilon = 1.0
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = 0.005

In [48]:
rewards = []

for episodes in range(total_episodes):
    
    # reset environment
    state = env.reset()
    done = False
    steps = 0
    total_rewards = 0
    
    for step in range(max_steps):
        
        exp_exp_tradeoff = random.uniform(0,1)
        
        if exp_exp_tradeoff < epsilon:
            action = env.action_space.sample() # explore
        else:
            action = np.argmax(qtable[state,:]) # exploit
            
        new_state, reward, done, info = env.step(action)
        
        #update q table
        qtable[state,action] = qtable[state, action] + learning_rate*(reward + gamma*np.max(qtable[new_state,:]) 
                                                                      - qtable[state, action])
        
        total_rewards += reward
        
        state = new_state
        
        if done == True:
            break
            
    #  update epsilon
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episodes) 
    rewards.append(total_rewards)
        
print ("Score over time: " +  str(sum(rewards)/total_episodes))
print(qtable)

Score over time: 0.4852666666666667
[[3.59563093e-02 4.66029843e-02 9.81608872e-02 4.82075230e-02]
 [1.65287192e-03 1.36310701e-02 3.98127247e-03 4.83956564e-02]
 [3.47478367e-03 1.76169798e-03 9.15789401e-03 1.37949846e-02]
 [2.82366997e-03 3.01601037e-05 5.11591341e-03 9.71320878e-03]
 [3.27726692e-01 8.94384833e-02 4.49065454e-02 8.35866591e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [6.01366840e-06 8.49234995e-07 5.38160762e-01 3.48204392e-08]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [2.50923434e-03 8.60954084e-02 8.62034623e-03 3.58587626e-01]
 [1.64329104e-02 4.16220873e-01 4.60703941e-05 5.68728247e-03]
 [8.41859385e-01 6.32162907e-05 5.25198796e-03 5.73576688e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.17921666e-01 1.43042838e-01 7.81446620e-01 9.35004428e-02]
 [3.47123053e-01 8.97301724e-01 2.53866073e-01 1.98837174e-01]
 [0.00000000e+00 0.

## Test

In [49]:
env.reset()

for episode in range(5):
    state = env.reset()
    step = 0
    done = False
    print("**********Episode %s**************" % (episode))
    
    for step in range(max_steps):
        action = np.argmax(qtable[state,:])
        new_state, reward, done, info = env.step(action)
        
        if done:
            env.render()
            print("Number of steps", step)
            break
        state = new_state
env.close()

**********Episode 0**************
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 8
**********Episode 1**************
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 57
**********Episode 2**************
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 6
**********Episode 3**************
**********Episode 4**************
