# Notes

It turns out that the design of the decay pattern of epsilon (the exploration noise) plays a key role in this algorithm: under certain decay patterns it is simply impossible to converge. The following pattern seems to perform well:
   
   action = np.argmax(Q[s,:] + np.random.randn(1,env.action_space.n)/(i/4+1)**2)

In [1]:
import gym
import numpy as np

# Game

In [2]:
# Create the FrozenLake-v0 environment; its observation and action spaces
# are what size the Q table later in this notebook.
env = gym.make("FrozenLake-v0")

[2017-11-16 02:51:14,385] Making new env: FrozenLake-v0


# Model

### Constants

In [3]:
# --- Environment dimensions (read from the gym spaces) ---
N_STATES = env.observation_space.n  # Number of discrete states
N_ACTIONS = env.action_space.n  # Number of discrete actions
STATES = []  # State space (left empty here)
ACTIONS = []  # Action space (left empty here)

# --- Learning hyper-parameters ---
LR = 0.8  # Learning rate: step size of each Q update
gamma = 0.95  # Discount factor for future value (like discounting for the time value of money)
# Exploit vs. explore trade-off; meant to be updated as learning progresses.
# NOTE(review): not referenced by the cells visible here.
epsilon = 0.9

# --- Training schedule ---
N_GAMES = 5000  # Training length: number of games played
N_STEPS = 99  # Maximum number of steps (and hence Q updates) per game

### Q Table

In [4]:
# Tabular action-value estimates: one row per state, one column per action,
# all initialised to zero.
Q = np.zeros((N_STATES, N_ACTIONS))

### Act

In [5]:
def act(s, i=0):
    """Choose an action for state ``s`` from the noisy Q-values.

    Standard-normal noise, divided by ``(i/4 + 1)**2``, is added to the
    row ``Q[s, :]`` and the index of the largest noisy value is returned.
    The divisor grows with ``i``, so the noise shrinks and the choice
    approaches the plain argmax of the Q row.

    Args:
        s: State index (row of the global ``Q`` table).
        i: Game index that controls the noise scale; with the default 0
            the noise is not scaled down at all.

    Returns:
        The index of the selected action, as returned by ``np.argmax``.
    """
    noise_divisor = (i / 4 + 1) ** 2
    noise = np.random.randn(1, env.action_space.n) / noise_divisor
    # The sum has shape (1, n); argmax works on the flattened array, so with
    # a single row the result is the column, i.e. the action index.
    noisy_values = Q[s, :] + noise
    return np.argmax(noisy_values)

### Learn

In [6]:
def learn(s, a, r, s_, done):
    """Apply one Q-learning update for the transition (s, a, r, s_).

    The entry ``Q[s, a]`` is moved towards a target by a fraction ``LR``
    of the difference between target and current estimate. The target is
    ``r`` alone when ``done`` is true, otherwise ``r`` plus ``gamma``
    times the largest value in the row ``Q[s_, :]``.

    Args:
        s: State index before the step.
        a: Action index that was taken.
        r: Reward received for the step.
        s_: State index after the step.
        done: Whether the step ended the game.

    Returns:
        The global ``Q`` table, which has been modified in place.
    """
    predicted = Q[s, a]
    # Terminal transitions have no future value to bootstrap from.
    target = r if done else r + gamma * np.max(Q[s_, :])
    Q[s, a] += LR * (target - predicted)
    return Q

# Session

In [7]:
# Training: play N_GAMES games, updating the Q table after every step.
for i in range(N_GAMES):
    s = env.reset()  # start a new game; the returned value is the starting state
    for j in range(N_STEPS):  # each game is capped at N_STEPS steps
        #env.render()  (uncomment to render every step)
        a = act(s,i)  # noisy argmax over Q[s]; the noise shrinks as i grows
        s_, r, done, _ = env.step(a)  # next state, reward, end-of-game flag, unused extra value
        _ = learn(s, a, r, s_, done)  # updates Q in place; the returned table is discarded
        s = s_
        if done:
            break
    

In [8]:
# Display the Q table after training.
Q

array([[  3.37397301e-04,   3.95250602e-04,   1.71713222e-01,
          5.46205499e-04],
       [  5.27265992e-05,   2.24551222e-05,   2.56141987e-05,
          7.53492018e-02],
       [  2.04633802e-04,   2.38871299e-02,   0.00000000e+00,
          0.00000000e+00],
       [  0.00000000e+00,   8.09436356e-05,   0.00000000e+00,
          6.04084637e-02],
       [  2.06026357e-01,   0.00000000e+00,   0.00000000e+00,
          1.80199681e-04],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00],
       [  1.17013525e-01,   1.30814550e-06,   1.36701983e-06,
          4.49315135e-07],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00],
       [  1.72551089e-04,   4.21001495e-05,   0.00000000e+00,
          1.96750512e-01],
       [  4.03206639e-05,   2.60462228e-01,   5.52765010e-05,
          0.00000000e+00],
       [  1.37429973e-01,   1.26029826e-04,   0.00000000e+00,
          0.00000000e+00],
       [  0.00000000e

# Evaluation

In [9]:
# Evaluation: play 100 games using the plain argmax of Q (no noise and no
# calls to learn), recording the reward of the last step of each game.
rewards = []
for i in range(100):
    s = env.reset()
    for j in range(N_STEPS):

        a  = np.argmax(Q[s])  # greedy action for the current state
        s_, r, done, _ = env.step(a)
        #env.render()
        s = s_
        if done:
            break
    rewards.append(r)  # r is the reward from the final step taken in this game
# Mean of the recorded rewards; as the last expression it is the cell's output.
np.mean(rewards)

0.45000000000000001

In [10]:
# Display the mean of the rewards recorded by the evaluation loop again.
np.mean(rewards)

0.45000000000000001