# Frozen Lake OpenAI gym case solved in Medium

In [21]:
import gym
import numpy as np
import random
import tensorflow as tf
import matplotlib.pyplot as plt

In [22]:
env = gym.make('FrozenLake-v0')

[2017-07-15 19:41:25,252] Making new env: FrozenLake-v0


## No NN approach (does not scale)

In [23]:
# First create the table with: rows = states (in this case 16, 4x4 terrain),
# columns = possible actions (in this case 4, one for every direction)

# Initialize table with all zeros
Q = np.zeros([env.observation_space.n,env.action_space.n])

# Learning parameteres
lr = .8 # Q(s,a) = r + γ(max(Q(s’,a’)) (the gamma)
y = .95 
num_episodes = 2000

#create lists to contain total rewards and steps per episode
# jList = []
rList = []

for i in range(num_episodes):
    #Reset environment and get first new observation
    s = env.reset()
    rAll = 0
    d = False
    j = 0
    #The Q-Table learning algorithm
    while j < 99:
        j+=1
        #Choose an action by greedily (with noise) picking from Q table
        a = np.argmax(Q[s,:] + np.random.randn(1,env.action_space.n)*(1./(i+1)))
        #Get new state and reward from environment
        s1,r,d,_ = env.step(a)
        #Update Q-Table with new knowledge
        Q[s,a] = Q[s,a] + lr*(r + y*np.max(Q[s1,:]) - Q[s,a])
        rAll += r
        s = s1
        if d == True:
            break
    #jList.append(j)
    rList.append(rAll)

In [24]:
print("Score over time: " +  str(sum(rList)/num_episodes))

Score over time: 0.4355


In [25]:
print("Final Q-Table Values")
print(Q)

Final Q-Table Values
[[  5.69824609e-02   9.56079385e-03   6.23683577e-03   7.25468905e-03]
 [  2.28775592e-05   1.50500663e-03   1.56098683e-05   8.46382582e-02]
 [  2.09771280e-03   3.10270654e-03   3.21286477e-03   9.82454617e-02]
 [  1.25675390e-04   5.71346129e-04   5.91823767e-04   4.38952424e-02]
 [  7.21507801e-02   3.09250059e-04   4.08706368e-04   1.43835431e-04]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  8.09848078e-02   1.75618536e-04   1.01483126e-04   1.15116283e-05]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  7.80669547e-04   5.75219160e-04   1.09321510e-03   1.65874534e-01]
 [  0.00000000e+00   1.50661478e-01   2.84299942e-03   3.61223479e-03]
 [  4.48531875e-01   0.00000000e+00   3.77270511e-04   5.30281902e-04]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   5.25160383e-01   0

## Q-Network approach

In [26]:
tf.reset_default_graph()

In [27]:
#These lines establish the feed-forward part of the network used to choose actions
inputs1 = tf.placeholder(shape=[1,16],dtype=tf.float32) # 1x16 ONE-HOT encoded tensor (basically the current state)
W = tf.Variable(tf.random_uniform([16,4],0,0.01))
Qout = tf.matmul(inputs1,W)
predict = tf.argmax(Qout,1)

#Below we obtain the loss by taking the sum of squares difference between the target and prediction Q values.
nextQ = tf.placeholder(shape=[1,4],dtype=tf.float32) # 1x4 tensor with the estimated Q-values for each action
loss = tf.reduce_sum(tf.square(nextQ - Qout))
trainer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
updateModel = trainer.minimize(loss)

## Training the Network

In [31]:
init = tf.global_variables_initializer()

# Set learning parameters
y = .99
e = 0.1
num_episodes = 2000
#create lists to contain total rewards and steps per episode
jList = []
rList = []
with tf.Session() as sess:
    sess.run(init)
    for i in range(num_episodes):
        #Reset environment and get first new observation
        s = env.reset()
        rAll = 0
        d = False
        j = 0
        #The Q-Network
        while j < 99:
            j+=1
            #Choose an action by greedily (with e chance of random action) from the Q-network
            a,allQ = sess.run([predict,Qout],feed_dict={inputs1:np.identity(16)[s:s+1]})
            if np.random.rand(1) < e:
                a[0] = env.action_space.sample()
            #Get new state and reward from environment
            s1,r,d,_ = env.step(a[0])
            #Obtain the Q' values by feeding the new state through our network
            Q1 = sess.run(Qout,feed_dict={inputs1:np.identity(16)[s1:s1+1]})
            #Obtain maxQ' and set our target value for chosen action.
            maxQ1 = np.max(Q1)
            targetQ = allQ
            targetQ[0,a[0]] = r + y*maxQ1
            #Train our network using target and predicted Q values
            _,W1 = sess.run([updateModel,W],feed_dict={inputs1:np.identity(16)[s:s+1],nextQ:targetQ})
            rAll += r
            s = s1
            if d == True:
                #Reduce chance of random action as we train the model.
                e = 1./((i/50) + 10)
                break
        jList.append(j)
        rList.append(rAll)
print("Percent of succesful episodes: " + str(sum(rList)/num_episodes) + "%")

Percent of succesful episodes: 0.205%
