In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import gym

In [None]:
# Instantiate the FrozenLake-v0 environment through gym's registry.
# NOTE(review): the cells below hard-code 16 states and 4 actions; presumably
# this matches the environment's default map -- confirm if the id changes.
env = gym.make('FrozenLake-v0')

In [None]:
# One-hot row vector (1 x 16) marking the square the player currently occupies
inputs = tf.placeholder(tf.float32, shape=[1, 16])

# Linear Q-function: a 16 x 4 table with one row per state and one column
# per action, started at small random values. Multiplying by the one-hot
# input picks out the Q-values of the current state.
weights = tf.Variable(tf.random_uniform(shape=[16, 4], minval=0, maxval=0.1))
Q1 = tf.matmul(a=inputs, b=weights)

# Greedy choice: index of the action with the largest Q-value in that row
prediction = tf.argmax(input=Q1, axis=1)

In [None]:
# Training step: Q2 is fed the target Q-row for the current state. The loss
# is the summed squared difference between that target and the predicted
# row Q1, and one gradient-descent step nudges the weights towards it.
Q2 = tf.placeholder(tf.float32, shape=[1, 4])
loss = tf.reduce_sum(tf.square(tf.subtract(Q2, Q1)))
gdo = tf.train.GradientDescentOptimizer(0.1)
updatedweights = gdo.minimize(loss=loss)

In [None]:
# Hyper-parameters
gamma = 0.9          # discount factor applied to the bootstrapped value
epsilon = 0.1        # probability of taking a random (exploratory) action
num_episodes = 1000
num_steps = 50       # maximum number of steps taken per episode

totalReward = 0

# success_array records the outcome of each episode: 1 if successful,
# 0 otherwise. success_rate holds the sliding success proportion over the
# last 100 episodes. The first 99 entries remain 0.
success_array = np.zeros(shape=(num_episodes))
success_rate = np.zeros(shape=(num_episodes))

# Row i of the identity matrix is the one-hot encoding of state i; build it
# once instead of on every step.
one_hot = np.eye(16)

with tf.Session() as sess:
    init = tf.initializers.global_variables()
    sess.run(init)

    for episode in range(num_episodes):

        # Reset the experiment: back to the environment's initial state
        state_now = env.reset()
        done = False
        reward = 0

        for j in range(num_steps):

            # Find the estimate for the best action
            feed_dict = {inputs: [one_hot[state_now]]}
            action, Y = sess.run([prediction, Q1], feed_dict=feed_dict)

            # Do a random action with probability epsilon. Epsilon is held
            # constant in this experiment, so exploration never stops.
            if np.random.rand() < epsilon:
                action[0] = env.action_space.sample()

            # Take a step
            state_next, reward, done, _ = env.step(action[0])

            # Build the target for the action taken. A terminal state has no
            # future return, so the target there is the reward alone;
            # otherwise bootstrap from the best Q-value of the next state.
            if done:
                target = reward
            else:
                Y1 = sess.run(Q1, feed_dict={inputs: [one_hot[state_next]]})
                target = reward + gamma*np.max(Y1)
            change_Y = Y
            change_Y[0, action[0]] = target

            # Update the weights by moving them slightly in the direction
            # of the newly found Q row. The new Q values are not immediately
            # taken from Bellman's equations, but nudged a little bit in the
            # correct direction
            feed_dict = {inputs: [one_hot[state_now]],
                         Q2: change_Y}
            _, new_weights = sess.run([updatedweights, weights],
                                      feed_dict=feed_dict)

            # Add the reward and get ready for the next iteration
            totalReward += reward
            state_now = state_next

            if reward == 1:
                success_array[episode] = 1

            # Stop at the end of the episode: stepping a finished
            # environment is outside the gym contract and would train on
            # meaningless post-terminal transitions.
            if done:
                break

        # Sliding success rate, computed once per episode
        if episode > 98:
            success_rate[episode] = \
                    np.sum(success_array[episode-99:episode+1])/100

# Plotting only needs the NumPy statistics, so it runs after the session closes
fig, ax = plt.subplots()
ax.plot(np.arange(100, num_episodes), success_rate[100:])
ax.set_xlabel('Episodes')
ax.set_ylabel('Proportion successful')
ax.set_title('Learning, exploration constant')
plt.savefig('constant.pdf')