In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import gym

# Create model class in Tensorflow

In [3]:
env = gym.make('FrozenLake-v0')

In [4]:
def make_onehot(label):
    """Change numbers 0-15 to unit vectors along the first 16 axes."""
    
    label_onehot = np.zeros(shape=(16))
    label_onehot[label] = 1.0
    
    return label_onehot

In [94]:
class FrozenLake():
    def __init__(self, gamma, learn_param, epsilon):
        self.gamma = gamma
        self.learn_param = learn_param
        self.epsilon = epsilon
        
        # State is one-hot vector, 16 possible locations
        self.state_oh = tf.placeholder(shape=[1, 16], dtype=tf.float32)
        self.sess = tf.Session()
        self._build_graph()
        self.sess.run(tf.initializers.global_variables())
        
        
    def _build_graph(self):
        """At each state, there are 4 possible actions. The predicted optimal action
        is to the state with the highest Q value"""
        self.weights = tf.Variable(tf.random_uniform([16, 4], minval=0, maxval=0.1))
        self.Q1 = tf.matmul(self.state_oh, self.weights)
        self.prediction = tf.argmax(self.Q1, axis=1)
            
        """Construct the graph for Q learning"""
        self.Q2 = tf.placeholder(shape=[1, 4], dtype=tf.float32)
        loss = tf.reduce_sum(tf.square(self.Q2 - self.Q1))
        gdo = tf.train.GradientDescentOptimizer(learning_rate = self.learn_param)
        self.updatedweights = gdo.minimize(loss)       
        
    def train_model(self, num_steps, num_episodes):
        """Conduct reinforcement learning to optimise Q"""
            
        reward_total = 0
        # success_array shows 1 if episode was successful, 0 otherwise. success_rate is 
        # proportion of last 100 runs that was successful (first 99 entries remain 0)
        success_array = np.zeros(shape=(num_episodes))
        success_rate = np.zeros(shape=(num_episodes))
        
        for episode in range(num_episodes):
            
            # Reset experiment
            state_curr, done, reward = env.reset(), False, 0
                
            info = []
            info.append(1)
            for step in range(num_steps):
                    
                # Determine best action
                feed_dict = {self.state_oh: [make_onehot(state_curr)]}
                action, Y = self.sess.run([self.prediction, self.Q1], feed_dict=feed_dict)
                print('state_curr: {}'.format(state_curr))
                    
                # Take a step. Do random action with probability epsilon, where epsilon 
                # diminishes over time (less and less exploration)
#                if self.epsilon > np.random.rand(1):
#                    action[0] = env.action_space.sample()
#                    self.epsilon -= 10**-3
                state_next, reward, done, _ = env.step(action[0])
    
                print('action: {}'.format(action[0]))
                print('reward: {}'.format(reward))
                print('state_next: {}'.format(state_next))
                    
                # Evaluate expected reward from this step and calculate second term in 
                # Bellman equation iteration step
                feed_dict = {self.state_oh: [make_onehot(state_next)]}
                Y1 = self.sess.run(self.Q1, feed_dict=feed_dict)
                change_Y = Y.copy()
                change_Y[0, action[0]] = reward + gamma*np.max(Y1)
                    
                # Update weights by moving them slightly towards change_Y — this
                # corresponds to the Bellman equation.
                feed_dict = {self.state_oh: [make_onehot(state_next)], self.Q2: change_Y}
                _, new_weights = \
                        self.sess.run([self.updatedweights,self.weights], feed_dict=feed_dict)
                print(Y[0])
                print(change_Y[0])
                print(new_weights[:6])
                    
                # Add the reward and get ready for next iteration
                reward_total += reward
                state_curr = state_next
                info.append(state_curr)
                
                print('')
                
                if done:
                    if reward == 1: print(episode)
                    break
                    
#                if reward == 1:
#                    success_array[episode] = 1
#                    print(episode)
#                if episode > 98:
#                    success_rate[episode] - np.sum(success_array[episode-99:episode+1])/100

In [95]:
gamma = 0.9
learn_param = 0.1
epsilon = 0.1
model = FrozenLake(gamma=gamma, learn_param=learn_param, epsilon=epsilon)
model.train_model(num_steps=100, num_episodes=1)

state_curr: 0
action: 3
reward: 0.0
state_next: 1
[0.08663105 0.07526981 0.03512669 0.09116941]
[0.08663105 0.07526981 0.03512669 0.06993104]
[[0.08663105 0.07526981 0.03512669 0.09116941]
 [0.07948713 0.05724798 0.0109294  0.05573143]
 [0.00485086 0.09917449 0.01917681 0.08917377]
 [0.02985833 0.09743996 0.02228588 0.08596214]
 [0.03608974 0.09987166 0.03262559 0.06735255]
 [0.07045231 0.08531383 0.07180941 0.06034572]]

state_curr: 1
action: 0
reward: 0.0
state_next: 1
[0.07948713 0.05724798 0.0109294  0.05573143]
[0.07153842 0.05724798 0.0109294  0.05573143]
[[0.08663105 0.07526981 0.03512669 0.09116941]
 [0.07789738 0.05724798 0.0109294  0.05573143]
 [0.00485086 0.09917449 0.01917681 0.08917377]
 [0.02985833 0.09743996 0.02228588 0.08596214]
 [0.03608974 0.09987166 0.03262559 0.06735255]
 [0.07045231 0.08531383 0.07180941 0.06034572]]

state_curr: 1
action: 0
reward: 0.0
state_next: 1
[0.07789738 0.05724798 0.0109294  0.05573143]
[0.07010765 0.05724798 0.0109294  0.05573143]
[[0.08