#### Import necessary libraries

In [1]:
import gym
import math
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [22]:
env = gym.make('CartPole-v1')
print(env.reset()[0])
print(env.step(0))

new_state, reward, done, _, _ = env.step(1)
print(new_state)
print(env.action_space.sample())

q_table = np.random.uniform(low=0, high=1, size=([3, 3, 6, 6] + [env.action_space.n]))
q_table[(0, 2, 2, 1), 0] = 8.9
q_table[(0, 2, 2, 1), 1] = 9.3
print(np.argmax(q_table[(0, 2, 2, 1)]))
q_table.shape

[0.00021462 0.04362775 0.01514306 0.00641605]
(array([ 0.00108717, -0.15170807,  0.01527138,  0.30383807], dtype=float32), 1.0, False, False, {})
[-0.00194699  0.04319295  0.02134814  0.01601024]
0
0


(3, 3, 6, 6, 2)

### First we'll define the CartPole agent as a class, and then provide training and inference methods on that class. Along with that, we'll also provide some other utility functions to facilitate plotting rewards and trajectories later.

##### Similarly, the `CartPole-v0` environment provides an observation as the tuple `(position, velocity, angle, velocity at tip)` however, these are continuous variables, which means that Q Learning may take a longer time to converge since the possible permutations of observations are quite large. For this reason we'll also provide a utility to discretize the observations in different bins, which are chosen to be `(3, 3, 6, 6)`, in an attempt to reduce the state space of our agent and make the convergence tractable.

##### In a similar attempt, we'll also define a epison greedy policy, that leverages between exploration and exploitation for the agent, essentially choosing the optimal action (one that maximizes the Q value) with a probability epsilon, and choosing a random action from the action space (for exploration) with probability 1-epsilon.

##### Furthermore, we'll define the learning schedule, which decays the learning rate of the agent in a logarithmic function, so as to facilitate better stability while training, but that also does not fall down below a certain learning rate. Similrly, we'll define an epsilon schedule to decay the epsilon value to facilitate faster convergence for the optimal policy.

##### As the last step, we'll define the Q Learning algorithm which is based on the update rule of 
$\documentclass[a4paper,12pt]{article}
\usepackage{amssymb,mathtools}
\begin{document}
\[\underbrace{\text{New}Q(s,a)}_{\scriptstyle\text{New Q-Value}}=Q(s,a)+\mkern-34mu\underset{\text{New Q-Value}}{\underset{\Bigl|}{\alpha}}\mkern-30mu[\underbrace{R(s,a)}_{\scriptstyle\text{Reward}}+\mkern-30mu\underset{\text{Discount rate}}{\underset{\Biggl|}{\gamma}}\mkern-75mu\overbrace{\max Q'(s',a')}^{\scriptstyle\substack{\text{Maximum predicted reward, given} \\ \text{new state and all possible actions}}}\mkern-45mu-Q(s,a)]\]
\end{document} $

In [12]:
class Agent():
    def __init__(self, bins=[30, 30, 50, 50], 
                 num_episodes=60000, min_lr=0.1, 
                 min_epsilon=1, discount=0.95, decay=0.99995):
        self.bins = bins
        self.num_episodes = num_episodes
        self.min_lr = min_lr
        self.min_epsilon = min_epsilon
        self.discount = discount
        self.decay = decay
        

        self.env = gym.make('CartPole-v1')
        
        # This is the action-value function being initialized to 0's
        self.Q_table = np.random.uniform(low=0, high=1, size=(self.bins + [self.env.action_space.n]))
        # [position, velocity, angle, angular velocity]
        self.upper_bin = [self.env.observation_space.high[0], 0.5, self.env.observation_space.high[2], math.radians(50) / 1.]
        self.lower_bin = [self.env.observation_space.low[0], -0.5, self.env.observation_space.low[2], -math.radians(50) / 1.]
        
        #
        self.steps = np.zeros(self.num_episodes)
        
        

    def discretize(self, obs):
        """
        Discretizes an observation by binning similar continuous value under the same discrete bin to reduce state space.

        
        Input:
        obs (tuple): Tuple describing the observation of the current environment
        
        Output:
        output (tuple): Tuple describing the observation of the current environment with discrete values
        """
        output = obs / np.array([0.25, 0.25, 0.01, 0.1]) + np.array([15, 10, 1, 10])
        return tuple(output.astype(np.int))

        

    def choose_action(self, state):
        """
        Epsilon greedy algorithm: In Reinforcement Learning, agents must strike a balance between exploration and exploitation.
        Essentially, though it is optimal in the short term rewards to choose the action that maximises the Q value, nonetheless, such a policy
        might be too myopic in its exploration, and thus not explore states which may have a lower reward in the short term, but higher reward in
        the long run. Therefore, with probability epsilon, we explore, ie. choose the next action by random, and with probability 1-epsilon, we 
        greedily choose the maximizing Q value action.        
        Input:
        state (tuple): Tuple containing 4 non-negative integers within
                       the range of the buckets.
        
        Output:
        (int) Returns either 0 or 1
        """
        if (np.random.random() <= self.min_epsilon):
            return self.env.action_space.sample() 
        else:
            return np.argmax(self.Q_table[state])

    def q(self, state, action, reward, new_state):
        """
        Q value update step
        """
        current = self.Q_table[state][action]
        latest = (1-self.learning_rate) * current + self.learning_rate * (reward * self.discount*np.max(self.Q_table[new_state]))
        self.Q_table[state][action] = latest

    def epsilon_schedule(self, t):
        """Gets value for epsilon. It declines as we advance in episodes."""
        # Ensures that there's almost at least a min_epsilon chance of randomly exploring
        return max(self.min_epsilon, min(1., 1. - math.log10((t + 1) / self.decay)))

    def learning_schedule(self, t):
        """Gets value for learning rate. It declines as we advance in episodes."""
        # Learning rate also declines as we add more episodes
        return max(self.min_lr, min(1., 1. - math.log10((t + 1) / self.decay)))

    def train(self):
        """
        Trains agent making it go through the environment and choose actions
        through an e-greedy policy and updating values for its Q-table. The 
        agent is trained by default for 500 episodes with a declining 
        learning rate and epsilon values that with the default values,
        reach the minimum after 198 episodes.
        """
        rewards = []
        # Looping for each episode
        for e in range(self.num_episodes):
            # Initializes the state
            current_state = self.discretize(self.env.reset()[0])
            self.learning_rate = self.min_lr
            self.epsilon = self.min_epsilon
            done = False
            episode_reward = 0
            # Looping for each step
            while not done:
                self.steps[e] += 1
                # Choose A from S
                action = self.choose_action(current_state)
                # Take action
                obs, reward, done, _, _ = self.env.step(action)
                new_state = self.discretize(obs)
                # Update Q(S,A)
                if not done:
                    self.q(current_state, action, reward, new_state)
                current_state = new_state
                episode_reward += reward
            rewards.append(episode_reward)
            if self.min_epsilon > 0.05:
                if e > 10000 and episode_reward > rewards[e-1]:
                    self.min_epsilon = math.pow(self.decay, e-10000)
                    print(f'new eps: {self.min_epsilon}')
            if e % 1000 == 0:
                print(f'mean reward: {sum(rewards[e-1000:e+1])/1000}')
                
                # We break out of the loop when done is False which is
                # a terminal state.
        print('Finished training!')
        return rewards
    
    def plot_learning(self):

        """
        Plots the number of steps at each episode and prints the
        amount of times that an episode was successfully completed.
        """
        sns.lineplot(range(len(self.steps)),self.steps)
        plt.xlabel("Episode")
        plt.ylabel("Steps")
        plt.show()
        t = 0
        for i in range(self.num_episodes):
            if self.steps[i] == 200:
                t += 1
        print(t, "episodes were successfully completed.")
        

    def run(self):
        """Runs an episode while displaying the cartpole environment."""
        self.env = gym.wrappers.Monitor(self.env,'cartpole')
        t = 0
        done = False
        current_state = self.discretize(self.env.reset())
        while not done:
                self.env.render()
                t = t+1
                action = self.choose_action(current_state)
                obs, reward, done, _, _ = self.env.step(action)
                new_state = self.discretize(obs)
                current_state = new_state
            
        return t   

##### At this point, we'll train the agent in the CartPole-v0 environment via the provided codes above

In [13]:
agent = Agent()
rewards = agent.train()
agent.plot_learning()

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  return tuple(output.astype(np.int))


mean reward: 0.017
mean reward: 22.621
mean reward: 22.306
mean reward: 22.012
mean reward: 22.479
mean reward: 22.72
mean reward: 22.647
mean reward: 22.735
mean reward: 21.988
mean reward: 22.758
mean reward: 21.961
new eps: 0.99995
new eps: 0.9998500074998751
new eps: 0.9997000374975001
new eps: 0.9995500899895008
new eps: 0.9995001124850014
new eps: 0.9994001649725032
new eps: 0.9993501949642546
new eps: 0.9992002999300115
new eps: 0.999150339915015
new eps: 0.9991003823980192
new eps: 0.9990004748575304
new eps: 0.9989005773075458
new eps: 0.9987507497125792
new eps: 0.9986508771344849
new eps: 0.9985010869926714
new eps: 0.9984012393802248
new eps: 0.9983513193182558
new eps: 0.9981516640291629
new eps: 0.9981017564459614
new eps: 0.9980019487655712
new eps: 0.9979021510656996
new eps: 0.9978023633453483
new eps: 0.9977025856035198
new eps: 0.9976527004742396
new eps: 0.9976028178392159
new eps: 0.9975529376983239
new eps: 0.9973534420735796
new eps: 0.9973035744014759
new eps: 0

TypeError: lineplot() takes from 0 to 1 positional arguments but 2 were given

In [14]:
print(rewards[42000])

11.0
