In [16]:
import gym
import random
import numpy as np

In [71]:
class CartPoleEnvironment():

    def __init__(self, buckets=(1, 1, 6, 12,)):
        self.env = gym.make('CartPole-v1')
        self.buckets = buckets

    def discretize(self, states, boxes):
        num_box = [None]*4
        #print(states)
        for i in range(4):
            box = boxes[i]
            for j in range(len(box)-1):
                if states[i] >= box[j] and states[i] <= box[j+1]:
                    num_box[i] = j
        return tuple(num_box)
    def init_q(self,boxes):
        dims = []
        action = [-10,10]
        for i in range(len(boxes)):
            dims.append(len(boxes[i])-1)
        dims.append(len(action))
        return np.zeros(dims)   

In [76]:
class CartPoleAgent():

    def __init__(self, alpha=0.5, epsilon=1, episodes=5_000):
        self.em = CartPoleEnvironment()
        self.boxes = [[-12,-6,-1,0,1,6,12],[-2.4,-0.8,0.8,2.4],[-np.inf,-50,50,np.inf],[-np.inf,-0.5,0.5,np.inf]]
        self.q_table = self.em.init_q(self.boxes)
        self.alpha = alpha
        self.epsilon = epsilon
        self.episodes = episodes

    def update_q_value(self, state, action, reward, new_state):
        """
        Using Bellman equation, update Q-value based on state-action pair
        Q(s, a) <- Q(s, a) + alpha(curr_reward + gamma * max(Q(s', a')) - Q(s, a))
        
        where max(Q(s', a') is the best future reward, and gamma = 1
        """
        prev_q = self.q_table[state][action]
        future_reward = self.best_future_reward(new_state)

        self.q_table[state][action] = prev_q + self.alpha * (reward + future_reward - prev_q)
        
    def update_q_value2(self,state,action,reward,new_state,learn_rate,discount_rate):
        future_reward = self.best_future_reward(new_state)
        self.q_table[state][action] = learn_rate*(reward + discount_rate*future_reward) -self.q_table[state][action]

    def best_future_reward(self, state):
        return np.max(self.q_table[state])

    def choose_action(self, state, epsilon=True):
        """
        Action is chosen using epsilon-greedy algorithm
        """
        best_action = np.argmax(self.q_table[state])
        random_action = self.em.env.action_space.sample()

        if epsilon:
            if random.random() > 1-self.epsilon:
                return best_action
            else:
                return random_action
        else:
            return best_action

    def train(self):
        """
        Train for 5,000 episodes where at each episode the exploration is decayed.
        """
        rewards = []
        MAX_STEPS = 200
        MAX_EXP_RATE = 1
        MIN_EXP_RATE = 0.01
        EXP_DECAY_RATE = 0.001
        learn_rate0 = 0.05
        learn_rate_decay = 0.95
        exploration_decay_rate=0.99
        for ep in range(self.episodes*3):
            env = self.em.env
            state = self.em.discretize(env.reset(),self.boxes)
            self.epsilon = self.epsilon*exploration_decay_rate#MIN_EXP_RATE + (MAX_EXP_RATE - MIN_EXP_RATE) * np.exp(-EXP_DECAY_RATE * ep)
            done = False
            episode_rewards = 0
            step = 0
            gamma = 0.95
            gamma0 = gamma
            while not done and step < MAX_STEPS:

                action = self.choose_action(state,self.epsilon)

                # take action
                new_state, reward, done, _ = env.step(action)
                new_state = self.em.discretize(new_state,self.boxes)
           
                # accummulate rewards
                reward = reward*gamma0
                episode_rewards += reward
                gamma0 = gamma*gamma0
                # update Q-table 
                learn_rate = learn_rate0/(1+step*learn_rate_decay)
                self.update_q_value2(state, action, reward, new_state,learn_rate,gamma0)

                # transition to the new state
                state = new_state

                step += 1

            if done: 
                print(f"Episode {ep} finished after {step + 1} timesteps.")
            
            rewards.append(episode_rewards)
        
        rewards_per_thousand_ep = np.split(np.array(rewards), self.episodes/500)
        count = 500

        for r in rewards_per_thousand_ep:
            print(f"{count}: {int(sum(r/500))}")
            count += 500

    def play(self):
        """
        Playing using the populated Q-table; we want to exploit the Q-values.
        So we will not use epsilon-greedy algorithm and only select the max Q-value.
        """
        env = self.em.env
        env._max_episode_steps = 1000
        state = self.em.discretize(env.reset(),self.boxes)
        done = False
        rewards = 0
        while not done:
            action = self.choose_action(state, epsilon=False)

            # take action
            new_state, reward, done, _ = env.step(action)
            new_state = self.em.discretize(new_state,self.boxes)
            rewards += reward         

            # transition to the new state
            state = new_state

        print(f"Agent finished with a reward of {rewards}")
        env.close()

In [79]:
agent = CartPoleAgent()
agent.train()

Episode 0 finished after 13 timesteps.
Episode 1 finished after 21 timesteps.
Episode 2 finished after 19 timesteps.
Episode 3 finished after 21 timesteps.
Episode 4 finished after 39 timesteps.
Episode 5 finished after 23 timesteps.
Episode 6 finished after 57 timesteps.
Episode 7 finished after 11 timesteps.
Episode 8 finished after 14 timesteps.
Episode 9 finished after 10 timesteps.
Episode 10 finished after 11 timesteps.
Episode 11 finished after 21 timesteps.
Episode 12 finished after 13 timesteps.
Episode 13 finished after 17 timesteps.
Episode 14 finished after 9 timesteps.
Episode 15 finished after 15 timesteps.
Episode 16 finished after 12 timesteps.
Episode 17 finished after 10 timesteps.
Episode 18 finished after 9 timesteps.
Episode 19 finished after 18 timesteps.
Episode 20 finished after 14 timesteps.
Episode 21 finished after 11 timesteps.
Episode 22 finished after 13 timesteps.
Episode 23 finished after 16 timesteps.
Episode 24 finished after 14 timesteps.
Episode 25 f

IndexError: index 1 is out of bounds for axis 0 with size 1

In [78]:
agent.play()

Agent finished with a reward of 9.0
