In [14]:
import numpy as np
import gym
import matplotlib.pyplot as plt
import random

CartPole Setup

In [3]:
env = gym.make('CartPole-v1', new_step_api=True)

In [4]:
obs_state = env.observation_space
action_space = env.action_space

Observation space represents a 1 dimentional array with 4 discrete variables corresponding to the cart position (0), cart velocity (1), pole angle (2), and pole angular velocity (3) respectively. The action space represents another 1 dimensional array with all the possible actions that can be taken. There are only two actions: push cart to the left (0) and push cart to the right (1). 

Before implementing the SARSA algorithm, we created a block of code which randomly samples an action from the action space. This will serve as a baseline which we can compare against our model.

In [None]:
random_results = []
steps = 0
for n in range(500):
    steps += 1
    state = env.reset()
    action = action_space.sample()
    state, reward, end, _, info = env.step(action)
    if end:
        random_results.append(steps)
        steps = 0
        env.reset()
env.close()
print(sum(random_results) / len(random_results))

SARSA EpsGreedyQ Implementation

In [5]:
# Parameters
lr_rate = 0.05
discount = 0.95
episodes = 100000
max_steps = 500

# Epsilon Greedy Search Parameters
epsilon = 1.0 
min_epsilon = 0.1 
decay_rate = 0.01 

In [8]:
Q = np.zeros((obs_state.shape[0], action_space.n))

In [9]:
Q

array([[0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.]])

In [10]:
def eps_greedy_action(state): 
    action = 0
    if np.random.uniform() < epsilon:
        # Number lower than epsilon means explore
        action = random.randint(0,1)
    else:
        # Higher number than epsilon means use prior knowledge
        action = np.argmax(Q[state, :])
    return action

In [20]:
def train():
    step = 0
    state1 = env.reset()
    done = False
    action1 = eps_greedy_action(state1)
    for step in range(max_steps):
        state2, reward, end, _, info = env.step(action1)
        action2 = eps_greedy_action(state2)
        Q[state1][action1] = Q[state1][action1] + lr_rate * (reward + discount * (Q[state2][action2]) - Q[state1][action1])
        state1 = state2
        action1 = action2
        if end:
            break

In [21]:
def test():
    rewards = []
    for n in range(episodes):
        step = 0
        state1 = env.reset()
        done = False
        ep_reward = 0
        for step in range(max_steps):
            action = np.argmax(Q[state1, :])
            state2, reward, end, _, info = env.step(action)
            ep_reward += reward
            state1 = state2
            if end:
                break
        rewards.append(ep_reward)
    return rewards

In [None]:
train()

In [None]:
test()