Mountain Car Problem

Import Statements

In [None]:
import gym
import numpy as np

Establish Gym Environment

In [None]:
# Create the MountainCar-v0 environment shared by getAction and the main loop.
env = gym.make('MountainCar-v0')
# Reset once up front; the main loop also resets at the start of every episode.
env.reset()

Initiate and Define Constant values 

Define empty Q Table
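Define the alpha, beta, and epsilon values used to calculate values for the Q Table: alpha scales each update, beta discounts the next state's maximum Q value, and epsilon is the probability of taking a random action.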
Define number of episodes and steps per episode

In [None]:
# Q table: maps (position, velocity, action) tuples to their current Q value.
qTable = {}

# Step size applied to each Q update (multiplied by 0.99 after every step in the main loop).
alpha = 1.0
# Weight on the next state's maximum Q value (multiplied by 0.99 at the start of every episode).
beta = 0.5
# Probability threshold for choosing a random action in getAction.
epsilon = 0.2

# Number of episodes to run.
episodes = 100
# Upper bound on the number of steps taken within one episode.
steps = 10000

Define Get Action function

- This function accepts an observation and an index. Position and velocity for the given observation are established
- If either a random sample number generated by numpy is less than the epsilon value or there is no value in the Q Table for the given position and velocity, a sample action value is returned
- The values for the position, velocity, and each action are set to 0 in the Q Table so that the check will pass the next time
- If values for the position and velocity are found in the Q Table, the function returns the action with the highest value

In [None]:
def getAction(observation, index):
    """Pick an action for ``observation`` using an epsilon-greedy policy.

    Args:
        observation: Sequence whose first element is the position and whose
            second element is the velocity.
        index: Unused; kept so existing callers that pass the step number
            keep working.

    Returns:
        An action. A random sample from ``env.action_space`` is returned when
        the discretised state has no Q table entries yet, or with probability
        ``epsilon``; otherwise the action (0, 1 or 2) with the highest Q value
        is returned, with ties going to the lowest action number.
    """
    # NOTE(review): int() truncates the rounded value to a whole number. For
    # MountainCar's documented ranges this presumably collapses the state
    # space to very few buckets -- confirm. It is left unchanged here because
    # updateQTable builds its keys the same way and the two must agree.
    position = int(round(observation[0], 1))
    velocity = int(round(observation[1], 1))

    # Unseen state: create zero entries for every action so the greedy branch
    # can be used next time, then explore.
    if (position, velocity, 0) not in qTable:
        for a in range(3):
            qTable.setdefault((position, velocity, a), 0)
        return env.action_space.sample()

    # Explore with probability epsilon.
    if np.random.random_sample() < epsilon:
        return env.action_space.sample()

    # Exploit: return the best action itself, not the last loop index.
    return max(range(3), key=lambda a: qTable[(position, velocity, a)])

Define Update Q Table Function

- This function is passed 2 observations (the previous one and the new one), an action, and a reward
- The position and velocity for each observation are extracted
- If the new observation's position, velocity, and action are not in the Q Table, entries for all three actions are added with a value of 0
- If they are in the table, the maximum Q value across the three actions for that state is extracted
- The previous observation is then checked against the Q Table
- If its values do not exist, they are added
- The Q Table entry for the previous observation and action is then updated using the alpha and beta calculation

In [None]:
def updateQTable(previousObservation, observation, action, reward):
    """Apply one Q-learning update for a single transition.

    Args:
        previousObservation: (position, velocity) before the action was taken.
        observation: (position, velocity) after the action was taken.
        action: The action that was taken (0, 1 or 2).
        reward: Reward credited to the transition.

    The entry for the previous state and ``action`` is moved towards
    ``reward + beta * max_a Q(next state, a)`` by a fraction ``alpha``.
    States that are not in the table yet get zero entries for every action.
    """
    # Both states are discretised exactly as getAction does, so keys line up.
    nextState = (int(round(observation[0], 1)), int(round(observation[1], 1)))
    prevState = (
        int(round(previousObservation[0], 1)),
        int(round(previousObservation[1], 1)),
    )
    actions = range(3)

    if (nextState + (action,)) in qTable:
        # Known next state: bootstrap from its best action value.
        bestNext = max(qTable[nextState + (a,)] for a in actions)
    else:
        # Unknown next state: nothing to bootstrap from yet.
        bestNext = 0
        qTable.update({nextState + (a,): 0 for a in actions})

    target = prevState + (action,)
    if target not in qTable:
        qTable.update({prevState + (a,): 0 for a in actions})

    current = qTable[target]
    qTable[target] = current + alpha * (reward + beta * bestNext - current)

Main Function

- For each new episode, an observation is taken to start
- The beta value is also reduced by 1%
- For each step, the Get Action is called based on the initial observation
- Based on the result of that function, an action is taken
- A reward is calculated from the resulting observation (its position and velocity), and the Q Table is updated accordingly
- The alpha value is also reduced by 1%
- If the position in the new observation is greater than or equal to 0.5, this is considered reaching the flag, and the episode is completed

In [None]:
if __name__ == '__main__':
    # Train for a fixed number of episodes, printing progress as we go.
    # The loop variable is named `episode` rather than `_`, which notebooks
    # reserve for the last cell result.
    for episode in range(episodes):
        print("Episode: ", episode)

        observation = env.reset()
        previousObservation = observation
        # Decay beta by 1% per episode.
        beta *= 0.99

        for step in range(steps):
            action = getAction(previousObservation, step)
            # NOTE(review): `done` is ignored, so stepping continues even if
            # the environment reports the episode as finished -- confirm
            # whether the loop should also stop on `done`.
            observation, reward, done, info = env.step(action)
            # Replace the environment's reward with a shaped one based on the
            # new position and velocity.
            reward = abs(observation[0]) - 0.5 + abs(observation[1])
            updateQTable(previousObservation, observation, action, reward)
            # Decay alpha by 1% per step.
            alpha *= 0.99

            # A position of 0.5 or more is treated as reaching the flag.
            if observation[0] >= 0.5:
                print("Number of Steps: ", step)
                break

            # Advance the state so the next action and Q update use the
            # latest observation instead of the episode's initial one.
            previousObservation = observation
    env.close()
