In [21]:
import gym
import sys
import numpy as np

from gym import error, spaces, utils
from copy import deepcopy

import matplotlib.pyplot as plt

from tqdm import tqdm

from collections import defaultdict

In [22]:
# Build the MountainCar environment and report the size of its action space.
env = gym.make("MountainCar-v0")
print(env.action_space.n)

# Observations have two components (position, velocity): print the upper
# bounds first, then the lower bounds.
for bounds in (env.observation_space.high, env.observation_space.low):
    print(bounds)

3
[0.6  0.07]
[-1.2  -0.07]


In [23]:
# Bare expression: the notebook shows the size of the action space as the cell output.
env.action_space.n

3

In [24]:
# Number of discrete bins per observation dimension: scale the width of each
# range (position x10, velocity x100), round to the nearest integer, and add
# one so that both end points get their own bin.
bin_scale = np.array([10, 100])
obs_range = env.observation_space.high - env.observation_space.low
num_states = np.round(obs_range * bin_scale, 0).astype(int) + 1
num_states

array([19, 15])

In [25]:
# Randomly initialised action-value table with one entry per
# (position bin, velocity bin, action), drawn uniformly between -1 and 1.
q_table_shape = (num_states[0], num_states[1], env.action_space.n)
Q = np.random.uniform(low=-1, high=1, size=q_table_shape)
Q.shape

(19, 15, 3)

In [26]:
# Reference note on the MountainCar observation space. Because this bare string
# is the last expression of the cell, the notebook echoes its repr as output.
'''
        The MountainCar env:

        Observation Space: there are two observables - position and velocity
        position of the car along the x-axis  in range [-1.2, 0.6]
        velocity of the car  in range [-0.07, 0.07]
'''

'\n        The MountainCar env:\n\n        Observation Space: there are two observables - position and velocity\n        position of the car along the x-axis  in range [-1.2, 0.6]\n        velocity of the car  in range [-0.07, 0.07]\n'

In [27]:
class Q_learning_agent(object):
    """Tabular Q-learning agent with an epsilon-greedy behaviour policy.

    States are pairs of integer bin indices ``(position_bin, velocity_bin)``
    that index the action-value table ``self.Q`` of shape
    ``(num_states[0], num_states[1], act_n)``.
    """

    def __init__(self, obs, act_n = 3, learning_rate=0.01, gamma=0.9, e_greed=0.1):
        """Create the agent and randomly initialise its Q-table.

        Args:
            obs: Object exposing array-like ``high`` and ``low`` attributes
                holding the upper/lower bounds of (position, velocity).
            act_n: Number of discrete actions.
            learning_rate: Step size applied to the temporal-difference error.
            gamma: Discount factor applied to the next-state value.
            e_greed: Probability of taking a uniformly random action.
        """
        self.lr = learning_rate
        self.gamma = gamma
        self.epsilon = e_greed

        # Number of bins per observation dimension. For bounds
        # position in [-1.2, 0.6] and velocity in [-0.07, 0.07]:
        #   round([0.6 - (-1.2), 0.07 - (-0.07)] * [10, 100]) + 1
        #     = [18, 14] + 1 = [19, 15]
        # i.e. 19 distinct x-positions and 15 distinct velocities.
        n_states = (obs.high - obs.low) * np.array([10, 100])
        self.num_states = np.round(n_states, 0).astype(int) + 1

        # Actions (for the default of three):
        #   0: accelerate to the left
        #   1: don't accelerate
        #   2: accelerate to the right
        self.act_n = act_n

        # Q has shape (num_states[0], num_states[1], act_n), e.g. (19, 15, 3),
        # and starts with values drawn uniformly between -1 and 1.
        self.Q = np.random.uniform(
            low=-1, high=1,
            size=(self.num_states[0], self.num_states[1], self.act_n))

    def policy(self, state):
        """Pick an action epsilon-greedily.

        With probability ``1 - epsilon`` the greedy action from ``predict`` is
        returned; otherwise an action is drawn uniformly from ``range(act_n)``.
        """
        if np.random.uniform(0, 1) < (1.0 - self.epsilon):
            action = self.predict(state)
        else:
            action = np.random.choice(self.act_n)

        return action

    def predict(self, state):
        """Return the greedy action (argmax over actions) for ``state``."""
        # Must read the agent's own table: using a module-level ``Q`` here
        # would ignore every update made by ``learn``.
        action = np.argmax(self.Q[state[0], state[1]])
        return action

    def learn(self, state, action, reward, state_prime, action_prime, done):
        """Apply one Q-learning update for the observed transition.

        Args:
            state: Discretized state the action was taken in.
            action: Action index that was taken.
            reward: Reward received for the transition.
            state_prime: Discretized next state.
            action_prime: Next action chosen by the caller. Unused: the target
                bootstraps from the max over next-state actions. Kept so the
                call signature stays unchanged.
            done: Whether the episode ended on this transition; if so the
                target is the reward alone (no bootstrapping).
        """
        predicted_q = self.Q[state[0], state[1], action]

        if done:
            target_q = reward
        else:
            target_q = reward + self.gamma * np.max(self.Q[state_prime[0], state_prime[1]])

        self.Q[state[0], state[1], action] += self.lr * (target_q - predicted_q)

In [28]:
def _discretize_observation(observation, low):
    """Map a continuous (position, velocity) observation to integer bin indices."""
    # Offset from the lower bound, scaled (position x10, velocity x100) and
    # rounded to the nearest integer so it can index the agent's Q-table.
    scaled = (observation - low) * np.array([10, 100])
    return np.round(scaled, 0).astype(int)


def run_episode(env, agent, is_render):
    """Run one episode, letting ``agent`` learn after every step.

    Args:
        env: Gym-style environment. Both the classic API (``reset() -> obs``,
            ``step() -> (obs, reward, done, info)``) and the gym >= 0.26 API
            (``reset() -> (obs, info)``,
            ``step() -> (obs, reward, terminated, truncated, info)``) are
            accepted.
        agent: Object providing ``policy(state)`` and
            ``learn(state, action, reward, next_state, next_action, done)``.
        is_render: If truthy, ``env.render()`` is called before every step.

    Returns:
        The sum of rewards collected during the episode.
    """
    low = env.observation_space.low

    reset_result = env.reset()
    # Newer gym versions return (observation, info) from reset().
    if isinstance(reset_result, tuple):
        reset_result = reset_result[0]
    state = _discretize_observation(reset_result, low)

    action = agent.policy(state)

    total_reward = 0
    while True:
        if is_render:
            env.render()

        step_result = env.step(action)
        if len(step_result) == 5:
            # Newer API splits the end-of-episode flag in two; treat either
            # one as "done", matching what the classic single flag reports.
            S_prime, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            S_prime, reward, done, _ = step_result

        S_prime = _discretize_observation(S_prime, low)

        # The next action is chosen before learning and then carried into the
        # next loop iteration.
        action_prime = agent.policy(S_prime)

        agent.learn(state, action, reward, S_prime, action_prime, done)

        action = action_prime
        state = S_prime
        total_reward += reward

        if done:
            break

    return total_reward

In [29]:
def main(num_episodes=500, render_every=20):
    """Train a Q-learning agent on MountainCar-v0 and return its reward history.

    Args:
        num_episodes: Number of training episodes to run.
        render_every: Render every ``render_every``-th episode (episodes 0,
            ``render_every``, 2 * ``render_every``, ...). ``None`` or a
            non-positive value disables rendering.

    Returns:
        A list with the total reward of each episode, in order.
    """
    env = gym.make("MountainCar-v0")

    q_agent = Q_learning_agent(obs=env.observation_space, act_n=env.action_space.n,
        learning_rate=0.5, gamma=0.9, e_greed=0.1)

    q_reward_list = []

    try:
        for episode in range(num_episodes):
            # Recompute the flag every episode: a flag that is only ever set
            # to True would stay on after episode 0 and render every episode.
            is_render = (render_every is not None
                         and render_every > 0
                         and episode % render_every == 0)
            q_reward = run_episode(env, q_agent, is_render)
            q_reward_list.append(q_reward)
    finally:
        # Release the environment (and any render window) even if an
        # episode raises.
        env.close()

    return q_reward_list

In [30]:
# Run the training loop defined in main() and keep the list of per-episode total rewards it returns.
reward_history = main()

TypeError: 'numpy.float64' object cannot be interpreted as an integer