### SARSA: MountainCar problem
***


##### This notebook solves the mountain car problem (https://gym.openai.com/envs/MountainCar-v0/) using the **SARSA** reinforcement learning algorithm.


In [None]:
import base64
import glob
import io
import gym
import  numpy as np
from gym import wrappers
from IPython.display import HTML
from IPython import display as ipythondisplay
from IPython.display import Video
from tqdm import tqdm

In [None]:
# getting gym environment
def get_env(env_name):
    """Create the gym environment registered under ``env_name`` and return it."""
    environment = gym.make(env_name)
    return environment

In [None]:
# showing the video saved at ./video/ location
def show_video():
    """Embed the first ``video/*.mp4`` recording in the notebook output.

    Prints 'No videos found.' when the glob matches nothing.
    """
    recordings = glob.glob('video/*.mp4')
    if not recordings:
        print('No videos found.')
        return
    # Only the first match is shown; the markup is filled in with its path.
    markup = """
                                    <video alt="test" controls>
                                        <source src="{}" type="video/mp4">
                                    </video>
                                """.format(recordings[0])
    ipythondisplay.display(HTML(markup))

In [4]:
# setup environment
env = get_env('MountainCar-v0')
# Wrap the env so episodes are recorded under video/, the directory show_video() reads.
# NOTE(review): force=True presumably overwrites earlier recordings - confirm for the gym version in use.
env = wrappers.Monitor(env, 'video/', force=True)
# Start an episode and keep the initial observation
observation = env.reset()
env.render()

True

##### We can try to play one episode of the game, using random actions at each time step.

In [5]:
# one initial episode using random actions
# Baseline: step the environment with uniformly sampled actions until it reports done
cnt, total_reward = 0, 0
done = False
while not done:
    action = env.action_space.sample()
    observation, reward, done, _ = env.step(action)
    cnt += 1
    total_reward += reward
print("number of steps: {}".format(cnt))
print('Total reward: {}'.format(total_reward))
# Closing the env lets the Monitor wrapper finish writing its recording
env.close()

number of steps: 200
Total reward: -200.0


In [6]:
# Replay the recording of the random-action episode stored under video/
show_video()

In [7]:
# Model parameters
# Observation layout: (position, velocity)
NUM_FEATURES = 2
# Bounds used to build the discretisation grid for each feature
MIN_POS = -1.2
MAX_POS = 0.6
MIN_VELOCITY = -0.07
MAX_VELOCITY = 0.07
# Number of grid points per feature. Keep this equal to the num_bins default
# of get_discrete_bins unless the discretiser is told to use NUM_BINS.
NUM_BINS = 10
# Size of the Q-table's state axis. Derived instead of hard-coded so that it
# always matches the number of (bin, bin) tuples enumerated by map_states().
NUM_STATES = NUM_BINS ** NUM_FEATURES
NUM_ACTIONS = 3
# Multiplicative factor applied to epsilon once per episode
EPSILON_DECAY = 0.9
# Initial exploration probability for the epsilon-greedy policy
EPSILON = 0.5

In [8]:
# Getting discrete representation of continuous features
def get_discrete_bins(ranges, num_bins=10):
    """Return one array of ``num_bins`` evenly spaced points per range.

    Args:
        ranges: iterable of indexable pairs; item 0 is the lower bound and
            item 1 the upper bound of a feature.
        num_bins: number of points generated for every range (both bounds
            included).

    Returns:
        A list holding one ``np.linspace`` array per entry of ``ranges``.
    """
    return [np.linspace(bounds[0], bounds[1], num_bins) for bounds in ranges]

In [9]:
# Transforming state features from continuous to discrete
def get_discrete_state(state, bins=None):
    """Convert a continuous observation into per-feature bin indices.

    For every feature the index is the position of the first edge that is
    greater than or equal to the value. Values above the last edge are clamped
    to the last index instead of running past the end of the edge array (this
    can happen when a float32 observation sits marginally above the float64
    upper bound).

    Args:
        state: sequence of feature values, one per entry of ``bins``.
        bins: optional list of sorted edge arrays, one per feature. When
            omitted, the grid is built from the notebook's position/velocity
            bounds with ``NUM_BINS`` points per feature.

    Returns:
        Integer ``np.ndarray`` with one bin index per feature.

    Raises:
        ValueError: if ``state`` and ``bins`` have different lengths.
    """
    if bins is None:
        # Use NUM_BINS explicitly so the grid agrees with map_states()
        bins = get_discrete_bins(
            [(MIN_POS, MAX_POS), (MIN_VELOCITY, MAX_VELOCITY)],
            num_bins=NUM_BINS,
        )
    if len(state) != len(bins):
        raise ValueError(
            'state has {} features but {} bin arrays were given'.format(len(state), len(bins))
        )
    discrete_state = np.zeros(len(bins), dtype=int)
    for i, (feature, edges) in enumerate(zip(state, bins)):
        # side='left' returns the first index whose edge is >= feature
        first_edge = np.searchsorted(edges, feature, side='left')
        # Clamp so out-of-range values land in the last bin
        discrete_state[i] = min(int(first_edge), len(edges) - 1)
    return discrete_state

In [10]:
# Maps states between state number and tuple representing state features
def map_states(num_bins=None, num_features=None):
    """Build lookup tables between bin-index tuples and flat state numbers.

    States are numbered in row-major order: the last feature varies fastest,
    exactly as nested ``for`` loops over each feature would enumerate them.

    Args:
        num_bins: number of bins per feature; defaults to ``NUM_BINS``.
        num_features: number of features per state; defaults to
            ``NUM_FEATURES``.

    Returns:
        ``(state_to_num, num_to_state)``: a dict from index tuple to state
        number and its inverse.
    """
    if num_bins is None:
        num_bins = NUM_BINS
    if num_features is None:
        num_features = NUM_FEATURES
    state_to_num = {}
    num_to_state = {}
    grid_shape = [num_bins] * num_features
    for curr_num, index in enumerate(np.ndindex(*grid_shape)):
        # Normalise to plain Python ints so keys look the same as literal tuples
        state = tuple(int(k) for k in index)
        state_to_num[state] = curr_num
        num_to_state[curr_num] = state
    return state_to_num, num_to_state

In [11]:
# Build the lookup tables once; the training and playback loops read state_to_num as a global
state_to_num, num_to_state = map_states()

In [12]:
def init_Q(num_states, num_actions):
    """Return a zero-filled action-value table of shape (num_states, num_actions)."""
    table_shape = (num_states, num_actions)
    return np.zeros(table_shape)

In [13]:
# One row per discrete state, one column per action; train_sarsa updates this array in place
Q = init_Q(NUM_STATES, NUM_ACTIONS)

In [14]:
# SARSA parameters
# Step size: scales the temporal-difference error in the Q update
ALPHA = 0.1
# Discount factor: weights the next state-action value in the update target
GAMMA = 0.9

In [15]:
# epsilon-greedy policy for choosing action
def choose_action(Q, epsilon, state_num):
    """Pick an action for ``state_num`` with an epsilon-greedy policy.

    With probability ``epsilon`` a uniformly random action is returned;
    otherwise the action with the highest value in ``Q[state_num]`` (ties go
    to the lowest index, as with ``np.argmax``).

    The number of actions is taken from the width of the Q-table row rather
    than from a global constant, so the random branch can never return an
    action the table has no column for.

    Args:
        Q: action-value table indexed as ``Q[state][action]``.
        epsilon: exploration probability in ``[0, 1]``.
        state_num: row index of the current state.

    Returns:
        The chosen action as a Python ``int``.
    """
    action_values = Q[state_num]
    # np.random.random() is in [0, 1): epsilon=0 never explores, epsilon=1 always does
    if np.random.random() < epsilon:
        # perform random action
        return int(np.random.randint(len(action_values)))
    # perform the best action
    return int(np.argmax(action_values))

In [16]:
# epsilon decay over episodes
def decrease_epsilon(epsilon, episode_num):
    """Return ``epsilon`` after one multiplicative decay step.

    ``episode_num`` is accepted but not used: the result depends only on the
    current ``epsilon`` and the ``EPSILON_DECAY`` constant.
    """
    decayed_epsilon = EPSILON_DECAY * epsilon
    return decayed_epsilon

In [17]:
def train_sarsa(env, Q, num_episodes, epsilon):
    """Train the action-value table ``Q`` in place with on-policy SARSA.

    For every transition (S, A, R, S', A') the update is
    ``Q[S, A] += ALPHA * (target - Q[S, A])`` where ``target`` is
    ``R + GAMMA * Q[S', A']``, or just ``R`` when S' is terminal.
    The action A' used in the target is the action executed on the next step,
    which is what makes the update SARSA rather than a target computed from an
    action that is never taken.

    Args:
        env: gym-style environment whose ``step`` returns
            ``(observation, reward, done, info)``. It is closed on return.
        Q: action-value table indexed as ``Q[state_num, action]``; modified
            in place.
        num_episodes: number of episodes to run.
        epsilon: exploration probability before the first decay step; it is
            decayed once at the start of every episode.

    Returns:
        None.
    """
    for episode_num in tqdm(range(num_episodes)):
        epsilon = decrease_epsilon(epsilon, episode_num)
        state = _state_index(env.reset())
        # choose the first action of the episode
        action = choose_action(Q, epsilon, state)
        done = False
        while not done:
            # make step
            observation, reward, done, info = env.step(action)
            new_state = _state_index(observation)
            # get new action from new state
            new_action = choose_action(Q, epsilon=epsilon, state_num=new_state)
            # A time-limit cut-off is not a true terminal state, so keep
            # bootstrapping in that case.
            # NOTE(review): relies on gym's TimeLimit wrapper setting
            # info['TimeLimit.truncated']; without the key every done counts as terminal.
            terminal = done and not info.get('TimeLimit.truncated', False)
            if terminal:
                target = reward
            else:
                target = reward + GAMMA * Q[new_state, new_action]
            # update Q value
            Q[state, action] += ALPHA * (target - Q[state, action])
            # carry S' and A' forward so the next step executes A'
            state, action = new_state, new_action
    env.close()


def _state_index(observation):
    """Map a raw observation to its row number in the Q-table."""
    return state_to_num[tuple(get_discrete_state(observation))]

In [18]:
# Fresh, unwrapped environment for training (no Monitor wrapper, so nothing is recorded)
env =  get_env('MountainCar-v0')

In [None]:
# Q is updated in place, so re-running this cell continues from the current table.
# The starting exploration rate comes from the EPSILON config constant (0.5).
train_sarsa(env, Q, num_episodes=10000, epsilon=EPSILON)

 61%|███████████████████████████████████████████████████████████████████████████████▊                                                   | 6091/10000 [04:55<02:48, 23.20it/s]

In [None]:
def play_optimal(env, Q):
    """Play one recorded episode, always taking the highest-valued action.

    The environment is wrapped in a Monitor that writes to ``video/``, and the
    wrapper is closed even if the episode raises, so the recording is not left
    open.

    Args:
        env: gym-style environment whose ``step`` returns
            ``(observation, reward, done, info)``.
        Q: action-value table indexed as ``Q[state_num]``.

    Returns:
        None. The total reward of the episode is printed.
    """
    # setup recording
    env = wrappers.Monitor(env, 'video/', force=True)
    total_reward = 0
    try:
        observation = env.reset()
        env.render()
        # play one episode
        done = False
        while not done:
            discrete_observation = get_discrete_state(observation)
            state_num = state_to_num[tuple(discrete_observation)]
            # greedy action; cast the numpy integer to a plain int
            action = int(np.argmax(Q[state_num]))
            observation, reward, done, _ = env.step(action)
            total_reward += reward

        print('Total reward: {}'.format(total_reward))
    finally:
        env.close()

In [None]:
# New environment instance for playback; train_sarsa closed the one it was given
env =  get_env('MountainCar-v0')

In [None]:
# Display the learned action values (one row per discrete state, one column per action)
Q

In [None]:
# Record one greedy episode using the trained table
play_optimal(env, Q)

In [None]:
# Replay the recording written by play_optimal under video/
show_video()