### **Declaration of rendering components**

In [None]:
# imports for setting up diplay for the colab server.
!apt-get update is important for updating path for xvfb library.
!sudo apt-get update  > /dev/null 2>&1
!sudo apt-get install -y xvfb x11-utils  > /dev/null 2>&1 

E: The update command takes no arguments


In [None]:
# Install pinned package versions; version specificity is required for rendering videos.
# All installer output (stdout and stderr) is discarded.
!pip install gym==0.17.* pyvirtualdisplay==0.2.* PyOpenGL==3.1.* PyOpenGL-accelerate==3.1.*  > /dev/null 2>&1

In [None]:
# NumPy is used for the arrays in value approximation (the Q-table) and for
# the random action sampling in the training loop.
import numpy as np  
# Seed NumPy's global random generator.
np.random.seed(0)

In [None]:
# gym related import statements.
import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor
gymlogger.set_level(40) # level 40: report errors only
# Imports related to the virtual display and to showing recorded videos inline.
# NOTE(review): `math` is not used by any cell visible here -- confirm before removing.
import math
import glob
import io
import base64
from IPython.display import HTML
from IPython import display as ipythondisplay

In [None]:
# Create a virtual display to which the frames are sent for rendering.
from pyvirtualdisplay import Display
# visible=0 with a 1366x768 screen; the display is started immediately.
display = Display(visible=0, size=(1366, 768))
display.start()

<Display cmd_param=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1366x768x24', ':1005'] cmd=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1366x768x24', ':1005'] oserror=None return_code=None stdout="None" stderr="None" timeout_happened=False>

In [None]:
def show_video():
    '''
    Display a recorded video inline in the notebook.

    Looks for ``*.mp4`` files in the ``video/`` directory (the directory the
    Monitor in ``wrap_env`` writes to), embeds one of them as a base64 data
    URI inside an HTML ``<video>`` element and displays it.
    Prints "Could not find video" when no mp4 file is present.
    '''
    # glob returns matches in arbitrary order, so sort the names to make the
    # selection deterministic. The last name is chosen; the recorded file
    # names end in a zero-padded counter, so this is assumed to be the
    # latest recording -- TODO confirm against the Monitor's naming scheme.
    mp4list = sorted(glob.glob('video/*.mp4'))
    if len(mp4list) > 0:
        mp4 = mp4list[-1]
        # Read-only binary mode is sufficient here, and the context manager
        # guarantees the file handle is closed once the bytes are read.
        with io.open(mp4, 'rb') as video_file:
            video = video_file.read()
        encoded = base64.b64encode(video)
        ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    else: 
        print("Could not find video")
    

def wrap_env(env):
    '''
    Wrap ``env`` in the Monitor class so that its episodes are recorded.

    The Monitor is pointed at the './video' directory and is created with
    ``force=True`` (presumably so that recordings already in that directory
    do not block a new run -- verify against the gym documentation).

    Returns the Monitor-wrapped environment.
    '''
    return Monitor(env, './video', force=True)

### **Value function approximation with Q table**

__We iteratively refine value-function and policy approximations based on the agent's interactions with the environment. We cannot directly use the _Value Iteration_ or _Policy Iteration_ algorithms in this environment, because we do not have a state-transition probability and reward model table available to us, as we do in `Taxi-v3`. The code contains the two sections stated below:__
* __Basic exploration of the `MountainCar-v0` environment.__
* __Developing an agent that approximates the value function with tabular Q-learning.__

### **Basic exploration of the `MountainCar-v0` environment.**

In [None]:
# Wrapping the environment in the Monitor class.
env = wrap_env(gym.make('MountainCar-v0'))
# Fixing the randomness in the environment by seeding it.
env.seed(0)

[0]

In [None]:
# Only 3 actions are allowed: move left (0), do not move (1) and move right (2).
print('Action Space for Mountain Car Env: '+str(env.action_space))
# The observation space gives the position and speed of the agent.
print('Observation Space for Mountain Car Env: '+str(env.observation_space))

Action Space for Mountain Car Env: Discrete(3)
Observation Space for Mountain Car Env: Box(2,)


In [None]:
# NOTE: both lines print the same label; the first shows the upper bounds
# (`high`) and the second shows the lower bounds (`low`) of position and speed.
print('Observation space and speed values: '+str(env.observation_space.high))
print('Observation space and speed values: '+str(env.observation_space.low))
# Even at a numeric granularity of 0.1 for position and 0.01 for velocity,
# we can see that the solution space is huge.

Observation space and speed values: [0.6  0.07]
Observation space and speed values: [-1.2  -0.07]


### **Value function approximation with Q Learning algorithm implementation.**

In [None]:
n_states = 36 # Number of bins per observation dimension; limits the states to a small finite number.
iter_max = 10000 # Number of training episodes run by the learning loop.
gamma = 1.0 # Discount factor in learning problem.
ep_max = 10000 # Upper limit on the number of steps taken within a single episode.
initial_lr = 1.0 # Initial learning rate, before any decay is applied.
min_lr = 0.003 # Minimum value the decayed learning rate is allowed to reach.

In [None]:
def get_state_vals(env, obs):
    '''
    Map a continuous observation onto discrete (position, velocity) bin indices.

    Each observation dimension is divided into `n_states` (declared globally)
    equal-width bins between `env.observation_space.low` and
    `env.observation_space.high`.

    Parameters:
        env: environment exposing `observation_space.low` and
            `observation_space.high`.
        obs: observation; obs[0] is the position and obs[1] is the velocity.

    Returns:
        A tuple `(pos, vel)` of ints, each within [0, n_states - 1].
    '''
    env_low = env.observation_space.low
    env_high = env.observation_space.high
    env_dx = (env_high - env_low) / n_states # bin width per dimension
    last_bin = n_states - 1
    pos = int((obs[0] - env_low[0]) / env_dx[0]) # position value
    vel = int((obs[1] - env_low[1]) / env_dx[1]) # velocity value
    # An observation lying exactly on the upper bound would otherwise map to
    # index n_states, one past the last bin of an n_states-sized table, and a
    # value below the lower bound would give a negative index that silently
    # wraps around. Clamp both indices into the valid range.
    pos = min(max(pos, 0), last_bin)
    vel = min(max(vel, 0), last_bin)
    return pos, vel

In [None]:
def exec_episodes(env, policy=None, render=False):
    '''
    Run a single episode in `env` and return its discounted total reward.

    Parameters:
        env: environment providing `reset`, `step`, `render` and
            `action_space.sample`.
        policy: table indexed as policy[pos][vel] giving the action for a
            discretized state; when None a random action is sampled each step.
        render: when True, `env.render()` is called before every step.

    Returns:
        The sum over steps of `gamma ** step * reward`. The episode ends when
        the environment reports done or after `ep_max` steps.
    '''
    state = env.reset()
    episode_return = 0
    # The loop index doubles as the exponent of the discount factor.
    for t in range(ep_max):
        if render:
            env.render()
        if policy is not None:
            # Look the action up in the policy table for the discretized state.
            p_idx, v_idx = get_state_vals(env, state)
            chosen = policy[p_idx][v_idx]
        else:
            # Random agent.
            chosen = env.action_space.sample()
        state, step_reward, finished, _info = env.step(chosen)
        episode_return += gamma ** t * step_reward
        if finished:
            break
    return episode_return

In [None]:
# Initializing Q(s,a) table for value function approximation.
# Axes: position bin, velocity bin, action (3 actions); every entry starts at zero.
q_table = np.zeros((n_states, n_states, 3))

In [None]:
def eval_policy(env):
    '''
    Score the greedy policy derived from the global `q_table`.

    For every discretized state the action with the largest Q-value is
    taken; that policy is run for 100 episodes without rendering and the
    mean of the episode rewards is returned.
    '''
    greedy_policy = q_table.argmax(axis=2)
    episode_returns = []
    for _ in range(100):
        episode_returns.append(exec_episodes(env, greedy_policy, False))
    return np.mean(episode_returns)

In [None]:
# Initializing the environment again to default state.
env = wrap_env(gym.make('MountainCar-v0'))
# Fixing the environment seed value.
env.seed(0)

# Tabular Q-learning: each pass of the outer loop trains on one episode.
for i in range(iter_max):
    obs = env.reset()
    ## eta: learning rate, multiplied by 0.85 every 100 episodes and never below min_lr.
    eta = max(min_lr, initial_lr * (0.85 ** (i//100)))
    # Step through the episode, for at most ep_max steps.
    for j in range(ep_max):
        pos, vel = get_state_vals(env, obs)
        if np.random.uniform(0, 1) < 0.02: # eps = 0.02: with 2% probability take a uniformly random action.
            action = np.random.choice(env.action_space.n)
        else:
            # Otherwise sample the action from a softmax over this state's
            # Q-values (a stochastic choice, not a pure greedy argmax).
            # NOTE(review): np.exp on the raw Q-values could underflow/overflow
            # for large magnitudes; subtracting the max first would be more
            # stable -- confirm whether that matters for the value range here.
            logits = q_table[pos][vel]
            logits_exp = np.exp(logits)
            probs = logits_exp / np.sum(logits_exp)
            action = np.random.choice(env.action_space.n, p=probs)
        obs, reward, done, _ = env.step(action)
        # Q-learning update: move Q(s, a) towards
        # reward + gamma * max over a' of Q(s', a'), with step size eta.
        pos_, vel_ = get_state_vals(env, obs)
        q_table[pos][vel][action] = q_table[pos][vel][action] + eta * (reward + gamma *  np.max(q_table[pos_][vel_]) - q_table[pos][vel][action])
        if done:
            break 
    # Report progress once every 1000 episodes.
    if i % 1000 == 0:
        print('Iteration: %d has been completed.' %(i+1))

Iteration: 1 has been completed.
Iteration: 1001 has been completed.
Iteration: 2001 has been completed.
Iteration: 3001 has been completed.
Iteration: 4001 has been completed.
Iteration: 5001 has been completed.
Iteration: 6001 has been completed.
Iteration: 7001 has been completed.
Iteration: 8001 has been completed.
Iteration: 9001 has been completed.


In [None]:
# Display a video recorded by the Monitor, showing the result achieved by training.
show_video()

In [None]:
# Mean episode reward of the greedy policy derived from q_table, averaged over 100 episodes.
eval_policy(env)

-124.74

In [None]:
# Zipping the video folder recorded for the Q-learning agent.
!zip -r /content/file.zip /content/video
# Downloading the zipped file through Colab's file helper.
from google.colab import files
files.download("/content/file.zip")

updating: content/video/ (stored 0%)
  adding: content/video/openaigym.video.3.1950.video005000.mp4 (deflated 7%)
  adding: content/video/openaigym.video.3.1950.video000008.mp4 (deflated 8%)
  adding: content/video/openaigym.video.3.1950.video000729.meta.json (deflated 60%)
  adding: content/video/openaigym.video.3.1950.video000027.meta.json (deflated 60%)
  adding: content/video/openaigym.video.3.1950.video000001.mp4 (deflated 7%)
  adding: content/video/openaigym.video.3.1950.video008000.mp4 (deflated 7%)
  adding: content/video/openaigym.video.3.1950.video001000.mp4 (deflated 7%)
  adding: content/video/openaigym.video.3.1950.video005000.meta.json (deflated 60%)
  adding: content/video/openaigym.video.3.1950.video003000.meta.json (deflated 60%)
  adding: content/video/openaigym.video.3.1950.video000216.mp4 (deflated 7%)
  adding: content/video/openaigym.video.3.1950.video006000.mp4 (deflated 7%)
  adding: content/video/openaigym.video.3.1950.video000000.mp4 (deflated 7%)
  adding: c

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>