In [1]:
# Cloning the github branch in the '/content/' directory.
# installing the highway-env package.
!pip install git+https://github.com/eleurent/highway-env.git --quiet

# Cloning the github branch in the '/content/' directory.
# installing the finite-mdp package.
!pip install git+https://github.com/eleurent/finite-mdp.git --quiet

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
[K     |████████████████████████████████| 11.8MB 250kB/s 
[?25h  Building wheel for highway-env (PEP 517) ... [?25l[?25hdone
  Building wheel for finite-mdp (setup.py) ... [?25l[?25hdone


In [None]:
# Uninstalling the highway-env package in case an issue has happened
# while coding your new environment.
# Uncomment the command below and execute it once.
# !pip uninstall -y highway-env

Uninstalling highway-env-1.1:
  Successfully uninstalled highway-env-1.1


In [2]:
# general package imports
import os
import time
# RL specific package imports
import gym
import highway_env

In [3]:
# plotting-related imports; the version of each library is echoed
# right after the library has been loaded
import numpy as np
print(f'numpy: {np.version.full_version}')
import matplotlib.pyplot as plt
plt.rcParams['font.size'] = 20
import matplotlib.image as mpimg
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.animation as animation
import matplotlib
print(f'matplotlib: {matplotlib.__version__}')

numpy: 1.19.5
matplotlib: 3.2.2


In [4]:
# creating an instance of the highway (lane change) environment
env_h = gym.make("highway-v0")
# converting the highway environment into a finite mdp
# (`.unwrapped` bypasses the gym wrappers to reach the underlying env object)
mdp_h = env_h.unwrapped.to_finite_mdp()

In [5]:
# report the (states, actions) shape of the transition table
print(f"Lane change task MDP shape: {mdp_h.transition.shape}")

Lane change task MDP shape: (120, 5)


In [6]:
# generic function implementation for MDP data plotting 
def plot_3d_fig(data, img_name, x_deg=-20, y_deg=-40, show_flag=False):
    """Draw a 2-D matrix as a 3-D surface and save it to ``output/<img_name>``.

    Args:
        data: 2-D array-like. Row indices are plotted on the "States" axis,
            column indices on the "Actions" axis, and the entries as height.
        img_name: File name (including extension) of the image written
            inside the ``output`` directory.
        x_deg: Elevation angle forwarded to ``Axes3D.view_init``.
        y_deg: Azimuth angle forwarded to ``Axes3D.view_init``.
        show_flag: When falsy, the figure is closed after saving so it is
            not rendered inline; when truthy, the figure is left open.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs('output', exist_ok=True)

    # The grid is derived from the data itself, so the function works for any
    # matrix shape and does not depend on a global MDP object.
    Z = np.asarray(data)
    n_rows, n_cols = Z.shape
    # 'ij' indexing gives X[i, j] == i and Y[i, j] == j, matching Z[i, j].
    X, Y = np.meshgrid(np.arange(n_rows), np.arange(n_cols), indexing='ij')

    fig = plt.figure(figsize=(10, 10), dpi=100)
    ax = fig.add_subplot(111, projection='3d')
    ax.plot_surface(X, Y, Z, cmap='magma', rstride=1, cstride=1, linewidth=0, alpha=0.7)
    ax.view_init(x_deg, y_deg)
    ax.set_xlabel("States")
    ax.set_ylabel("Actions")
    fig.savefig(os.path.join('output', img_name))

    # Closing the figure suppresses the inline display of the plot.
    if not show_flag:
        plt.close(fig)

In [7]:
# surface plot of the deterministic MDP's transition table
# (next state for every state/action pair)
plot_3d_fig(
    mdp_h.transition,
    'lane_change_task_transition_matrix.png',
    x_deg=-25,
    y_deg=-45,
)

In [8]:
# surface plot of the deterministic MDP's reward table
# (reward for every state/action pair)
plot_3d_fig(
    mdp_h.reward,
    'lane_change_task_reward_matrix.png',
    x_deg=-25,
    y_deg=-45,
)

In [9]:
# Module-level buffers that value_iteration fills in place.
# Both are shaped like the MDP's (states, actions) transition table instead of
# hard-coding the number of actions.
#   val_func_array: per state, the best action value stored in the greedy action's column
#   val_cumu_array: per state, the action values of every action
val_func_array = np.zeros(mdp_h.transition.shape)
val_cumu_array = np.zeros(mdp_h.transition.shape)

In [10]:
# this extracts the greedy (deterministic) policy from a value function
# for the deterministic finite-MDP version of the highway environment
def determine_policy(mdp, v, gamma=1.0):
    """Return the greedy policy of a deterministic finite MDP for value function ``v``.

    For every state ``s`` and action ``a`` the action value is
    ``mdp.reward[s, a] + gamma * v[mdp.transition[s, a]]``; the policy picks the
    action with the largest value (the lowest action index wins ties).

    The number of actions is taken from the MDP's own tables, so the function
    no longer depends on a global ``env`` object being defined.

    Args:
        mdp: Object exposing ``transition`` (integer next-state index per
            state/action pair) and ``reward`` (reward per state/action pair),
            both 2-D and indexed ``[state, action]``.
        v: 1-D array-like of state values, indexable by the entries of
            ``mdp.transition``.
        gamma: Discount factor applied to the successor state's value.

    Returns:
        1-D float ``numpy`` array with one greedy action index per state.
    """
    transition = np.asarray(mdp.transition)
    reward = np.asarray(mdp.reward)
    # Fancy indexing looks up the successor value for every (state, action) at once.
    q_values = reward + gamma * np.asarray(v)[transition]
    # Kept as float to match the dtype the policy array has always had.
    return q_values.argmax(axis=1).astype(float)

In [11]:
# value iteration algorithm's baseline implementation
def value_iteration(mdp, env, gamma=0.99):
    """Run synchronous value iteration on a deterministic finite MDP.

    Each sweep computes, for every state ``s`` and action ``a``,
    ``q[s, a] = mdp.reward[s, a] + gamma * prev_v[mdp.transition[s, a]]``
    from the previous sweep's values and sets ``value[s] = max_a q[s, a]``.
    Iteration stops once the summed absolute change of the values is at most
    ``eps`` or after ``max_iterations`` sweeps.

    Side effects:
        Overwrites the module-level buffers in place on every sweep:
        ``val_cumu_array`` receives the full ``q`` table and ``val_func_array``
        receives ``value[s]`` in the greedy action's column of row ``s``.
        Rows of ``val_func_array`` are cleared first, so a column that was
        greedy only in an earlier sweep does not keep a stale value.

    Args:
        mdp: Object exposing ``transition`` (integer next-state index) and
            ``reward``, both 2-D and indexed ``[state, action]``.
        env: Environment whose ``action_space.n`` gives the number of actions.
        gamma: Discount factor.

    Returns:
        1-D ``numpy`` array with the value of every state after the final sweep.
    """
    max_iterations = 10000
    eps = 1e-10

    n_states = mdp.transition.shape[0]
    n_actions = env.action_space.n
    # Only the first n_actions columns are considered, as given by the env.
    transition = np.asarray(mdp.transition)[:, :n_actions]
    reward = np.asarray(mdp.reward)[:, :n_actions]
    state_idx = np.arange(n_states)

    value = np.zeros(n_states)
    for i in range(max_iterations):
        prev_v = value
        # All states are backed up from prev_v at once (synchronous update).
        q_values = reward + gamma * prev_v[transition]
        value = q_values.max(axis=1)
        greedy = q_values.argmax(axis=1)

        val_cumu_array[:n_states, :] = q_values
        # Reset before writing so only the current greedy column is non-zero.
        val_func_array[:n_states, :] = 0.0
        val_func_array[state_idx, greedy] = value

        if np.sum(np.fabs(prev_v - value)) <= eps:
            print('Problem converged at iteration %d.' % (i + 1))
            break
    return value

In [12]:
# inline code execution for value iteration
# and policy determination functions
#
# The environment and MDP created earlier are reused instead of building a
# second environment instance, so the value function and policy refer to the
# same MDP whose transition/reward matrices are plotted and whose shape sizes
# the result buffers.
gamma = 0.99
env = env_h
mdp = mdp_h
# The timer starts before the solve so that it actually measures it.
start_time = time.time()
optimal_value_func = value_iteration(mdp, env, gamma)
policy = determine_policy(mdp, optimal_value_func, gamma)
print("Best Policy Values Determined for the MDP.\n")
print(policy)
print("Elapsed time: %.2f s" % (time.time() - start_time))

Problem converged at iteration 2646.
Best Policy Values Determined for the MDP.

[3. 2. 2. 2. 2. 2. 2. 2. 2. 2. 3. 2. 2. 2. 2. 2. 2. 2. 2. 2. 3. 2. 2. 2.
 2. 2. 2. 2. 2. 2. 3. 1. 1. 1. 1. 1. 1. 1. 1. 1. 3. 2. 2. 2. 2. 2. 2. 2.
 2. 2. 3. 2. 2. 2. 2. 2. 2. 2. 2. 2. 3. 2. 2. 2. 2. 2. 2. 1. 1. 1. 3. 1.
 1. 1. 1. 1. 1. 0. 0. 0. 2. 2. 2. 0. 0. 0. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.
 2. 2. 2. 2. 2. 1. 1. 1. 2. 2. 1. 1. 1. 1. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0.]


In [13]:
# surface plot of the value stored at each state's greedy action
plot_3d_fig(
    val_func_array,
    'value_func_array.png',
    x_deg=40,
    y_deg=-45,
)

In [14]:
# surface plot of the action values of every state/action pair
plot_3d_fig(
    val_cumu_array,
    'value_cumu_array.png',
    x_deg=40,
    y_deg=145,
)

In [None]:
# Bundle the generated plots in /content/output/ into a single zip archive
# and hand that archive to Colab's download helper.
# NOTE: the absolute /content/ paths and the google.colab import make this
# cell specific to Google Colab.
!zip -r /content/output.zip /content/output/
from google.colab import files
files.download("/content/output.zip")

updating: content/output/ (stored 0%)
updating: content/output/lane_change_task_transition_matrix.png (deflated 4%)
updating: content/output/lane_change_task_reward_matrix.png (deflated 5%)
updating: content/output/value_func_array.png (deflated 4%)
updating: content/output/value_cumu_array.png (deflated 4%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>