In [1]:
# Clone the highway-env and finite-mdp GitHub repositories into the '/content/' directory.
!git clone https://github.com/eleurent/highway-env.git
!git clone https://github.com/eleurent/finite-mdp.git

Cloning into 'highway-env'...
remote: Enumerating objects: 7516, done.[K
remote: Counting objects: 100% (155/155), done.[K
remote: Compressing objects: 100% (91/91), done.[K
remote: Total 7516 (delta 70), reused 123 (delta 61), pack-reused 7361[K
Receiving objects: 100% (7516/7516), 22.15 MiB | 14.82 MiB/s, done.
Resolving deltas: 100% (5146/5146), done.
Cloning into 'finite-mdp'...
remote: Enumerating objects: 141, done.[K
remote: Counting objects: 100% (12/12), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 141 (delta 1), reused 4 (delta 0), pack-reused 129[K
Receiving objects: 100% (141/141), 25.21 KiB | 6.30 MiB/s, done.
Resolving deltas: 100% (45/45), done.


In [None]:
# Uninstall the highway-env package in case an issue has occurred
# while coding your new environment.
# Uncomment the command below to execute it.
# !pip uninstall -y highway-env

Uninstalling highway-env-1.1:
  Successfully uninstalled highway-env-1.1


In [2]:
# Install the highway-env and finite-mdp packages from the local clones.
!pip install /content/highway-env/
!pip install /content/finite-mdp/
# Note: press the restart button when reinstalling the highway-env
# package if prompted by the output text below.

Processing ./highway-env
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting pygame
[?25l  Downloading https://files.pythonhosted.org/packages/4b/9e/c400554dd1d0e562bd4379f35ad5023c68fc120003a58991405850f56f95/pygame-2.0.1-cp37-cp37m-manylinux1_x86_64.whl (11.8MB)
[K     |████████████████████████████████| 11.8MB 349kB/s 
Building wheels for collected packages: highway-env
  Building wheel for highway-env (PEP 517) ... [?25l[?25hdone
  Created wheel for highway-env: filename=highway_env-1.2-cp37-none-any.whl size=92210 sha256=9d06c757c69499d772d6eea6458a65bcf52143c7c7b3e0e4a2b2f84e1f91438e
  Stored in directory: /root/.cache/pip/wheels/86/f8/1a/561333f2df5a08999032373dfa6641cc1cd21b091cc818e8ff
Successfully built highway-env
Installing collected packages: pygame, highway-env
Successfully installed highway-env-

In [3]:
# general package imports
import os
import time
# RL specific package imports
import gym
import highway_env

In [4]:
# plotting-specific imports, global figure settings and version report
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import matplotlib.animation as animation
from mpl_toolkits.mplot3d import Axes3D  # presumably kept for the '3d' projection used when plotting

# use a larger default font size for every figure
plt.rcParams.update({'font.size': 20})

# report the library versions in use (numpy first, then matplotlib)
print('numpy: ', np.version.full_version, sep='')
print('matplotlib: ', matplotlib.__version__, sep='')

numpy: 1.19.5
matplotlib: 3.2.2


In [5]:
# creating an instance of the highway ("highway-v0") environment
env_h = gym.make("highway-v0")
# converting the highway environment into a finite MDP
mdp_h = env_h.unwrapped.to_finite_mdp()

In [6]:
# report the (states, actions) shape of the MDP transition table
print("Lane change task MDP shape: ", mdp_h.transition.shape, sep="")

Lane change task MDP shape: (120, 5)


In [7]:
# generic function implementation for MDP data plotting
def plot_3d_fig(data, img_name, x_deg=-20, y_deg=-40, show_flag=False):
    """Render a 2-D array as a 3-D surface and save it under ``output/``.

    The plotted grid is derived from ``data`` itself, so any
    ``(states, actions)``-shaped matrix can be drawn without relying on a
    notebook-level MDP object.

    Args:
        data: 2-D array-like indexed as ``data[state, action]``.
        img_name: File name of the saved image (written inside ``output/``).
        x_deg: Elevation angle handed to ``ax.view_init``.
        y_deg: Azimuth angle handed to ``ax.view_init``.
        show_flag: When false (default) the figure is closed after saving;
            when true it is left open.

    Raises:
        ValueError: If ``data`` is not two-dimensional.
    """
    surface = np.asarray(data)
    if surface.ndim != 2:
        raise ValueError(
            'plot_3d_fig expects a 2-D (states x actions) array, got shape %s'
            % (surface.shape,)
        )

    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs('output', exist_ok=True)

    fig = plt.figure(figsize=(10, 10), dpi=100)
    ax = plt.axes(projection='3d')

    # 'ij' indexing gives X[i, j] == i (state) and Y[i, j] == j (action),
    # both with the same shape as the plotted surface
    n_states, n_actions = surface.shape
    X, Y = np.meshgrid(np.arange(n_states), np.arange(n_actions), indexing='ij')

    ax.plot_surface(X, Y, surface, cmap='magma', rstride=1, cstride=1,
                    linewidth=0, alpha=0.7)
    ax.view_init(x_deg, y_deg)
    ax.set_xlabel("States")
    ax.set_ylabel("Actions")
    plt.savefig(os.path.join('output', img_name))

    # To switch off the display output of plot.
    if not show_flag:
        plt.close(fig)

In [8]:
# render the deterministic MDP's transition table (next state for every
# state/action pair) with the default viewing angles
plot_3d_fig(
    data=mdp_h.transition,
    img_name='lane_change_task_transition_matrix.png',
)

In [9]:
# render the deterministic MDP's reward table (reward for every
# state/action pair) with the default viewing angles
plot_3d_fig(
    data=mdp_h.reward,
    img_name='lane_change_task_reward_matrix.png',
)

In [20]:
# Buffers filled in place by value_iteration():
#   val_func_array - per state, the best Q-value stored at the greedy action
#   val_cumu_array - the Q-values of every (state, action) pair
# They are sized from mdp_h (built earlier from "highway-v0") so this cell
# runs in notebook order; the MDP solved later is built from the same
# environment id, and both report 120 states x 5 actions in the outputs.
val_func_array = np.zeros(mdp_h.transition.shape)
val_cumu_array = np.zeros(mdp_h.transition.shape)

In [21]:
# Greedy policy extraction for the deterministic finite MDP of the
# highway (lane change) environment.
def determine_policy(v, gamma=1.0, model=None):
    """Return the greedy action of every state under value function ``v``.

    For each state ``s`` and action ``a`` the one-step look-ahead
    ``reward[s, a] + gamma * v[transition[s, a]]`` is evaluated and the
    action with the largest value is selected (ties go to the lowest
    action index, as with ``np.argmax``).

    Args:
        v: 1-D array of state values, indexable by the entries of the
            transition table.
        gamma: Discount factor applied to the successor state's value.
        model: Object exposing integer ``transition`` and numeric ``reward``
            arrays of shape ``(n_states, n_actions)``. Defaults to the
            notebook-level ``mdp``.

    Returns:
        A float array of length ``n_states`` holding the greedy action
        index of every state (float dtype is kept for compatibility with
        the previous implementation).
    """
    if model is None:
        # fall back to the notebook-level MDP for existing callers
        model = mdp

    state_values = np.asarray(v)
    transition = np.asarray(model.transition)
    reward = np.asarray(model.reward)

    # the action set is given by the columns of the transition table, so
    # no separate environment object is needed
    q_values = reward + gamma * state_values[transition]
    return np.argmax(q_values, axis=1).astype(float)

In [22]:
# value iteration algorithm's baseline implementation
def value_iteration(env, gamma=0.99, model=None, best_q_out=None, all_q_out=None):
    """Run synchronous value iteration on a deterministic finite MDP.

    Each sweep computes ``Q[s, a] = reward[s, a] + gamma * V_prev[transition[s, a]]``
    for all state/action pairs and sets ``V[s] = max_a Q[s, a]``. Iteration
    stops once the summed absolute change of ``V`` is at most ``1e-10`` or
    after 10000 sweeps.

    Side effects: the two output buffers are overwritten on every sweep so
    that, on return, they describe the final sweep only.

    Args:
        env: Environment whose ``action_space.n`` gives the number of actions.
        gamma: Discount factor.
        model: Object exposing integer ``transition`` and numeric ``reward``
            arrays of shape ``(n_states, n_actions)``. Defaults to the
            notebook-level ``mdp``.
        best_q_out: ``(n_states, n_actions)`` buffer receiving, per state,
            the maximal Q-value at the greedy action and zero elsewhere.
            Defaults to the notebook-level ``val_func_array``.
        all_q_out: ``(n_states, n_actions)`` buffer receiving every Q-value.
            Defaults to the notebook-level ``val_cumu_array``.

    Returns:
        1-D array with the value of every state.
    """
    # fall back to the notebook-level objects for existing callers
    if model is None:
        model = mdp
    if best_q_out is None:
        best_q_out = val_func_array
    if all_q_out is None:
        all_q_out = val_cumu_array

    n_actions = env.action_space.n
    transition = np.asarray(model.transition)[:, :n_actions]
    reward = np.asarray(model.reward)[:, :n_actions]
    n_states = transition.shape[0]
    state_idx = np.arange(n_states)

    value = np.zeros(n_states)
    max_iterations = 10000
    eps = 1e-10

    for i in range(max_iterations):
        prev_v = value.copy()
        # one Bellman backup for all (state, action) pairs at once
        q_values = reward + gamma * prev_v[transition]
        greedy = np.argmax(q_values, axis=1)
        value = q_values[state_idx, greedy]

        all_q_out[:, :n_actions] = q_values
        # clear the buffer first: a state's greedy action can change between
        # sweeps, and the old action's entry must not survive as a stale value
        best_q_out[...] = 0.0
        best_q_out[state_idx, greedy] = value

        if np.sum(np.fabs(prev_v - value)) <= eps:
            print('Problem converged at iteration %d.' % (i + 1))
            break
    else:
        # make a non-converged result visible instead of returning silently
        print('Value iteration did not converge within %d iterations.' % max_iterations)
    return value

In [23]:
# inline code execution for value iteration
# and policy determination functions
gamma = 0.99
env = gym.make('highway-v0')
mdp = env.unwrapped.to_finite_mdp()
# start the timer before the solver so the measurement covers both the
# value iteration and the policy extraction
start_time = time.time()
optimal_value_func = value_iteration(env, gamma)
policy = determine_policy(optimal_value_func, gamma)
elapsed_time = time.time() - start_time
print("Best Policy Values Determined for the MDP.\n")
print(policy)
print("Elapsed time: %.2f s" % elapsed_time)

Problem converged at iteration 2653.
Best Policy Values Determined for the MDP.

[3. 2. 2. 2. 2. 2. 2. 2. 2. 2. 3. 2. 2. 2. 2. 2. 2. 2. 2. 2. 3. 2. 2. 2.
 2. 2. 2. 2. 2. 2. 3. 1. 1. 1. 1. 1. 1. 1. 1. 1. 3. 2. 2. 2. 2. 2. 2. 2.
 2. 2. 3. 2. 2. 2. 2. 2. 2. 2. 2. 2. 3. 2. 2. 2. 2. 2. 2. 2. 2. 2. 3. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.
 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [24]:
# render the best-action values written by value_iteration, viewed from
# an elevation of 40 degrees and an azimuth of -45 degrees
plot_3d_fig(val_func_array, 'value_func_array.png', x_deg=40, y_deg=-45)

In [25]:
# plotting the Q-values of every (state, action) pair stored by
# value_iteration in val_cumu_array
plot_3d_fig(val_cumu_array, 'val_cumu_array.png', 40, -45)