In [1]:
import argparse

import gym

In [2]:
def build_arg_parser():
    """Build the command-line parser used to choose an environment.

    Returns:
        argparse.ArgumentParser: Parser with one required option,
        ``--input-env`` (stored as ``input_env``), limited to the
        environment names listed in ``env_choices``.
    """
    env_choices = ['cartpole', 'mountaincar', 'pendulum', 'taxi', 'lake']

    arg_parser = argparse.ArgumentParser(description='Run an environment')
    arg_parser.add_argument(
        '--input-env',
        dest='input_env',
        required=True,
        choices=env_choices,
        help='Specify the name of the environment',
    )
    return arg_parser

In [3]:
# Define the main function and parse the input arguments,
def run(input_env):
    """Create the requested Gym environment and drive it with random actions.

    The environment is rendered and advanced by one randomly sampled action
    on each of 1000 iterations. When an episode ends, the environment is
    reset so that stepping can continue. The environment is always closed
    on exit.

    Args:
        input_env: Short environment name; must be a key of ``name_map``
            ('cartpole', 'mountaincar', 'pendulum', 'taxi' or 'lake').

    Raises:
        KeyError: If ``input_env`` is not a key of ``name_map``.
    """
    # Mapping from the input argument string to the environment IDs used by
    # the OpenAI Gym package.
    # NOTE(review): some of these IDs (e.g. 'Pendulum-v0', 'Taxi-v1',
    # 'FrozenLake-v0') may not be registered in newer Gym releases - confirm
    # against the installed version.
    name_map = {'cartpole': 'CartPole-v1',
                'mountaincar': 'MountainCar-v0',
                'pendulum': 'Pendulum-v0',
                'taxi': 'Taxi-v1',
                'lake': 'FrozenLake-v0'}

    # Create the environment.
    env = gym.make(name_map[input_env])

    try:
        env.reset()

        # Iterate 1000 times, rendering and taking one action per step.
        for _ in range(1000):
            env.render()

            # step() returns (obs, reward, done, info) in the old step API and
            # (obs, reward, terminated, truncated, info) in the new one; the
            # starred target collects whichever end-of-episode flags exist.
            _, _, *end_flags, _ = env.step(env.action_space.sample())
            if any(end_flags):
                # Stepping a finished episode is not meaningful; start anew.
                env.reset()
    finally:
        # Release the rendering resources even if a step fails.
        env.close()


In [5]:
pip install pygame

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pygame
  Downloading pygame-2.1.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (21.8 MB)
[K     |████████████████████████████████| 21.8 MB 1.4 MB/s 
[?25hInstalling collected packages: pygame
Successfully installed pygame-2.1.2


In [6]:

# Set SDL_VIDEODRIVER before pygame is imported.
# NOTE(review): presumably 'dummy' selects SDL's headless video driver so that
# rendering works without a physical display (e.g. on a hosted notebook) - confirm.
import os
os.environ['SDL_VIDEODRIVER']='dummy'
import pygame
# Create a 640x480 pygame display surface before the environment renders.
pygame.display.set_mode((640,480))
# Run the 'cartpole' environment through run().
run("cartpole")

  "Initializing wrapper in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future."
  "Initializing environment in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future."
If you want to render in human mode, initialize the environment in this way: gym.make('EnvName', render_mode='human') and don't call the render method.
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  "You are calling render method, "


In [7]:
# Run the 'mountaincar' environment through run().
run("mountaincar")

In [8]:
# Building a learning agent

In [9]:
def build_arg_parser():
    """Build the command-line parser used to choose an environment.

    Returns:
        argparse.ArgumentParser: Parser with one required option,
        ``--input-env`` (stored as ``input_env``), limited to the
        environment names listed in ``env_choices``.
    """
    env_choices = ['cartpole', 'mountaincar', 'pendulum']

    arg_parser = argparse.ArgumentParser(description='Run an environment')
    arg_parser.add_argument(
        '--input-env',
        dest='input_env',
        required=True,
        choices=env_choices,
        help='Specify the name of the environment',
    )
    return arg_parser

def run(input_env):
    """Run 20 episodes of random actions in the chosen Gym environment.

    Each episode starts with a reset and lasts at most 100 steps. On every
    step the environment is rendered, the current observation is printed,
    and one randomly sampled action is applied. When the environment reports
    ``done``, the episode length is printed and the next episode begins.

    Args:
        input_env: Short environment name; must be a key of ``env_ids``
            ('cartpole', 'mountaincar' or 'pendulum').
    """
    env_ids = {
        'cartpole': 'CartPole-v1',
        'mountaincar': 'MountainCar-v0',
        'pendulum': 'Pendulum-v0',
    }
    env = gym.make(env_ids[input_env])

    num_episodes = 20
    max_timesteps = 100

    for _ in range(num_episodes):
        observation = env.reset()

        # Count timesteps from 1 so the value can be reported directly.
        for timestep in range(1, max_timesteps + 1):
            env.render()
            print(observation)

            random_action = env.action_space.sample()
            observation, reward, done, info = env.step(random_action)
            if not done:
                continue

            print('Episode finished after {} timesteps'.format(timestep))
            break

In [10]:
# Run the 'cartpole' environment with the run() defined in the previous cell;
# the observation is printed at every step.
run("cartpole")

[ 0.01614045 -0.03132908 -0.04496663 -0.00101049]
[ 0.01551387 -0.22577827 -0.04498684  0.27715272]
[ 0.01099831 -0.4202305  -0.03944379  0.5553143 ]
[ 0.0025937  -0.22457758 -0.0283375   0.2504697 ]
[-0.00189785 -0.41928366 -0.02332811  0.53408146]
[-0.01028353 -0.6140699  -0.01264648  0.8193235 ]
[-0.02256493 -0.41877717  0.00373999  0.5226898 ]
[-0.03094047 -0.22370805  0.01419379  0.23118778]
[-0.03541463 -0.02879176  0.01881754 -0.05698436]
[-0.03599047 -0.2241784   0.01767786  0.24157573]
[-0.04047403 -0.41954836  0.02250937  0.53978187]
[-0.048865   -0.22474994  0.03330501  0.25427547]
[-0.05336    -0.4203312   0.03839052  0.5572746 ]
[-0.06176662 -0.6159705   0.04953601  0.86180127]
[-0.07408603 -0.8117307   0.06677204  1.169639  ]
[-0.09032065 -0.6175377   0.09016482  0.89861506]
[-0.1026714  -0.81375843  0.10813712  1.2182231 ]
[-0.11894657 -0.62018394  0.13250157  0.9612878 ]
[-0.13135025 -0.42706764  0.15172733  0.71299404]
[-0.1398916  -0.23433565  0.16598722  0.47165427]


 **Q learning** 

In [11]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection

In [12]:
# Reward/link connection graph for 6 states: entry [i, j] is the reward for
# moving from state i to state j. -1 marks a missing link and 0 an ordinary
# link; 100 presumably marks a move into the goal state (column 5).
R = np.matrix(
    [
        [-1, -1, -1, -1,  0,  -1],
        [-1, -1, -1,  0, -1, 100],
        [-1, -1, -1,  0, -1,  -1],
        [-1,  0,  0, -1,  0,  -1],
        [ 0, -1, -1,  0, -1, 100],
        [-1,  0, -1, -1,  0, 100],
    ],
    dtype="float32",
)

# Q-table: same shape and dtype as R, initially all zeros.
Q = np.zeros_like(R)

The -1 entries in the table mean there isn’t a link between the corresponding nodes. For example, State ‘a’ cannot go directly to State ‘b’ (R[0, 1] = -1, taking ‘a’ and ‘b’ to be states 0 and 1).

In [14]:
# Learning parameter: discount factor that update() multiplies with the best
# Q-value of the next state.
gamma = 0.8

# Initialize random state. np.random.randint's upper bound is exclusive, so
# this draws from states 0-3 only.
# NOTE(review): R has 6 states (0-5); confirm that leaving out 4 and 5 is intended.
initial_state = np.random.randint(0, 4)

def available_actions(state):
    """Return the actions that can be taken from ``state``.

    Args:
        state: Row index into the module-level reward matrix ``R``.

    Returns:
        numpy.ndarray: Column indices of row ``state`` in ``R`` whose reward
        is non-negative.
    """
    # R[state,] is a 1 x N matrix, so np.where yields (row_idx, col_idx);
    # only the column indices identify actions.
    return np.where(R[state,] >= 0)[1]

def sample_next_action(available_actions_range):
    """Pick one action at random from the given candidates.

    Args:
        available_actions_range: Non-empty 1-D array (or sequence) of action
            indices, e.g. the result of ``available_actions``.

    Returns:
        int: The chosen action.
    """
    # Sample from the argument itself rather than from a module-level
    # variable, so the function works no matter what the caller named its
    # array. Drawing a single value (no ``size``) gives a scalar, which
    # avoids int() on a 1-element array (deprecated since NumPy 1.25).
    next_action = int(np.random.choice(available_actions_range))
    return next_action

In [15]:
def update(current_state, action, gamma):
    """Apply one Q-learning update to the module-level ``Q`` matrix in place.

    Implements ``Q[s, a] = R[s, a] + gamma * max(Q[a, :])``. The row used for
    the look-ahead is indexed by ``action``, i.e. the action index is also
    the index of the next state.

    Args:
        current_state: Row index of the state the action is taken from.
        action: Column index of the action taken (also the next state).
        gamma: Factor applied to the best Q-value of the next state.
    """
    # Best Q-value reachable from the next state. When several actions tie
    # for the maximum they all hold this same value, so no index needs to be
    # picked (and no 1-element array needs converting with int(), which is
    # deprecated since NumPy 1.25).
    max_value = np.max(Q[action,])

    # Q learning formula
    Q[current_state, action] = R[current_state, action] + gamma * max_value

In [16]:
# Get available actions in the current state
available_act = available_actions(initial_state)
# Sample next action to be performed
action = sample_next_action(available_act)
# Train over 100 iterations, repeating the process above: each iteration
# draws a random state (any row of Q), samples one of its available actions,
# and applies one Q-learning update.
for i in range(100):
    current_state = np.random.randint(0, int(Q.shape[0]))
    available_act = available_actions(current_state)
    action = sample_next_action(available_act)
    update(current_state,action,gamma)
# Normalize the "trained" Q matrix for display so its largest entry becomes
# 100; Q itself is left unchanged.
print ("Trained Q matrix: \n", Q/np.max(Q)*100)

Trained Q matrix: 
 [[  0.           0.           0.           0.          80.00000119
    0.        ]
 [  0.           0.           0.          64.00000453   0.
  100.        ]
 [  0.           0.           0.          64.00000453   0.
    0.        ]
 [  0.          80.00000119  43.55744123   0.          80.00000119
    0.        ]
 [ 64.00000453   0.           0.          64.00000453   0.
  100.        ]
 [  0.          80.00000119   0.           0.          80.00000119
  100.        ]]


In [17]:
# Look up the actions available from the initial state and sample one of them.
available_act = available_actions(initial_state)
action = sample_next_action(available_act)

In [18]:
# Train over 100 iterations, repeating the process above. Q is not
# re-initialised here, so these updates accumulate on whatever Q currently holds.
for i in range(100):
    # Random state, one of its available actions, then one Q-learning update.
    current_state = np.random.randint(0, int(Q.shape[0]))
    available_act = available_actions(current_state)
    action = sample_next_action(available_act)
    update(current_state, action, gamma)

In [19]:
# Normalize the "trained" Q matrix for display so its largest entry becomes
# 100; Q itself is left unchanged.
print("Trained Q matrix: \n", Q/np.max(Q)*100)

Trained Q matrix: 
 [[  0.           0.           0.           0.          78.83652449
    0.        ]
 [  0.           0.           0.          63.06922436   0.
  100.        ]
 [  0.           0.           0.          63.06922436   0.
    0.        ]
 [  0.          66.90068245  45.08885145   0.          78.83652449
    0.        ]
 [ 60.45140624   0.           0.          63.06922436   0.
   98.54565859]
 [  0.          80.00000119   0.           0.          78.83652449
  100.        ]]


In [20]:
# Follow the learned Q matrix greedily from state 2 until state 5 is reached,
# recording every state visited along the way.
# NOTE(review): nothing bounds this loop; if the greedy choices in Q never
# lead to state 5 (e.g. Q is insufficiently trained) it may not terminate.
current_state = 2
steps = [current_state]

while current_state != 5:
    # Column indices holding the largest Q-value in the current state's row.
    next_step_index = np.where(Q[current_state,] == np.max(Q[current_state,]))[1]
    if next_step_index.shape[0] > 1:
        # Several actions tie for the best value: pick one at random.
        # Drawing a scalar (no ``size``) avoids int() on a 1-element array,
        # which is deprecated since NumPy 1.25.
        next_step_index = int(np.random.choice(next_step_index))
    else:
        # Exactly one best action: take the element explicitly instead of
        # converting the 1-element array with int().
        next_step_index = int(next_step_index[0])
    steps.append(next_step_index)
    current_state = next_step_index

In [21]:
# Print the selected sequence of steps: the states visited by the greedy walk,
# beginning with its start state.
print (f"Best sequence path: {steps}")

Best sequence path: [2, 3, 4, 5]
