In [3]:
import os
os.environ["SDL_VIDEODRIVER"] = "dummy"

import gymnasium as gym 
env = gym.make('LunarLander-v2',render_mode="human") 

env.reset()
# sample action:
# sample action taking values from 0, 1, 2, or 3. 
# 0 means do nothing, 1 means fire the left engine, 2 means fire the bottom engine, and 3 means fire the right engine
print("sample action:", env.action_space.sample())

# observation space shape:
print("observation space shape:", env.observation_space.shape)

# s[0] is the horizontal coordinate
# s[1] is the vertical coordinate
# s[2] is the horizontal speed
# s[3] is the vertical speed
# s[4] is the angle
# s[5] is the angular speed
# s[6] 1 if first leg has contact, else 0
# s[7] 1 if second leg has contact, else 0
# sample observation:
print("sample observation:", env.observation_space.sample())
env.close()

sample action: 2
observation space shape: (8,)
sample observation: [-57.407875    6.028112   -4.434457    4.4309177   2.754967   -1.5039306
   0.576899    0.7412092]


# random strategy

In [14]:
env = gym.make('LunarLander-v2', render_mode='rgb_array')  # render_mode="human" for vis
env.reset()

for step in range(200):
    env.render()
    # take random action
    action = env.action_space.sample()
    observation, reward, terminated, truncated, info = env.step(action)
    if step % 50 == 0:
        print('observation:', observation[:2], 'reward:', reward, 'terminated:', terminated)
env.close()

observation: [8.3894731e-04 1.3945715e+00] reward: -1.7473469618262243 terminated: False
observation: [0.03988714 0.50022155] reward: 4.268718646314238 terminated: False
observation: [ 0.09345112 -0.0274681 ] reward: -100 terminated: True
observation: [ 9.820118e-02 -3.387928e-06] reward: -100 terminated: True


# Actor-Critic Strategy


In [16]:
from stable_baselines3 import A2C

env = gym.make('LunarLander-v2', render_mode='rgb_array')  # render_mode="human" for vis
env.reset()
model = A2C('MlpPolicy', env, verbose=1)
learning_steps = 100
model.learn(total_timesteps=learning_steps)
# number of times to play
episodes = 2
# for visualizing the game play
vec_env = model.get_env()
obs = vec_env.reset()

for episode in range(episodes):
    terminated = False
    while not terminated:
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, terminated, info = vec_env.step(action)
        vec_env.render()
        print('obs:', obs[:2], 'reward:', reward, 'terminated:', terminated)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
obs: [[ 0.00722437  1.4013331   0.3581387  -0.22597626 -0.00595818 -0.03447321
   0.          0.        ]] reward: [0.08804659] terminated: [False]
obs: [[ 0.01069546  1.3956375   0.34647617 -0.2531278  -0.00534189  0.01232708
   0.          0.        ]] reward: [0.03701944] terminated: [False]
obs: [[ 0.01407347  1.3893523   0.33482566 -0.27933043 -0.00238952  0.05905287
   0.          0.        ]] reward: [0.19553111] terminated: [False]
obs: [[ 0.0173584   1.3824661   0.32313198 -0.30605882  0.00290483  0.10589688
   0.          0.        ]] reward: [-0.2992267] terminated: [False]
obs: [[ 0.0205575   1.3749764   0.31238192 -0.33290085  0.0103502   0.14892116
   0.          0.        ]] reward: [-1.1746078] terminated: [False]
obs: [[ 0.02367172  1.3668976   0.30174458 -0.35916033  0.01992056  0.1914246
   0.          0.        ]] reward: [-1.4418918] terminated: [False]
obs: [[ 0.02672043