In [122]:
import gym, time, torch, os, sys, importlib
import matplotlib.pyplot as plt
import numpy as np

from torch import nn, optim

# Put the parent directory on the import path (resolved relative to the
# current working directory) so that `import ppo` below can be found there.
# NOTE(review): presumably ppo.py lives one level above this notebook — confirm.
sys.path.append(os.path.abspath('..')) # Add parent directory to path
# A later cell rebinds the name `ppo` to a PPOWrapper instance; this import
# statement binds it back to the module, so re-running this cell still hands
# a module object to importlib.reload.
import ppo
# Re-execute the module so the `from ppo import ...` lines that follow pick up
# its current source instead of the copy already cached in this kernel.
importlib.reload(ppo) # Prevents caching issues

from ppo import PPONetwork, PPOWrapper
from ppo import parameter_sweeper, pick_parameter_sweep_results, plot_sweep_results

In [123]:
# Environment setup: the constructor arguments live in a dict so the same
# spec can be unpacked into gym.make (and reused elsewhere if needed).
gym_kwargs = dict(id="LunarLander-v2")
env = gym.make(**gym_kwargs)

# Start an episode and show the two values reset() hands back.
state, info = env.reset()
print("State: ", state)
print("Info: ", info)

# Sizes read off the environment's spaces: the first entry of the observation
# shape and the `n` attribute of the action space.
observation_space, action_space = env.observation_space.shape[0], env.action_space.n

print("Observation space: ", observation_space)
print("Action space: ", action_space)

# Take a single step with action index 0 to inspect the five values step() returns.
a = 0
state, reward, done, truncated, info = env.step(a)

print(f"State: {state}")
print(f"Action: {a}, Reward: {reward}, Done: {done}, Truncated: {truncated}, Info: {info}")


State:  [-4.2161942e-04  1.4098197e+00 -4.2727251e-02 -4.8911892e-02
  4.9542030e-04  9.6783694e-03  0.0000000e+00  0.0000000e+00]
Info:  {}
Observation space:  8
Action space:  4
State: [-8.4352493e-04  1.4081416e+00 -4.2669326e-02 -7.4580573e-02
  9.7363547e-04  9.5660295e-03  0.0000000e+00  0.0000000e+00]
Action: 0, Reward: -1.9778205908285145, Done: False, Truncated: False, Info: {}


In [124]:
# Network configuration.
# Input/output sizes are taken from the environment dimensions computed in the
# environment cell.
input_dim, output_dim = observation_space, action_space

# Layer widths handed to PPONetwork.
# NOTE(review): the names suggest shared layers followed by separate policy and
# value branches — confirm against the PPONetwork definition.
hidden_dims = [256, 256, 128]
policy_hidden_dims = [64, 32]
value_hidden_dims = [64, 32]

# Keep the constructor arguments in a dict so the exact configuration can be
# passed around as one object.
network_kwargs = dict(
    input_dim=input_dim,
    output_dim=output_dim,
    hidden_dims=hidden_dims,
    policy_hidden_dims=policy_hidden_dims,
    value_hidden_dims=value_hidden_dims,
)
network = PPONetwork(**network_kwargs)

In [125]:
# PPO hyperparameters. Each variable is forwarded to PPOWrapper under a keyword
# of the same name via `ppo_kwargs` below.
gamma, lam = 0.99, 0.95
clip_epsilon = 0.3
initial_lr, final_lr = 3e-4, 1e-5
value_coef, entropy_coef = 1, 0.01
batch_epochs, batch_size, batch_shuffle = 3, 128, True
checkpointing = True
anneal_clip, anneal_entropy, anneal_lambda = True, False, False

# Collect the settings in one dict so the full configuration travels together.
ppo_kwargs = dict(
    gamma=gamma,
    lam=lam,
    clip_epsilon=clip_epsilon,
    initial_lr=initial_lr,
    final_lr=final_lr,
    value_coef=value_coef,
    entropy_coef=entropy_coef,
    batch_size=batch_size,
    batch_epochs=batch_epochs,
    batch_shuffle=batch_shuffle,
    checkpointing=checkpointing,
    anneal_clip=anneal_clip,
    anneal_entropy=anneal_entropy,
    anneal_lambda=anneal_lambda,
)

# Build the wrapper around the environment and network created in earlier cells.
# NOTE(review): binding the wrapper to `ppo` shadows the `ppo` module imported in
# the first cell; from here on `ppo` refers to this PPOWrapper instance.
ppo = PPOWrapper(env, network, **ppo_kwargs)

# Smoke test: run training with progress display on; the return value is discarded.
_ = ppo.train_model(10000, display=True)

Episode: 0	 avg life: 87.48, eval reward: -118.06, eval ma: -11.81
Episode: 1	 avg life: 94.67, eval reward: -156.26, eval ma: -26.25
Episode: 2	 avg life: 89.75, eval reward: -682.72, eval ma: -91.90
Episode: 3	 avg life: 111.78, eval reward: -902.61, eval ma: -172.97
Episode: 4	 avg life: 106.46, eval reward: -315.49, eval ma: -187.22
Episode: 5	 avg life: 134.95, eval reward: -1551.46, eval ma: -323.65
Episode: 6	 avg life: 132.58, eval reward: -1043.85, eval ma: -395.67
Episode: 7	 avg life: 391.29, eval reward: -2509.13, eval ma: -607.01
Episode: 8	 avg life: 813.00, eval reward: -844.75, eval ma: -630.79
Episode: 9	 avg life: 872.67, eval reward: -380.79, eval ma: -605.79
Episode: 10	 avg life: 628.75, eval reward: -1478.94, eval ma: -693.10
Episode: 11	 avg life: 512.60, eval reward: -414.00, eval ma: -665.19
Episode: 12	 avg life: 727.25, eval reward: -240.43, eval ma: -622.71
Episode: 13	 avg life: 699.50, eval reward: -286.83, eval ma: -589.13
Episode: 14	 avg life: 872.00, e

KeyboardInterrupt: 