In [None]:
# Cell 1: wrappers, callback, train & eval functions

import gymnasium as gym
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize, VecMonitor
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.logger import TensorBoardOutputFormat
from stable_baselines3.common.utils import get_linear_fn
from statistics import mean, stdev

# Random-start wrapper
class RandomStartWrapper(gym.Wrapper):
    """Shift the lander horizontally by a random offset at every reset.

    The offset is drawn uniformly from [-0.4, 0.4] using the environment's
    own ``np_random`` generator and is applied to the lander body position
    of the unwrapped environment.
    """

    def reset(self, **kwargs):
        """Reset the wrapped env, move the lander, and patch the observation.

        Args:
            **kwargs: Forwarded unchanged to the wrapped env's ``reset``.

        Returns:
            ``(obs, info)`` from the wrapped env, where ``obs`` is a copy whose
            first entry has been adjusted to reflect the shifted x-position.
        """
        # Local import keeps this class self-contained; these are the scaling
        # constants Gymnasium's LunarLander uses to build its observation.
        from gymnasium.envs.box2d.lunar_lander import SCALE, VIEWPORT_W

        obs, info = super().reset(**kwargs)
        shift = self.np_random.uniform(-0.4, 0.4)

        lander = self.unwrapped.lander
        lander.position = (lander.position.x + shift, lander.position.y)
        # NOTE(review): only the hull is moved; if the env attaches legs to the
        # hull with joints they are left behind for one physics step — confirm
        # whether they should be shifted as well.

        # The shift above is in world units, but LunarLander reports x as
        # (pos.x - W/2) / (W/2) with W = VIEWPORT_W / SCALE, so the observation
        # must be adjusted by the *normalized* shift, not the raw one.
        obs = obs.copy()
        obs[0] += shift / (VIEWPORT_W / SCALE / 2)
        return obs, info

# LunarWrapper for fuel & landing error
class LunarWrapper(gym.Wrapper):
    """Attach per-step fuel and landing-error metrics to the ``info`` dict."""

    def reset(self, **kwargs):
        """Delegate to the wrapped environment's ``reset`` unchanged."""
        return super().reset(**kwargs)

    def step(self, action):
        """Step the wrapped env and annotate ``info``.

        Adds two keys to ``info``:
          * ``"fuel_used"``: 1.0 when ``action == 2``, otherwise 0.0.
          * ``"landing_err"``: Euclidean norm of the first two observation
            entries.

        Returns:
            The wrapped env's ``(obs, reward, terminated, truncated, info)``.
        """
        observation, reward, terminated, truncated, info = super().step(action)
        fired = action == 2
        info["fuel_used"] = 1.0 if fired else 0.0
        info["landing_err"] = np.linalg.norm(observation[:2])
        return observation, reward, terminated, truncated, info

# Callback that logs per-episode fuel use and final landing error
class MetricsCallback(BaseCallback):
    """Log episode-level fuel and landing-error metrics during training.

    ``info["fuel_used"]`` is a per-step value, so it is accumulated per
    environment and the episode total is logged when the episode ends (i.e.
    when the ``"episode"`` key is present in ``info``). Values are logged with
    ``record_mean`` so that every episode finishing between two logger dumps
    contributes, instead of only the last one.
    """

    def __init__(self, verbose: int = 0):
        super().__init__(verbose)
        # Running fuel total for the current episode of each sub-environment;
        # allocated lazily once the number of environments is known.
        self._episode_fuel = None

    def _on_step(self) -> bool:
        """Accumulate fuel for each env and log totals for finished episodes.

        Returns:
            Always ``True`` so training continues.
        """
        infos = self.locals["infos"]
        if self._episode_fuel is None or len(self._episode_fuel) != len(infos):
            self._episode_fuel = np.zeros(len(infos))

        for env_idx, info in enumerate(infos):
            self._episode_fuel[env_idx] += info.get("fuel_used", 0.0)
            if "episode" in info:
                self.logger.record_mean(
                    "custom/fuel_used", float(self._episode_fuel[env_idx])
                )
                if "landing_err" in info:
                    # The terminal step's error is the landing error proper.
                    self.logger.record_mean(
                        "custom/landing_err", float(info["landing_err"])
                    )
                self._episode_fuel[env_idx] = 0.0
        return True

def make_env():
    """Create a LunarLander-v3 env wrapped for random starts and metrics.

    Returns:
        The base environment wrapped first in ``RandomStartWrapper`` and then
        in ``LunarWrapper``.
    """
    base_env = gym.make("LunarLander-v3")
    return LunarWrapper(RandomStartWrapper(base_env))

def train_one(ent_coef, timesteps=200_000, seed=None):
    """Train a PPO agent on the wrapped LunarLander env.

    Args:
        ent_coef: Entropy coefficient passed to PPO.
        timesteps: Total number of environment steps to train for.
        seed: Optional seed forwarded to PPO so a run can be reproduced.
            ``None`` (the default) leaves the run unseeded.

    Returns:
        ``(model, env)``: the trained PPO model and the ``VecNormalize``
        environment it was trained on (which holds the normalization
        statistics the policy expects).
    """
    # Single env -> episode monitor -> observation/reward normalization.
    env = VecNormalize(
        VecMonitor(DummyVecEnv([make_env])),
        norm_obs=True,
        norm_reward=True,
        clip_obs=10.0,
    )
    model = PPO(
        policy="MlpPolicy",
        env=env,
        verbose=0,
        n_steps=2048,
        batch_size=64,
        # NOTE(review): with end_fraction=0.1 the schedule appears to reach
        # 1e-5 after the first 10% of training and stay there — confirm that
        # this is the intended decay horizon.
        learning_rate=get_linear_fn(5e-4, 1e-5, 0.1),
        clip_range=0.2,
        ent_coef=ent_coef,
        tensorboard_log="./ppo_lunar_tb",
        seed=seed,
    )
    model.learn(total_timesteps=timesteps, callback=MetricsCallback())
    return model, env

def evaluate(model, env, n_episodes=20):
    """Run deterministic evaluation episodes and average reward and fuel.

    If ``env`` exposes ``training`` / ``norm_reward`` attributes (as the
    ``VecNormalize`` env returned by ``train_one`` does), both are switched
    off for the duration of the evaluation and restored afterwards, so the
    result does not depend on the caller having frozen normalization first.
    This assumes the normalizing wrapper is the outermost one.

    Args:
        model: Object with ``predict(obs, deterministic=True)``.
        env: Vectorized env; only the first sub-environment is evaluated.
        n_episodes: Number of episodes to run (must be >= 1).

    Returns:
        ``(mean_reward, mean_fuel)`` over the evaluated episodes, where fuel
        is the per-episode sum of ``info["fuel_used"]``.

    Raises:
        ValueError: If ``n_episodes`` is smaller than 1.
    """
    if n_episodes < 1:
        raise ValueError("n_episodes must be at least 1")

    # Freeze normalization statistics and report raw rewards while evaluating.
    saved_flags = {
        flag: getattr(env, flag)
        for flag in ("training", "norm_reward")
        if hasattr(env, flag)
    }
    for flag in saved_flags:
        setattr(env, flag, False)

    rewards, fuels = [], []
    try:
        for _ in range(n_episodes):
            obs = env.reset()
            done = False
            r_sum = 0.0
            f_sum = 0.0
            while not done:
                action, _state = model.predict(obs, deterministic=True)
                obs, reward, dones, infos = env.step(action)
                # Plain floats keep the statistics independent of array dtype.
                r_sum += float(reward[0])
                f_sum += float(infos[0]["fuel_used"])
                done = bool(dones[0])
            rewards.append(r_sum)
            fuels.append(f_sum)
    finally:
        for flag, value in saved_flags.items():
            setattr(env, flag, value)
    return mean(rewards), mean(fuels)


I define two wrappers—one that randomizes the starting x-position each episode and one that records per-step fuel use and landing error in `info`—plus a callback that logs those metrics to TensorBoard, and the helper functions `train_one` and `evaluate`. This makes it easy to train PPO with different entropy coefficients.

In [None]:
# Cell 2: entropy sweep (and dump CSV)

import pandas as pd    # ← new

ent_values = [0.0, 0.005, 0.01, 0.02, 0.05]
results = []

for ent in ent_values:
    print(f"\n=== Training with ent_coef = {ent} ===")
    model, env = train_one(ent, timesteps=200_000)
    try:
        # Freeze the running normalization statistics and report raw rewards
        # so the evaluation numbers are comparable across runs.
        env.training = False
        env.norm_reward = False
        mean_r, mean_f = evaluate(model, env, n_episodes=20)
    finally:
        # Each sweep iteration builds a fresh env; release it before the next.
        env.close()
    print(f"ent_coef={ent} → mean reward {mean_r:.1f}, mean fuel {mean_f:.1f}")
    results.append((ent, mean_r, mean_f))

# pick best by highest reward
best = max(results, key=lambda x: x[1])
print(f"\nBest ent_coef: {best[0]} with reward {best[1]:.1f}, fuel {best[2]:.1f}")
best_ent = best[0]

# write out the sweep metrics for plotting
df = pd.DataFrame(results, columns=["ent_coef","mean_reward","mean_fuel"])
df.to_csv("06_entropy_sweep_metrics.csv", index=False)
print("→ saved sweep metrics to 06_entropy_sweep_metrics.csv")



=== Training with ent_coef = 0.0 ===
ent_coef=0.0 → mean reward -79.1, mean fuel 573.2

=== Training with ent_coef = 0.005 ===
ent_coef=0.005 → mean reward -98.0, mean fuel 571.0

=== Training with ent_coef = 0.01 ===
ent_coef=0.01 → mean reward -28.3, mean fuel 571.9

=== Training with ent_coef = 0.02 ===
ent_coef=0.02 → mean reward -42.9, mean fuel 574.5

=== Training with ent_coef = 0.05 ===
ent_coef=0.05 → mean reward -42.4, mean fuel 574.8

Best ent_coef: 0.01 with reward -28.3, fuel 571.9
→ saved sweep metrics to 06_entropy_sweep_metrics.csv


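Loop over the `ent_coef` values, train each for 200k steps, freeze normalization, and evaluate 20 episodes apiece. I then pick the coefficient that gave the highest mean reward. Note that each setting is trained only once, with no fixed seed, so differences between settings may be within run-to-run noise.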

In [16]:
# Cell 3: playback & final eval of best ent_coef

import time
from stable_baselines3.common.vec_env import VecNormalize, VecMonitor

VECNORM_PATH = "ppo_vecnormalize.pkl"
PLAYBACK_MAX_STEPS = 300

print(f"Re-training final model with ent_coef = {best_ent}")
final_model, final_env = train_one(best_ent, timesteps=1_500_000)

# Write the normalization statistics of the env the final model was trained
# on, so the playback env below loads statistics that match this policy
# rather than whatever file an earlier session may have left on disk.
final_env.save(VECNORM_PATH)

# playback
def make_play_env():
    """Same wrapper stack as ``make_env`` but rendering to a window."""
    e = gym.make("LunarLander-v3", render_mode="human")
    e = RandomStartWrapper(e)
    e = LunarWrapper(e)
    return e

play_env = DummyVecEnv([make_play_env])
play_env = VecMonitor(play_env)
play_env = VecNormalize.load(VECNORM_PATH, play_env)
play_env.training=False; play_env.norm_reward=False

try:
    obs = play_env.reset()
    for _ in range(PLAYBACK_MAX_STEPS):
        action, _state = final_model.predict(obs, deterministic=True)
        obs, _reward, dones, _infos = play_env.step(action)
        play_env.render()
        time.sleep(0.02)
        if dones[0]:
            break
finally:
    # Always release the render window, even if playback raises.
    play_env.close()

# final evaluation
final_env.training=False; final_env.norm_reward=False
fr, ff = evaluate(final_model, final_env, n_episodes=50)
print(f"Final performance: reward {fr:.1f}, fuel {ff:.1f}")


Re-training final model with ent_coef = 0.01
Final performance: reward 236.9, fuel 106.0


Take the best entropy coefficient, retrain PPO for 1.5 M steps, watch it land in human-render mode, and finally run 50 evaluation episodes to print out the mean reward and fuel usage.