In [None]:
# Cell 1: define fixed‑start env & rule agent

import gymnasium as gym
import numpy as np
import pandas as pd
from statistics import mean, stdev

class LunarWrapper(gym.Wrapper):
    """Wrapper that adds a 'fuel_used' entry to the info dict of every step.

    The entry is 1.0 when the action taken is 2 (the main engine) and 0.0
    for every other action.
    """

    def reset(self, **kwargs):
        """Reset the wrapped env and pass its (observation, info) pair through."""
        observation, info = super().reset(**kwargs)
        return observation, info

    def step(self, action):
        """Step the wrapped env, then record main-engine usage in info."""
        observation, reward, terminated, truncated, info = super().step(action)
        if action == 2:
            info["fuel_used"] = 1.0
        else:
            info["fuel_used"] = 0.0
        return observation, reward, terminated, truncated, info

def make_rule_env():
    """
    Create the environment the rule-based agent is evaluated in.

    Pipeline:
      1) stock LunarLander-v3 from gym.make (no x-shift wrapper is applied)
      2) LunarWrapper on top, for fuel tracking

    Returns the wrapped environment.
    """
    return LunarWrapper(gym.make("LunarLander-v3"))

def rule_action(obs):
    """
    Pick a discrete action from an 8-element observation using fixed rules.

    Rules are checked in priority order; the first one that matches wins:
      1) vertical:   return 2 when falling faster than 0.2, or when below
                     height 0.1 with vertical speed still above 0.05
      2) horizontal: return 1 for rightward drift above 0.1, 3 for leftward
      3) tilt:       return 3 for an angle above 0.05, 1 for one below -0.05
      4) otherwise:  return 0 (no engine)
    """
    # Only height, the two velocities and the angle drive the rules; the
    # remaining entries are unpacked so the observation must have 8 items.
    _x, height, vel_x, vel_y, tilt, _tilt_rate, _leg_a, _leg_b = obs

    # vertical control
    falling_fast = vel_y < -0.2
    low_and_moving = height < 0.1 and abs(vel_y) > 0.05
    if falling_fast or low_and_moving:
        return 2

    # horizontal damping: the sign of the drift selects the side engine
    if abs(vel_x) > 0.1:
        return 1 if vel_x > 0 else 3

    # angle stabilization: the sign of the tilt selects the side engine
    if abs(tilt) > 0.05:
        return 3 if tilt > 0 else 1

    return 0


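This cell defines the evaluation environment and a minimal “rule‑based” agent for LunarLander‑v3. LunarWrapper records main‑engine use in info['fuel_used'], make_rule_env() builds the wrapped environment, and rule_action(obs) implements a simple flight controller whose rules are checked in this order: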

If the craft is descending too fast (or is close to the ground and still moving vertically), fire the main engine.

If it’s drifting horizontally, fire the appropriate side thruster.

If it’s leaning, correct the tilt.

Otherwise, do nothing.

In [None]:
# Cell 2: run 10 000 episodes and collect metrics

SEED = 0              # fixed RNG seed so a fresh "Restart & Run All" reproduces the metrics
n_episodes = 10_000

rewards = []          # total reward of each episode
fuels = []            # number of main-engine steps in each episode
successes = 0         # episodes whose total reward reached 200

env = make_rule_env()
try:
    for i in range(n_episodes):
        # Seed only the first reset: it initialises the environment's RNG, and
        # later unseeded resets keep drawing from that same stream, so episodes
        # still differ from one another while the whole run stays repeatable.
        if i == 0:
            obs, info = env.reset(seed=SEED)
        else:
            obs, info = env.reset()
        done = False
        total_reward = 0.0
        total_fuel = 0.0

        while not done:
            action = rule_action(obs)
            obs, reward, terminated, truncated, info = env.step(action)
            total_reward += reward
            # .get() tolerates an env that does not annotate fuel usage
            total_fuel += info.get("fuel_used", 0.0)
            # an episode ends on either a terminal state or a time-limit truncation
            done = terminated or truncated

        rewards.append(total_reward)
        fuels.append(total_fuel)
        if total_reward >= 200:
            successes += 1
finally:
    # Release the simulator even if an episode raises or the cell is interrupted.
    env.close()

print(f"Finished {n_episodes} episodes.")




Finished 10000 episodes.


Here I run 10 000 episodes of the rule‑based policy in the environment built by make_rule_env() (the stock LunarLander‑v3 start, with no x shift applied). For each episode I accumulate:
total_reward: the sum of Gym’s step rewards,
total_fuel: the number of main‑engine firings,
successes: how many episodes achieved ≥ 200 points (the “solved” threshold).

In [None]:
# Cell 3: report & save mean reward, mean fuel, success rate

# Collapse the per-episode lists into the three headline numbers.
mean_r, mean_f = mean(rewards), mean(fuels)
succ_pct = successes / n_episodes  # a fraction; the "%" format spec below scales it

print(
    f"Mean reward:    {mean_r:.1f}",
    f"Mean fuel used: {mean_f:.1f}",
    f"Success rate:   {successes}/{n_episodes} = {succ_pct:.0%}",
    sep="\n",
)

# save to CSV for plotting later (a single-row table, one column per metric)
metrics_csv = "01_rule_based_baseline_metrics.csv"
df = pd.DataFrame(
    {
        "experiment":   ["rule_based_baseline_fixed"],
        "mean_reward":  [mean_r],
        "mean_fuel":    [mean_f],
        "success_rate": [succ_pct],
    }
)
df.to_csv(metrics_csv, index=False)
print("→ metrics written to 01_rule_based_baseline_metrics.csv")


Mean reward:    -486.6
Mean fuel used: 114.4
Success rate:   874/10000 = 9%
→ metrics written to 01_rule_based_baseline_metrics.csv


This final cell simply reports the key summary metrics over the 10 000 episodes and saves them to a CSV file:
Mean reward shows overall performance.
Mean fuel used indicates efficiency.
Success rate is the fraction of landings scoring ≥ 200.