<a href="https://colab.research.google.com/github/baronase/ml-residual-pendulum/blob/master/notebooks/pendulum_residual_rl.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip -q install "gymnasium[classic-control]" stable-baselines3 torch numpy matplotlib

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/188.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.0/188.0 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
# --- REPO SETUP --- Only run once
!git clone https://github.com/baronase/ml-residual-pendulum.git

Cloning into 'ml-residual-pendulum'...
remote: Enumerating objects: 32, done.[K
remote: Counting objects: 100% (32/32), done.[K
remote: Compressing objects: 100% (26/26), done.[K
remote: Total 32 (delta 8), reused 16 (delta 1), pack-reused 0 (from 0)[K
Receiving objects: 100% (32/32), 70.33 KiB | 5.41 MiB/s, done.
Resolving deltas: 100% (8/8), done.


In [4]:
# Enter the cloned checkout (relative path: only valid from its parent
# directory) and pull the latest remote changes.
%cd ml-residual-pendulum
!git pull

/content/ml-residual-pendulum
Already up to date.


In [8]:
import sys
# Reinstall the package in editable mode from the current directory, going
# through the kernel's own interpreter (sys.executable) so pip targets the
# same environment the notebook imports from.
!{sys.executable} -m pip uninstall -y pendulum_residual_rl
!{sys.executable} -m pip install -e .


Found existing installation: pendulum_residual_rl 0.1.0
Uninstalling pendulum_residual_rl-0.1.0:
  Successfully uninstalled pendulum_residual_rl-0.1.0
Obtaining file:///content/ml-residual-pendulum
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: pendulum_residual_rl
  Building editable for pendulum_residual_rl (pyproject.toml) ... [?25l[?25hdone
  Created wheel for pendulum_residual_rl: filename=pendulum_residual_rl-0.1.0-0.editable-py3-none-any.whl size=1319 sha256=132ee0c6f22b5ffd5f95fd9267de38e157055d9f5ca5549c9239534d5e37a68e
  Stored in directory: /tmp/pip-ephem-wheel-cache-43en8x8v/wheels/60/f4/4f/d02c1d5f55036909f72c6126ce700a1c121f66bb9d9c603732
Successfully built pendulum_residual_rl
Installing collected packages: pendulum_residual

In [5]:
# Restart the session here (the next cell does it by killing the kernel process).

In [None]:
import os
# Send signal 9 (SIGKILL) to this kernel's own process. The cell never
# finishes normally; the notebook front end is presumably expected to bring
# up a fresh session afterwards (confirm for the runtime in use).
os.kill(os.getpid(), 9)

In [1]:
# Verify that the package imports and show which file it resolves to;
# the recorded output points into the cloned repo's src/ directory.
import pendulum_residual_rl
print("ok", pendulum_residual_rl.__file__)

ok /content/ml-residual-pendulum/src/pendulum_residual_rl/__init__.py


In [2]:
import gymnasium as gym
import numpy as np

env = gym.make("Pendulum-v1")
obs, info = env.reset(seed=0)
# The action space has its own random generator; seed it too so the sampled
# actions (and therefore the printed rewards) are the same on every run.
env.action_space.seed(0)

print("obs:", obs)
print("obs shape:", obs.shape)
print("action space:", env.action_space)
print("obs space:", env.observation_space)

# take a few random steps
for i in range(5):
    action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(action)
    print(i, "action:", action, "reward:", reward)


obs: [ 0.6520163   0.758205   -0.46042657]
obs shape: (3,)
action space: Box(-2.0, 2.0, (1,), float32)
obs space: Box([-1. -1. -8.], [1. 1. 8.], (3,), float32)
0 action: [-1.9198306] reward: -0.7654410586426768
1 action: [-1.4690555] reward: -0.7305575903329619
2 action: [0.4017412] reward: -0.7420679670543086
3 action: [-1.5314977] reward: -0.8741116089702341
4 action: [-0.26675636] reward: -1.0480452584334405


In [3]:
# Move into the repo checkout again (relative path, so this only works when
# the current directory is the checkout's parent).
%cd ml-residual-pendulum/

/content/ml-residual-pendulum


In [3]:
# Sanity check: show the working directory and its contents before the
# %%writefile cell below writes to a path relative to it.
!pwd
!ls

/content/ml-residual-pendulum
notebooks  pyproject.toml  README.md  src


In [7]:
%%writefile src/pendulum_residual_rl/controllers.py
from __future__ import annotations

from dataclasses import dataclass
import math
import numpy as np
#lalala

def obs_to_theta_theta_dot(obs: np.ndarray) -> tuple[float, float]:
    """Recover (theta, theta_dot) from a Pendulum-v1 observation.

    The observation is laid out as [cos(theta), sin(theta), theta_dot]; the
    angle is rebuilt with atan2, so it lies in [-pi, pi].
    """
    angle = math.atan2(float(obs[1]), float(obs[0]))
    return angle, float(obs[2])


def wrap_to_pi(angle: float) -> float:
    """Map an angle in radians onto the half-open interval [-pi, pi).

    Note that +pi itself wraps to -pi.
    """
    full_turn = 2 * math.pi
    shifted = (angle + math.pi) % full_turn
    return shifted - math.pi


@dataclass
class EnergyPDController:
    """
    Baseline controller: energy-shaping swing-up + PD stabilize near upright.

    Conventions:
      - Pendulum-v1 uses theta=0 as upright.
      - Energy is computed from the stored pendulum parameters:
            I = m * l^2 / 3
            E = 0.5 * I * theta_dot^2 + m * g * (l / 2) * (1 + cos(theta))
        so:
            E* (upright, zero velocity) = m * g * l   (10.0 with the defaults)
            E (hanging down at pi, zero velocity) = 0.0

    Action is torque u in [-max_torque, max_torque].

    Constructor arguments:
      kp_in, kd_in:     PD gains used near upright.
      ke_in:            energy-shaping gain used away from upright.
      theta_switch_in:  |theta| (rad) at and beyond which only the energy term acts.
      max_torque_in:    symmetric clip applied to the output torque.
      log_interval_in:  print debug lines every this many calls; a value
                        <= 0 (or None) disables the debug prints.
      mass_in, length_in, gravity_in:
                        pendulum parameters used by the energy computation.

    Note: the class is decorated with @dataclass but defines its own
    __init__, which is the one used for construction.
    """
    # PD gains (used near upright)
    kp: float
    kd: float

    # Energy shaping gain (used far from upright)
    ke: float

    # Switch/blend region (radians); the default 0.5 rad is about 28.6 degrees
    theta_switch: float

    # Output torque limit; __call__ clips to [-max_torque, max_torque]
    max_torque: float

    def __init__(self, kp_in: float = 10.0, kd_in: float = 1.0,
                 ke_in: float = 2.0, theta_switch_in: float = 0.5,
                 max_torque_in: float = 2.0, log_interval_in: int = 10,
                 mass_in: float = 1.0, length_in: float = 1.0,
                 gravity_in: float = 10.0) -> None:
        self.kp = kp_in
        self.kd = kd_in
        self.ke = ke_in
        self.theta_switch = theta_switch_in
        self.max_torque = max_torque_in

        # Debug logging state: call counter and print period.
        self.step_for_print = 0
        self.log_interval = log_interval_in

        # Pendulum parameters used by energy(); the defaults reproduce the
        # previously hard-coded values (m=1, l=1, g=10).
        self.pen_l = length_in
        self.pen_m = mass_in
        self.pen_g = gravity_in
        self.pen_I = self.pen_m * (self.pen_l ** 2) / 3.0

    def energy(self, theta: float, theta_dot: float) -> float:
        """Total energy: 0.5*I*theta_dot^2 + m*g*(l/2)*(1 + cos(theta))."""
        kinetic = 0.5 * self.pen_I * theta_dot ** 2
        potential = self.pen_m * self.pen_g * self.pen_l / 2 * (1.0 + math.cos(theta))
        return kinetic + potential

    def target_energy(self) -> float:
        """Energy of the upright rest state (theta=0, theta_dot=0)."""
        # Derived from energy() so the target can never drift out of sync
        # with the energy model; equals 10.0 for the default parameters.
        return self.energy(0.0, 0.0)

    def u_pd(self, theta: float, theta_dot: float) -> float:
        """PD torque that stabilizes around theta=0."""
        theta = wrap_to_pi(theta)
        return -self.kp * theta - self.kd * theta_dot

    def u_energy(self, theta: float, theta_dot: float) -> float:
        """Energy-shaping torque: pump when below the target energy, brake when above."""
        e = self.energy(theta, theta_dot) - self.target_energy()  # positive => too much energy
        # Push with the swing when energy is low, oppose it when high.
        # The small offset makes the direction +1 when theta_dot is exactly 0.
        direction = math.copysign(1.0, theta_dot + 1e-6)
        return -self.ke * e * direction

    def blend_weight(self, theta: float) -> float:
        """
        Weight for PD vs energy:
          w=1 at upright, w=0 when |theta| >= theta_switch.
        """
        a = abs(wrap_to_pi(theta))
        if a >= self.theta_switch:
            return 0.0
        # smooth-ish ramp (quadratic)
        x = 1.0 - (a / self.theta_switch)
        return x * x

    def _should_log(self) -> bool:
        """True when the current call falls on a logging step."""
        interval = self.log_interval
        # A non-positive (or None) interval disables logging instead of
        # raising ZeroDivisionError in the modulo below.
        if not interval or interval <= 0:
            return False
        return self.step_for_print % interval == 0

    def __call__(self, obs: np.ndarray) -> np.ndarray:
        """Map a Pendulum-v1 observation to a clipped torque of shape (1,), dtype float32."""
        self.step_for_print += 1
        log_now = self._should_log()

        theta, theta_dot = obs_to_theta_theta_dot(obs)
        u_e = self.u_energy(theta, theta_dot)
        u_p = self.u_pd(theta, theta_dot)

        if log_now:
            print(f"step: {self.step_for_print} - x:{obs[0]:.2f}, y:{obs[1]:.2f} | theta:{theta:.2f} theta_dot:{theta_dot:.2f}")

        w = self.blend_weight(theta)
        u = (1.0 - w) * u_e + w * u_p
        if log_now:
            print(f"step: {self.step_for_print} - u_e:{u_e:.2f}, u_p:{u_p:.2f} | w:{w} u:{u:.2f}")
        # clip to env action bounds
        u = float(np.clip(u, -self.max_torque, self.max_torque))
        return np.array([u], dtype=np.float32)


Overwriting src/pendulum_residual_rl/controllers.py


In [3]:
def print_plots(thetas, dots, us, rewards):
  """Show four small line plots, each in its own figure.

  The series are plotted in this order: angle, angular velocity, torque,
  reward. Each argument is passed straight to plt.plot.
  """
  panels = (
      (thetas, "theta(t)", "theta (rad)"),
      (dots, "dot(t)", "angular velocity"),
      (us, "u(t)", "torque"),
      (rewards, "reward(t)", "reward"),
  )
  for series, title, y_label in panels:
    plt.figure(figsize=(2, 2))
    plt.plot(series)
    plt.title(title)
    plt.xlabel("t")
    plt.ylabel(y_label)
    plt.show()


In [4]:
def obs_to_theta_theta_dot(obs: np.ndarray) -> tuple[float, float]:
    """Convert a Pendulum-v1 observation [cos(theta), sin(theta), theta_dot] to (theta, theta_dot).

    theta comes from atan2 and therefore lies in [-pi, pi].
    """
    # NOTE(review): same logic as the helper written to
    # src/pendulum_residual_rl/controllers.py earlier in this notebook; the
    # later `from pendulum_residual_rl.controllers import ...` rebinds this name.
    x, y = float(obs[0]), float(obs[1])
    angular_velocity = float(obs[2])
    return math.atan2(y, x), angular_velocity

class TestCtrl:
  """Probe controller for manual experiments.

  Until the pendulum has been observed (almost) at rest at the bottom
  (|theta| > 3.13 rad and |theta_dot| < 0.01) it applies a small torque of
  magnitude 0.2 opposing the angular velocity. From the first such
  observation onward it always returns a torque of 2.

  Args:
    log_interval: print the observation every this many calls; a value
      <= 0 disables the periodic print.
  """

  def __init__(self, log_interval: int = 10):
    self.hit_bot = False        # latched once the bottom rest state is seen
    self.step_for_print = 0     # number of calls so far
    self.log_interval = log_interval

  def __call__(self, obs: np.ndarray) -> np.ndarray:
    """Return the torque for `obs` as a float32 array of shape (1,)."""
    theta, theta_dot = obs_to_theta_theta_dot(obs)

    self.step_for_print += 1
    if self.log_interval > 0 and self.step_for_print % self.log_interval == 0:
      print(f"step: {self.step_for_print} - x:{obs[0]}, y:{obs[1]} | theta:{theta} theta_dot:{theta_dot}")

    # theta is in [-pi, pi], so |theta| close to pi means hanging down.
    if abs(theta) > 3.13 and abs(theta_dot) < 0.01:
      print(f"HIT BOT NOW: step:{self.step_for_print}")
      self.hit_bot = True

    if self.hit_bot:
      return np.array([2], dtype=np.float32)

    u = -np.sign(theta_dot) * 0.2
    return np.array([u], dtype=np.float32)

In [5]:
# Print the physical parameters of the underlying Pendulum-v1 environment.
env = gym.make("Pendulum-v1")
p = env.unwrapped   # strip TimeLimit / other wrappers to reach the raw env

for attr in ("m", "l", "g", "dt", "max_torque"):
    print(f"{attr} =", getattr(p, attr))

m = 1.0
l = 1.0
g = 10.0
dt = 0.05
max_torque = 2.0


In [6]:
import gymnasium as gym
import numpy as np
import math
import matplotlib.pyplot as plt

from pendulum_residual_rl.controllers import EnergyPDController, obs_to_theta_theta_dot

env = gym.make("Pendulum-v1")

# Which controller the rollouts below use; set to False for the TestCtrl probe.
USE_BASELINE = True

if not USE_BASELINE:
  print("using test control !!")
  ctrl = TestCtrl()
else:
  print("using baseline control !!")
  # Very large log interval: effectively silences the controller's debug prints.
  ctrl = EnergyPDController(log_interval_in=100000)
  print(f"kp:{ctrl.kp}, kd:{ctrl.kd}, ke:{ctrl.ke}, theta_switch:{ctrl.theta_switch}")

def rollout(seed=0, steps=200):
    """Run one episode of the notebook-level `ctrl` on the notebook-level `env`.

    Resets `env` with `seed`, then steps at most `steps` times, stopping early
    when the env reports termination or truncation.

    Returns four numpy arrays with one entry per executed step: theta and
    theta_dot (decoded from the observation returned by that step), the
    applied torque, and the reward.
    """
    angle_log, rate_log, torque_log, reward_log = [], [], [], []
    obs, _ = env.reset(seed=seed)
    for _ in range(steps):
        action = ctrl(obs)
        obs, reward, terminated, truncated, _ = env.step(action)
        angle, rate = obs_to_theta_theta_dot(obs)
        angle_log.append(angle)
        rate_log.append(rate)
        torque_log.append(float(action[0]))
        reward_log.append(reward)
        episode_over = terminated or truncated
        if episode_over:
            break
    return tuple(np.array(log) for log in (angle_log, rate_log, torque_log, reward_log))

# True: print the return for seeds 0..5. False: one seeded run with plots.
RUN_SEED_SWEEP = True

if RUN_SEED_SWEEP:
  print("running a few seeds")
  for i in range(6):
    thetas, dots, us, rewards = rollout(seed=i, steps=200)
    print(f"episode {i} return:", rewards.sum())
else:
  _seed = 3
  print(f"doing a single test run, seed={_seed}")
  thetas, dots, us, rewards = rollout(seed=_seed, steps=200)
  print_plots(thetas, dots, us, rewards)
  print("episode return:", rewards.sum())


using baseline control !!
kp:10.0, kd:1.0, ke:2.0, theta_switch:0.5
running a few seeds
episode 0 return: -623.856084266057
episode 1 return: -625.8786147067308
episode 2 return: -617.6994392791272
episode 3 return: -731.6865538145512
episode 4 return: -739.4429950192113
episode 5 return: -616.8092758256926


In [8]:
# measure - eval over many episodes
import gymnasium as gym
import numpy as np
from pendulum_residual_rl.controllers import EnergyPDController

# Fresh environment and baseline controller for the evaluation below. These
# rebind the notebook-level `env` and `ctrl` that eval_controller reads.
# The very large log interval keeps the controller's periodic debug prints
# from firing during the evaluation.
env = gym.make("Pendulum-v1")
ctrl = EnergyPDController(log_interval_in=1000000)

def eval_controller(n_episodes=50, seed=0, steps=200):
    """Evaluate the notebook-level `ctrl` on the notebook-level `env`.

    Runs `n_episodes` episodes; episode k is reset with seed `seed + k` and
    runs for at most `steps` steps (stopping early on termination or
    truncation).

    Returns (mean, std) of the undiscounted episode returns as Python floats.
    """
    def episode_return(episode_seed):
        # Sum of rewards over a single episode.
        obs, _ = env.reset(seed=episode_seed)
        total = 0.0
        for _ in range(steps):
            obs, reward, terminated, truncated, _ = env.step(ctrl(obs))
            total += reward
            if terminated or truncated:
                break
        return total

    returns = [episode_return(seed + k) for k in range(n_episodes)]
    return float(np.mean(returns)), float(np.std(returns))

# Baseline score: mean and standard deviation of the episode return over
# 50 episodes, reset with seeds 0..49.
mean_r, std_r = eval_controller(n_episodes=50, seed=0)
print("Baseline mean return:", mean_r)
print("Baseline std:", std_r)

Baseline mean return: -637.5807364919718
Baseline std: 102.64590720018695


In [13]:
# Print the first 40 angles of `thetas`.
# NOTE(review): `thetas` is whatever the most recently executed rollout cell
# left in the notebook namespace, so this output depends on execution order.
for i, th in enumerate(thetas[:40]):
  print(f"{i}:{th}")

0:0.8623372145204976
1:0.8978254685056526
2:0.9645098370566759
3:1.0551227925203128
4:1.1633593412750594
5:1.2910261045384288
6:1.4397348130989542
7:1.610621913968565
8:1.8339792904919092
9:2.1085454094662355
10:2.4303189156405267
11:2.791572428076365
12:-3.1025000139048444
13:-2.702500013884929
14:-2.303441957743715
15:-1.9172616745146622
16:-1.5513531005513812
17:-1.2379374441750737
18:-0.9749634849601424
19:-0.7580275744029057
20:-0.5818725405869232
21:-0.44132713389220357
22:-0.3317994633422614
23:-0.2494872358125836
24:-0.18971203194877823
25:-0.1414075421217145
26:-0.09998678806905432
27:-0.06331202356062039
28:-0.03053416457667807
29:-0.0015975837016191084
30:0.023069321004527032
31:0.04349634877228149
32:0.06007453807106058
33:0.07331103264273822
34:0.08371224453962291
35:0.09173195957219274
36:0.09775442002895507
37:0.10209400972702624
38:0.10500181461708553
39:0.10667442392984984


In [14]:
# Print the first 70 per-step rewards of `rewards`.
# NOTE(review): `rewards` is whatever the most recently executed rollout cell
# left in the notebook namespace, so this output depends on execution order.
for i, rw in enumerate(rewards[:70]):
  print(f"{i}:{rw}")

0:-0.761989540932331
1:-0.7442387858394676
2:-0.8565295386319812
3:-1.1089945465054643
4:-1.4457127546539006
5:-1.826010784388556
6:-2.3227004147041397
7:-2.961407625482968
8:-3.766199093792122
9:-5.36302066184012
10:-7.465426110551326
11:-10.051978044557586
12:-13.017039931399246
13:-15.685859344542948
14:-13.707506325902926
15:-11.679738292843197
16:-9.645300670115189
17:-7.766259779711904
18:-5.465664155496559
19:-3.7207658881863535
20:-2.461053288967432
21:-1.5837992449660607
22:-0.988890205516453
23:-0.593943330539573
24:-0.3363903974537531
25:-0.17925766306633495
26:-0.11337450315774865
27:-0.07864236461372345
28:-0.05785126213715515
29:-0.04403711327682746
30:-0.033810642195259664
31:-0.025333731104266937
32:-0.019116231568230762
33:-0.015158550740924768
34:-0.012936637110720642
35:-0.01187626076754629
36:-0.011512019224555584
37:-0.011514212140523986
38:-0.0116671513131882
39:-0.011837949248458204
40:-0.011949556093699921
41:-0.011960990583066504
42:-0.011865363329239188
43:-0.