In [1]:
import gymnasium as gym

In [2]:
# Create the CartPole-v1 environment - a cart with a pole that needs balancing.
# Nothing is trained in this cell; it only inspects the initial observation.
env = gym.make("CartPole-v1", render_mode="human")

# Reset the environment to start a new episode. reset() returns a pair:
observation, info = env.reset()
# observation: what the agent can "see" - cart position, velocity, pole angle, etc.
# info: extra debugging information (usually not needed for basic learning)

# No seed is passed to reset() above, so expect different numbers on each run.
print(f"Starting observation: {observation}")
# Example output: [ 0.01234567 -0.00987654  0.02345678  0.01456789]
# Layout:         [cart_position, cart_velocity, pole_angle, pole_angular_velocity]

Starting observation: [-0.03998471 -0.00902703  0.03400657 -0.01990803]


# Physics Model: CartPole-v1 Dynamics

To predict the next state of the environment, we implement the same **Equations of Motion** used in the Gymnasium (formerly OpenAI Gym) `CartPole-v1` source code. This lets us perform a one-step "lookahead": simulate each of the two possible actions and check which one yields the better outcome.

### 1. System Constants
The environment uses fixed physical properties to simulate the dynamics:
* **Gravity ($g$):** $9.8 \, m/s^2$
* **Mass of Cart ($M_c$):** $1.0 \, kg$
* **Mass of Pole ($M_p$):** $0.1 \, kg$
* **Total Mass ($M$):** $M_c + M_p = 1.1 \, kg$
* **Pole Half-Length ($L$):** $0.5 \, m$ (Distance to center of mass)
* **Force Magnitude ($F$):** $10.0 \, N$
* **Time Step ($\tau$):** $0.02 \, s$

### 2. State Variables
The system is defined by a 4-dimensional state vector:
1.  $x$: Cart Position ($0$ = Center)
2.  $\dot{x}$: Cart Velocity
3.  $\theta$: Pole Angle ($0$ = Upright, Radians)
4.  $\dot{\theta}$: Pole Angular Velocity

### 3. Equations of Motion
The accelerations below follow from Newtonian mechanics; we state the resulting formulas here rather than deriving them step by step.

**A. Intermediate Helper Term**
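First, we calculate a temporary term that combines the applied force $F_{app}$ ($+F$ when pushing right, $-F$ when pushing left) with the centrifugal term of the rotating pole, divided by the total mass $M$. Because of that division, the term has units of acceleration rather than force: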
$$\text{temp} = \frac{F_{app} + M_p L \dot{\theta}^2 \sin\theta}{M}$$

**B. Angular Acceleration ($\ddot{\theta}$)**
This determines how the pole's rotation changes. It balances gravity against the cart's acceleration:
$$\ddot{\theta} = \frac{g \sin\theta - \cos\theta \cdot \text{temp}}{L \left( \frac{4}{3} - \frac{M_p \cos^2\theta}{M} \right)}$$

**C. Linear Acceleration ($\ddot{x}$)**
This determines how the cart's velocity changes:
$$\ddot{x} = \text{temp} - \frac{M_p L \ddot{\theta} \cos\theta}{M}$$

### 4. Euler Integration (Next State Update)
The simulator assumes discrete time steps. We calculate the next state ($t+1$) by adding the change over time $\tau$:

* **New Position:** $x_{t+1} = x_t + \tau \cdot \dot{x}_t$
* **New Velocity:** $\dot{x}_{t+1} = \dot{x}_t + \tau \cdot \ddot{x}_t$
* **New Angle:** $\theta_{t+1} = \theta_t + \tau \cdot \dot{\theta}_t$
* **New Angular Vel:** $\dot{\theta}_{t+1} = \dot{\theta}_t + \tau \cdot \ddot{\theta}_t$

---
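*Note: The factor $\frac{4}{3}$ in the angular acceleration equation accounts for the moment of inertia of a uniform rod rotating about one end. For a rod of full length $2L$ this is $\frac{1}{3} M_p (2L)^2 = \frac{4}{3} M_p L^2$, which is why the factor appears together with the half-length $L$.*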

In [4]:
import math

# 1. Physical parameters of the cart-pole system (values from the CartPole-v1 source)
GRAVITY = 9.8        # gravitational acceleration, m/s^2
MASSCART = 1.0       # mass of the cart, kg
MASSPOLE = 0.1       # mass of the pole, kg
TOTAL_MASS = MASSCART + MASSPOLE
LENGTH = 0.5         # HALF of the pole's length (pivot to centre of mass), m
POLEMASS_LENGTH = MASSPOLE * LENGTH
FORCE_MAG = 10.0     # magnitude of the push applied by either action, N
TAU = 0.02           # time between state updates, s

def get_next_state(state, action):
    """
    Predict the cart-pole state one time step (TAU seconds) ahead.

    Evaluates the equations of motion for the push implied by `action`
    and advances the state with a single explicit Euler step.

    Args:
        state: four values unpacked as (x, x_dot, theta, theta_dot)
        action: 1 pushes with +FORCE_MAG (right); any other value,
            normally 0, pushes with -FORCE_MAG (left)
    Returns:
        Tuple (x, x_dot, theta, theta_dot) for the predicted next state.
    """
    x, x_dot, theta, theta_dot = state

    # Turn the discrete action into a signed push along the track.
    if action == 1:
        force = FORCE_MAG
    else:
        force = -FORCE_MAG

    sin_t = math.sin(theta)
    cos_t = math.cos(theta)

    # Applied push plus the theta_dot**2 term of the swinging pole,
    # divided by the total mass of cart and pole.
    temp = (force + POLEMASS_LENGTH * theta_dot**2 * sin_t) / TOTAL_MASS

    # Angular acceleration of the pole (theta_double_dot).
    pole_denominator = LENGTH * (4.0/3.0 - MASSPOLE * cos_t**2 / TOTAL_MASS)
    theta_acc = (GRAVITY * sin_t - cos_t * temp) / pole_denominator

    # Linear acceleration of the cart (x_double_dot).
    x_acc = temp - POLEMASS_LENGTH * theta_acc * cos_t / TOTAL_MASS

    # Explicit Euler step: positions advance with the current velocities,
    # velocities advance with the accelerations computed above.
    return (
        x + TAU * x_dot,
        x_dot + TAU * x_acc,
        theta + TAU * theta_dot,
        theta_dot + TAU * theta_acc,
    )

def calculate_cost(state):
    """
    Score a state by how far it is from "pole upright, cart centred".

    The cost is a weighted sum of absolute values; a lower cost means a
    better state. The cart velocity (second entry) is not used.

    Args:
        state: four values unpacked as (x, x_dot, theta, theta_dot)
    Returns:
        10.0 * |theta| + 1.0 * |theta_dot| + 0.1 * |x|
    """
    x, _unused_x_dot, theta, theta_dot = state

    # (quantity, weight) pairs. The angle dominates because tipping over
    # ends the episode; drifting from the centre only matters slightly.
    weighted_terms = (
        (theta, 10.0),      # tipping away from upright
        (theta_dot, 1.0),   # swinging fast
        (x, 0.1),           # drifting away from the centre of the track
    )
    tilt, swing, drift = (abs(value) * weight for value, weight in weighted_terms)

    # Added left to right, in the same order as the terms are listed.
    return tilt + swing + drift

def choose_best_action(current_state):
    """
    Pick the action whose predicted next state has the lower cost.

    Both actions are simulated one step ahead with get_next_state and
    scored with calculate_cost.

    Args:
        current_state: four values (x, x_dot, theta, theta_dot)
    Returns:
        0 (left) only when left is strictly cheaper; otherwise 1 (right),
        so ties go to the right.
    """
    # Predicted cost per action, indexed by action id: [left, right].
    predicted_cost = [
        calculate_cost(get_next_state(current_state, candidate))
        for candidate in (0, 1)
    ]

    # Strict comparison on purpose: equal costs fall through to "right".
    return 0 if predicted_cost[0] < predicted_cost[1] else 1

In [6]:
import gymnasium as gym
# Play one episode with the physics-based one-step lookahead controller.

# An earlier cell may have left an environment bound to `env`. Close it
# before rebinding the name so its resources are released instead of leaked.
# close() is safe to call on an environment that is already closed.
if "env" in globals():
    env.close()

# Create the environment - a cart with a pole that needs balancing
env = gym.make("CartPole-v1", render_mode="human")

# Start a new episode; the info dict returned by reset() is not needed here.
state, _ = env.reset()
episode_over = False
total_reward = 0

while not episode_over:
    # Use our physics model to pick the move
    action = choose_best_action(state)

    # Step the real environment
    state, reward, terminated, truncated, info = env.step(action)

    # reward: +1 for each step the pole stays upright
    # terminated: True if pole falls too far (agent failed)
    # truncated: True if we hit the time limit (500 steps)

    total_reward += reward
    episode_over = terminated or truncated

print(f"Episode finished with total reward: {total_reward}")

# `env` is intentionally left open here: the random-policy cell that follows
# keeps using this environment and closes it when it is done.

Episode finished with total reward: 318.0


In [3]:
# Baseline for comparison: play one episode with purely random actions.

# Begin a fresh episode first. `env` may already have finished an episode in
# an earlier cell, and step() must not be called on a finished episode until
# reset() has been called again.
observation, info = env.reset()
episode_over = False
total_reward = 0

try:
    while not episode_over:
        # Choose an action: 0 = push cart left, 1 = push cart right
        action = env.action_space.sample()  # Random action - no strategy at all

        # Take the action and see what happens
        observation, reward, terminated, truncated, info = env.step(action)

        # reward: +1 for each step the pole stays upright
        # terminated: True if pole falls too far (agent failed)
        # truncated: True if we hit the time limit (500 steps)

        total_reward += reward
        episode_over = terminated or truncated

    print(f"Episode finished! Total reward: {total_reward}")
finally:
    # Release the environment even if the loop is interrupted or raises.
    env.close()

Episode finished! Total reward: 51.0
