In [46]:
#%pip install gymnasium
import gymnasium as gym
import numpy as np
from collections import defaultdict

In [33]:
env = gym.make("Pendulum-v1", render_mode="rgb_array", g=9.81)

In [34]:
def random_policy(env, state):
    return env.action_space.sample()

env = gym.make("Pendulum-v1", render_mode="rgb_array", g=9.81)

state = env.reset()[0]

for counter in range(201):

    #select the action according to the given policy
    action = random_policy(env, state)
    
    #perform the action and store the next state information
    next_state, reward, done, truncated, info = env.step(action)

    if counter % 10 == 0:
        print("Step:", counter, next_state, reward, done, info)
env.close()

Step: 0 [ 0.75242746 -0.65867513 -0.46360493] -0.48978954273343817 False {}
Step: 10 [-0.8847307  -0.46610254 -6.796938  ] -9.323188680536553 False {}
Step: 20 [ 0.6073396  0.7944423 -2.2150254] -2.0055754034538116 False {}
Step: 30 [-0.01677797  0.9998592   4.404816  ] -3.268810147956989 False {}
Step: 40 [ 0.01998746 -0.9998002   5.0815344 ] -6.339197514958356 False {}
Step: 50 [ 0.6868957 -0.726756  -1.3554863] -0.5943318816335034 False {}
Step: 60 [-0.99982435 -0.01874319 -7.224026  ] -12.198607384470899 False {}
Step: 70 [ 0.86809486  0.4963983  -2.5912557 ] -1.384128311746977 False {}
Step: 80 [ 0.9651835  -0.26157373 -1.7735304 ] -0.23289192199799527 False {}
Step: 90 [-0.7250146  -0.68873346 -6.7728553 ] -8.128964560944365 False {}
Step: 100 [ 0.6317218  0.7751952 -3.7648644] -2.8954824388937346 False {}
Step: 110 [ 0.980629   -0.19587427 -2.175134  ] -0.4168845445888698 False {}
Step: 120 [-0.81639427 -0.5774949  -7.526573  ] -9.412906453310034 False {}
Step: 130 [ 0.83704543 

In [35]:
observed_space = env.observation_space
print("The observation space:{}" .format(observed_space))

The observation space:Box([-1. -1. -8.], [1. 1. 8.], (3,), float32)


In [55]:
action_space = env.action_space
print("The action space: {}".format(action_space))

The action space: Box(-2.0, 2.0, (1,), float32)


In [41]:
print(env.observation_space.high)
print(env.observation_space.low)

[1. 1. 8.]
[-1. -1. -8.]


In [48]:
num_episodes = 10000
gamma = 0.99
alpha = 0.1
epsilon = 0.1

In [62]:
obs_bins = [
    np.linspace(-1, 1, 11),
    np.linspace(-1, 1, 11),
    np.linspace(-8, 8, 11),
]
action_vals = np.linspace(-2.0, 2.0, 5)

def discretize_observation(obs):
    return tuple(np.digitize(obs[i], obs_bins[i]) for i in range(len(obs)))

Q = defaultdict(lambda: np.zeros(len(action_vals)))

In [63]:
returns = []
epsilon_decay = 0.999
epsilon_min = 0.1


for episode in range(num_episodes):
    obs, _ = env.reset()
    state = discretize_observation(obs)
    trajectory = []
    done = False

    while not done:
        if np.random.rand() < epsilon:
            action_idx = np.random.randint(len(action_vals))
        else:
            action_idx = np.argmax(Q[state])

        action = np.array([action_vals[action_idx]])
        next_obs, reward, terminated, truncated, _ = env.step(action)
        next_state = discretize_observation(next_obs)

        trajectory.append((state, action_idx, reward))
        state = next_state
        done = terminated or truncated

    G = 0
    visited = set()
    for t in reversed(range(len(trajectory))):
        s, a, r = trajectory[t]
        G = r + gamma * G
        if (s, a) not in visited:
            Q[s][a] += alpha * (G - Q[s][a])
            visited.add((s, a))

    episode_return = sum([x[2] for x in trajectory])
    returns.append(episode_return)

    epsilon = max(epsilon_min, epsilon * epsilon_decay)

    if episode % 1000 == 0:
        avg_ret = np.mean(returns[-500:])
        print(f"Episode {episode}, average return (last 1000): {avg_ret:.2f}")

Episode 0, average return (last 1000): -1052.67
Episode 1000, average return (last 1000): -1196.91
Episode 2000, average return (last 1000): -1035.58
Episode 3000, average return (last 1000): -1056.55
Episode 4000, average return (last 1000): -1106.23
Episode 5000, average return (last 1000): -1029.69
Episode 6000, average return (last 1000): -909.29
Episode 7000, average return (last 1000): -1111.96
Episode 8000, average return (last 1000): -1068.32
Episode 9000, average return (last 1000): -969.10


In [57]:
def evaluate_policy(n_episodes=1000000):
    rewards = []
    for _ in range(n_episodes):
        obs, _ = env.reset()
        state = discretize_observation(obs)
        done = False
        total_reward = 0
        while not done:
            action_idx = np.argmax(Q[state])
            action = np.array([action_vals[action_idx]])
            obs, reward, terminated, truncated, _ = env.step(action)
            state = discretize_observation(obs)
            total_reward += reward
            done = terminated or truncated
        rewards.append(total_reward)
    print(f"Average return over {n_episodes} eval episodes: {np.mean(rewards)}")
evaluate_policy()

KeyboardInterrupt: 