In [1]:
! pip install "gymnasium[mujoco]"



In [78]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

import matplotlib.pyplot as plt

class RewardTracker:
    def __init__(self, window_size=100):
        """Tracks episode rewards and plots performance over time."""
        self.rewards = []  # Stores total rewards per episode
        self.timesteps = []  # Stores total timesteps per episode
        self.window_size = window_size  # Smoothing window size

    def update(self, total_reward, total_timesteps):
        """Log episode reward and the corresponding timestep."""
        self.rewards.append(total_reward)
        self.timesteps.append(total_timesteps)

    def plot(self, method_name="L-CLIP", ax=None):
        """Plot smoothed reward curve."""
        if len(self.rewards) < self.window_size:
            print("Not enough episodes for a smooth plot yet.")
            return

        # Compute moving average for smoothing
        smoothed_rewards = [
            sum(self.rewards[max(0, i - self.window_size):i]) / min(i, self.window_size)
            for i in range(1, len(self.rewards) + 1)
        ]

        if not ax:
            fig, ax = plt.subplots(figsize=(10, 5))
        ax.plot(self.timesteps, smoothed_rewards, label=method_name, color="b")
        ax.set_xlabel("Timesteps")
        ax.set_ylabel("Smoothed Reward")
        ax.set_title(f"PPO Training Performance ({method_name})")
        ax.legend()
        ax.grid(True)
        #plt.show()


In [117]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# Hyperparameters
ENV_NAME = "HalfCheetah-v5"
GAMMA = 0.99
LAMBDA = 0.95
EPSILON = 0.2
BATCH_SIZE = 64
LR = 2e-4
EPOCHS = 10
T_HORIZON = 1000  # Steps per update

# Define Policy Network
class PolicyNetwork(nn.Module):
    def __init__(self, state_dim, action_dim):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.Tanh(),
            nn.Linear(64, 64),
            nn.Tanh(),
            nn.Linear(64, action_dim)
        )
        self.log_std = nn.Parameter(torch.zeros(action_dim))  # Learnable log std dev

    def forward(self, state):
        mean = self.fc(state)
        std = self.log_std.exp()
        return mean, std

# Define Value Network
class ValueNetwork(nn.Module):
    def __init__(self, state_dim):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.Tanh(),
            nn.Linear(64, 64),
            nn.Tanh(),
            nn.Linear(64, 1)
        )

    def forward(self, state):
        return self.fc(state).squeeze(-1)

# Compute GAE
def compute_advantages(rewards, values, dones):
    advantages = []
    gae = 0
    for i in reversed(range(len(rewards))):
        delta = rewards[i] + GAMMA * values[i + 1] * (1 - dones[i]) - values[i]
        gae = delta + GAMMA * LAMBDA * (1 - dones[i]) * gae
        advantages.insert(0, gae)
    returns = torch.tensor(advantages, dtype=torch.float32) + torch.tensor(values[:-1], dtype=torch.float32)
    return torch.tensor(advantages), returns

# PPO Loss Variants
def ppo_loss(states, actions, old_log_probs, advantages, returns, policy, value_net, variant="clipping"):
    means, stds = policy(states)
    dist = torch.distributions.Normal(means, stds)
    new_log_probs = dist.log_prob(actions).sum(dim=-1)
    ratio = torch.exp(new_log_probs - old_log_probs)

    # Different PPO Variants
    if variant == "no_clipping":
        loss_policy = ratio * advantages
    elif variant == "clipping":
        clipped_ratio = torch.clamp(ratio, 1 - EPSILON, 1 + EPSILON)
        loss_policy = torch.min(ratio * advantages, clipped_ratio * advantages)
    elif variant == "kl_penalty":
        kl_div = torch.distributions.kl.kl_divergence(dist, torch.distributions.Normal(means.detach(), stds.detach())).mean()
        loss_policy = ratio * advantages - 0.3 * kl_div  # Beta = 0.01

    #loss_value = (value_net(states) - returns) ** 2
    return -loss_policy.mean() #+ 0.5 * loss_value.mean()

In [118]:
# Training Loop

def run_experiments(methods, n_repeat=3):
    experiment_dict = {f'{method}':[] for method in methods} # Store the reward trackers accross all runs
    for method in methods:
        for _ in range(n_repeat):
            env = gym.make(ENV_NAME)
            state_dim = env.observation_space.shape[0]
            action_dim = env.action_space.shape[0]

            policy = PolicyNetwork(state_dim, action_dim)
            value_net = ValueNetwork(state_dim)
            tracker = RewardTracker(window_size=100)
            total_timesteps = 0
            optimizer = optim.Adam(list(policy.parameters()) + list(value_net.parameters()), lr=LR)

            for episode in range(1000):
                states, actions, rewards, values, log_probs, dones = [], [], [], [], [], []
                state, _ = env.reset()
                total_reward = 0
                
                for _ in range(T_HORIZON):
                    state_tensor = torch.tensor(state, dtype=torch.float32)
                    mean, std = policy(state_tensor)
                    dist = torch.distributions.Normal(mean, std)
                    action = dist.sample()
                    
                    next_state, reward, done, _, _ = env.step(action.numpy())
                    total_reward += reward
                    total_timesteps += 1
                    
                    states.append(state_tensor)
                    actions.append(action)
                    rewards.append(reward)
                    values.append(value_net(state_tensor).item())
                    log_probs.append(dist.log_prob(action).sum().item())
                    dones.append(done)

                    state = next_state
                    if done:
                        state, _ = env.reset()
                
                tracker.update(total_reward, total_timesteps)
                values.append(value_net(torch.tensor(state, dtype=torch.float32)).item())
                advantages, returns = compute_advantages(rewards, values, dones)
                
                states = torch.stack(states)
                actions = torch.stack(actions)
                log_probs = torch.tensor(log_probs)
                
                for _ in range(EPOCHS):
                    optimizer.zero_grad()
                    loss = ppo_loss(states, actions, log_probs, advantages, returns, policy, value_net, variant=method)
                    loss.backward()
                    optimizer.step()

                print(f"Episode {episode}: Loss={loss.item()}")
            experiment_dict[f'{method}'].append(tracker)
    return experiment_dict


In [119]:
experiment_dict_v3 = run_experiments(methods=['no_clipping', 'clipping', 'kl_penalty'])

Episode 0: Loss=9.607189025185653
Episode 1: Loss=7.903552477386125
Episode 2: Loss=11.050854392909072
Episode 3: Loss=11.417674995370955
Episode 4: Loss=12.132350919020856
Episode 5: Loss=11.22732836368946
Episode 6: Loss=11.756258667152872
Episode 7: Loss=9.821506251932512
Episode 8: Loss=10.49372169753298
Episode 9: Loss=10.131035340576043
Episode 10: Loss=11.039521202892072
Episode 11: Loss=10.378698087119774
Episode 12: Loss=10.399661000838877
Episode 13: Loss=10.863254292780343
Episode 14: Loss=7.463039900811101
Episode 15: Loss=10.359267475493455
Episode 16: Loss=10.388498560112678
Episode 17: Loss=9.454386108370684
Episode 18: Loss=9.944240293723302
Episode 19: Loss=11.057172217580403
Episode 20: Loss=9.83177076780931
Episode 21: Loss=10.438539348664522
Episode 22: Loss=9.966528087578425
Episode 23: Loss=13.167568203923663
Episode 24: Loss=10.11169198645259
Episode 25: Loss=10.5364898874986
Episode 26: Loss=8.89547631195638
Episode 27: Loss=9.652203287613707
Episode 28: Loss=11

In [None]:
# Get saved experiment dicts from other running environments 
# Each dictionnary correspond to runned methods with different hyperparameters
import pickle

experiment_dict_1_v3 = pickle.load(open('experiment_dict_1_v3.pkl', 'rb'))
experiment_dict_2_v3 = pickle.load(open('experiment_dict_2_v3.pkl', 'rb'))
experiment_dict_4_v3 = pickle.load(open('experiment_dict_4_v3.pkl', 'rb'))

In [163]:
def evaluate_random_policy(env_name="HalfCheetah-v5", num_episodes=10):
    env = gym.make(env_name)
    rewards = []
    
    for _ in range(num_episodes):
        obs, _ = env.reset()
        episode_reward = 0
        
        for _ in range(1000):  # Fixed episode length
            action = env.action_space.sample()
            obs, reward, terminated, truncated, _ = env.step(action)
            episode_reward += reward
            
            if truncated:  # This should be the stopping condition
                break  
        
        rewards.append(episode_reward)
    
    env.close()
    return np.mean(rewards), np.std(rewards)

mean_reward_rand, std_reward_rand = evaluate_random_policy()
print(f"Random Policy Performance: Mean Reward = {mean_reward_rand:.2f} ± {std_reward_rand:.2f}")

Random Policy Performance: Mean Reward = -296.45 ± 83.59


In [145]:
import pandas as pd

WINDOW_SIZE = 20
def moving_average(data, window_size=WINDOW_SIZE):
    return np.convolve(data, np.ones(window_size)/window_size, mode='valid')

def experiment_dict_to_pandas(experiment_dict):
    df = pd.DataFrame()
    timesteps = []
    rewards = []
    methods = []
    episodes = []
    for method in experiment_dict.keys():
        for rewardtracker in experiment_dict[method]:
            timesteps.extend(rewardtracker.timesteps)
            episodes.extend([i+1 for i in range(len(rewardtracker.timesteps))])
            rewards.extend(rewardtracker.rewards)
            methods.extend([method]*len(rewardtracker.rewards)) # Not the most efficient though
    df['episode'] = episodes
    df['timestep'] = timesteps
    df['reward'] = rewards
    df['method'] = methods
    return df

In [None]:
# Transform experiments dictionnaries into dataframes
experiments_df_0 = experiment_dict_to_pandas(experiment_dict_v3)
experiments_df_1 = experiment_dict_to_pandas(experiment_dict_1_v3)
experiments_df_2 = experiment_dict_to_pandas(experiment_dict_2_v3)
experiments_df_4 = experiment_dict_to_pandas(experiment_dict_4_v3)

## Precision over the sets of hyperparameters used
"""
df_0: (no_clipping, no_hyperparameters), (clipping, eps=0.2), (kl, beta=0.3)
df_1: (clipping, eps=0.1), (kl, beta=1.0)
df_2: (clipping, eps=0.3), (kl, beta=3.0)
df_4: (kl, beta=10.0)
"""

In [179]:
experiments_df_1.tail()

Unnamed: 0,episode,timestep,reward,method
5995,996,996000,213.365066,kl_penalty
5996,997,997000,156.306518,kl_penalty
5997,998,998000,285.995344,kl_penalty
5998,999,999000,284.800613,kl_penalty
5999,1000,1000000,-217.302484,kl_penalty


In [248]:
RANDOM_PERFORMANCE = mean_reward_rand
BEST_PERFORMANCE = max(df['reward'][-100:].max() for df in [experiments_df_0, experiments_df_1, experiments_df_2, experiments_df_4])

def get_normalized_score(score, random_policy_score=RANDOM_PERFORMANCE, best_policy_score=BEST_PERFORMANCE):
    return (score - random_policy_score)/(best_policy_score - random_policy_score)

In [249]:
# Get the Normalized score for the last 100 episodes from each run
print('df_0 Normalized scores:')
experiments_df_0.groupby(['method', 'episode']).mean()\
        .groupby('method')\
        .aggregate(lambda x: x.iloc[-100:]\
        .mean())['reward']\
        .apply(get_normalized_score)

df_0 Normalized scores:


method
clipping       0.445632
kl_penalty     0.687630
no_clipping    0.664889
Name: reward, dtype: float64

In [250]:
print('df_1 Normalized scores:')
experiments_df_1.groupby(['method', 'episode']).mean()\
        .groupby('method')\
        .aggregate(lambda x: x.iloc[-100:]\
        .mean())['reward']\
        .apply(get_normalized_score)

df_1 Normalized scores:


method
clipping     -0.427569
kl_penalty    0.626479
Name: reward, dtype: float64

In [251]:
print('df_2 Normalized scores:')
experiments_df_2.groupby(['method', 'episode']).mean()\
        .groupby('method')\
        .aggregate(lambda x: x.iloc[-100:]\
        .mean())['reward']\
        .apply(get_normalized_score)

df_2 Normalized scores:


method
clipping      0.894946
kl_penalty    0.750918
Name: reward, dtype: float64

In [252]:
print('df_4 Normalized scores:')
experiments_df_4.groupby(['method', 'episode']).mean()\
        .groupby('method')\
        .aggregate(lambda x: x.iloc[-100:]\
        .mean())['reward']\
        .apply(get_normalized_score)

df_4 Normalized scores:


method
kl_penalty    0.650396
Name: reward, dtype: float64