INITIALIZE & TRAIN:

In [None]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import torch
import torch.nn as nn
from floris import FlorisModel
from pettingzoo import ParallelEnv

# --- 1. MULTI-AGENT FLORIS ENVIRONMENT ---
class FlorisMultiAgentEnv(ParallelEnv):
    """Cooperative wind-farm yaw-control environment backed by FLORIS.

    Four turbines sit on a 2 x 2 grid. Each turbine is one agent that emits a
    single action in [-1, 1]; the action is scaled by ``max_yaw_deg`` to get
    the yaw angle passed to FLORIS. All agents observe the same farm-level
    wind state ``[wind_direction, wind_speed, turbulence_intensity]`` and
    receive the same reward: the summed turbine power divided by 1e6.

    Args:
        config_path: Path of the FLORIS configuration file handed to
            ``FlorisModel``.
        max_yaw_deg: Yaw angle (degrees) that an action of magnitude 1 maps to.
    """

    def __init__(self, config_path, max_yaw_deg=25.0):
        super().__init__()
        # 1. Initialize the physics model
        self.fmodel = FlorisModel(config_path)
        D = 126.0  # layout spacing unit; presumably the rotor diameter in metres
        self.x_layout = [0, 0, 6 * D, 6 * D]
        self.y_layout = [0, 3 * D, 0, 3 * D]
        self.fmodel.set(layout_x=self.x_layout, layout_y=self.y_layout)
        self.max_yaw_deg = float(max_yaw_deg)

        # 2. Agents must exist before the per-agent space dictionaries are built
        self.possible_agents = [f"turbine_{i}" for i in range(len(self.x_layout))]
        self.agents = self.possible_agents[:]

        # 3. Spaces. The bounds are kept so step() can clip with float32 arrays.
        self._obs_low = np.array([260.0, 5.0, 0.03], dtype=np.float32)
        self._obs_high = np.array([290.0, 15.0, 0.25], dtype=np.float32)
        obs_space = spaces.Box(low=self._obs_low, high=self._obs_high, dtype=np.float32)

        # Local view for the actor
        self.observation_spaces = {a: obs_space for a in self.possible_agents}

        # Global view for the critic (concatenated observations of all turbines)
        n_agents = len(self.possible_agents)
        self.state_space = spaces.Box(
            low=np.tile(self._obs_low, n_agents),
            high=np.tile(self._obs_high, n_agents),
            dtype=np.float32,
        )
        self.shared_observation_spaces = {a: self.state_space for a in self.possible_agents}

        self.action_spaces = {
            a: spaces.Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float32)
            for a in self.possible_agents
        }

        self.max_steps = 100
        self.current_step = 0
        self._initial_wind_state = np.array([275.0, 10.0, 0.06], dtype=np.float32)
        self.wind_state = self._initial_wind_state.copy()
        # None -> draw from the global ``np.random`` state (original behaviour);
        # replaced by a seeded Generator when reset(seed=...) is called.
        self._rng = None

    def state(self):
        """Return the global state: the wind state repeated once per agent."""
        return np.tile(self.wind_state, len(self.possible_agents))

    def _observations(self):
        """Give every agent its own copy of the current wind state."""
        return {a: self.wind_state.copy() for a in self.possible_agents}

    def _infos(self):
        """Attach the global state to every agent's info dict."""
        return {a: {"state": self.state()} for a in self.possible_agents}

    def reset(self, seed=None, options=None):
        """Start a new episode from the fixed initial wind state.

        Args:
            seed: Optional seed. When given, the wind drift is drawn from a
                NumPy Generator seeded with it from now on.
            options: Unused.

        Returns:
            ``(observations, infos)`` dictionaries keyed by agent name.
        """
        if seed is not None:
            self._rng = np.random.default_rng(seed)

        # PettingZoo's Parallel API expects `agents` to be repopulated on reset
        # (step() empties it when the episode ends).
        self.agents = self.possible_agents[:]
        self.current_step = 0
        self.wind_state = self._initial_wind_state.copy()

        return self._observations(), self._infos()

    def step(self, actions):
        """Drift the wind, apply the agents' yaw commands and run FLORIS.

        Args:
            actions: Mapping from agent name to an array-like whose first
                element is the normalised yaw command.

        Returns:
            ``(observations, rewards, terminations, truncations, infos)``
            dictionaries keyed by agent name.
        """
        self.current_step += 1

        # Random wind drift on direction and speed; turbulence intensity is
        # left unchanged. A new float32 array is built each step so arrays
        # already handed out as observations are never mutated.
        rng = self._rng if self._rng is not None else np.random
        drift = np.array([rng.normal(0, 0.2), rng.normal(0, 0.05), 0.0])
        self.wind_state = np.clip(
            self.wind_state + drift, self._obs_low, self._obs_high
        ).astype(np.float32)

        # Apply yaws to FLORIS. Actions are clipped to the declared [-1, 1]
        # action box first because a sampled policy output may lie outside it.
        unit_actions = np.array(
            [np.asarray(actions[a], dtype=np.float64).reshape(-1)[0] for a in self.possible_agents]
        )
        yaws = np.clip(unit_actions, -1.0, 1.0) * self.max_yaw_deg
        self.fmodel.set(
            wind_directions=[float(self.wind_state[0])],
            wind_speeds=[float(self.wind_state[1])],
            turbulence_intensities=[float(self.wind_state[2])],
            yaw_angles=yaws[np.newaxis, :],
        )
        self.fmodel.run()

        # Global reward: sum of all turbine power (encourages coordination)
        reward = float(np.sum(self.fmodel.get_turbine_powers()) / 1e6)
        rewards = {a: reward for a in self.possible_agents}

        terminated = self.current_step >= self.max_steps
        terminations = {a: terminated for a in self.possible_agents}
        truncations = {a: False for a in self.possible_agents}

        observations = self._observations()
        infos = self._infos()

        # Signal the end of the episode the PettingZoo way: an empty `agents`
        # list. Loops that use this list to decide when to call reset() would
        # otherwise never reset the environment.
        if terminated:
            self.agents = []

        return observations, rewards, terminations, truncations, infos

# --- 2. MODEL DEFINITIONS ---
from skrl.models.torch import DeterministicMixin, GaussianMixin, Model

class Actor(GaussianMixin, Model):
    """Gaussian policy: a two-hidden-layer MLP producing the action mean.

    The log standard deviation is a learnable parameter with one entry per
    action dimension, initialised to zero and independent of the input.
    """

    def __init__(self, observation_space, action_space, device, **kwargs):
        Model.__init__(self, observation_space, action_space, device, **kwargs)
        GaussianMixin.__init__(self, reduction="sum")

        hidden_width = 64
        layers = [
            nn.Linear(self.num_observations, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, self.num_actions),
        ]
        self.net = nn.Sequential(*layers)
        self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))

    def compute(self, inputs, role):
        """Return ``(mean_actions, log_std, {})`` for the given ``inputs["states"]``."""
        mean_actions = self.net(inputs["states"])
        return mean_actions, self.log_std_parameter, {}

class Critic(DeterministicMixin, Model):
    """Deterministic value function: an MLP (256 -> 128 -> 1) over its input states."""

    def __init__(self, observation_space, action_space, device, **kwargs):
        Model.__init__(self, observation_space, action_space, device, **kwargs)
        DeterministicMixin.__init__(self)

        first_width, second_width = 256, 128
        layers = [
            nn.Linear(self.num_observations, first_width),
            nn.ReLU(),
            nn.Linear(first_width, second_width),
            nn.ReLU(),
            nn.Linear(second_width, 1),
        ]
        self.net = nn.Sequential(*layers)

    def compute(self, inputs, role):
        """Return ``(value, {})`` for the given ``inputs["states"]``."""
        value = self.net(inputs["states"])
        return value, {}

# --- 3. INITIALIZATION AND TRAINING ---
from skrl.envs.wrappers.torch import wrap_env
from skrl.memories.torch import RandomMemory
from skrl.multi_agents.torch.mappo import MAPPO, MAPPO_DEFAULT_CONFIG
from skrl.trainers.torch import SequentialTrainer

# Env setup
raw_env = FlorisMultiAgentEnv("data_generation/farm_types/gch.yaml")
env = wrap_env(raw_env, wrapper="pettingzoo")

# PPO-style agents learn from the rollout collected since the last update, so
# the memory must hold exactly one rollout: memory_size == cfg["rollouts"].
ROLLOUT_LENGTH = 2000

# One memory PER AGENT. MAPPO records each agent's transitions into
# memories[agent_name]; a single shared instance would make every agent write
# into the same buffer.
memories = {
    agent_name: RandomMemory(memory_size=ROLLOUT_LENGTH, num_envs=env.num_envs, device=env.device)
    for agent_name in env.possible_agents
}
# Kept so that later cells referring to `memory` keep working.
memory = memories[env.possible_agents[0]]

# Parameter sharing: every turbine uses the same actor and the same critic.
# The actor sees the local 3-element observation ...
shared_policy = Actor(env.observation_spaces["turbine_0"], env.action_spaces["turbine_0"], env.device)

# ... while the critic sees the shared (concatenated, 12-element) observation.
shared_value = Critic(env.shared_observation_spaces["turbine_0"], env.action_spaces["turbine_0"], env.device)

models = {a: {"policy": shared_policy, "value": shared_value} for a in env.possible_agents}

# MAPPO configuration
cfg_agent = MAPPO_DEFAULT_CONFIG.copy()
cfg_agent["rollouts"] = ROLLOUT_LENGTH     # must match the memory size above
cfg_agent["random_timesteps"] = 0          # start acting with the policy immediately
cfg_agent["learning_rate"] = 5e-4
cfg_agent["state_preprocessor"] = None     # observations are fed to the models unscaled

# MAPPO takes the per-agent DICTIONARY of memories via `memories=`
agent = MAPPO(
    possible_agents=env.possible_agents,
    models=models,
    memories=memories,
    cfg=cfg_agent,
    observation_spaces=env.observation_spaces,
    action_spaces=env.action_spaces,
    device=env.device,
    shared_observation_spaces=env.shared_observation_spaces,
)

# TRAIN
trainer = SequentialTrainer(
    env=env,
    agents=agent,
    cfg={"timesteps": 50000,
         "headless": True,
         "disable_progressbar": False},
)
trainer.train()

[skrl:INFO] Environment wrapper: Petting Zoo


  3%|▎         | 1530/50000 [02:00<52:29, 15.39it/s]  

EVALUATE

In [None]:
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def evaluate_mappo_vs_baseline(mappo_agent, env, n_episodes=20, deterministic=True):
    """Compare the MAPPO yaw policy with the zero-yaw baseline in FLORIS.

    For each episode the environment is reset, the policy is queried once on
    the reset observation, and FLORIS is run twice under the observed wind
    conditions: once with all yaw angles at 0 and once with the policy's yaw
    angles. Only this single step is evaluated per episode. If ``env.reset()``
    always returns the same observation (as the environment defined in this
    file does) and ``deterministic`` is True, every row will be identical.

    Args:
        mappo_agent: Trained agent exposing ``device``, ``models`` and
            ``act(states, timestep, timesteps)``.
        env: Environment exposing ``possible_agents``, ``reset()`` and the
            FLORIS model as ``fmodel``. Observations are read as
            ``[wind_direction, wind_speed, turbulence_intensity]``.
        n_episodes: Number of reset/evaluate rounds.
        deterministic: If True, use the policy's mean action when
            ``act`` reports it under ``"mean_actions"``; otherwise use the
            sampled action. Putting a model in ``eval()`` mode alone does not
            stop a Gaussian policy from sampling.

    Returns:
        pandas.DataFrame with columns ``Episode``, ``Base_kW``, ``MAPPO_kW``
        and ``Gain_%`` (NaN when the baseline power is 0).
    """
    results = []
    device = mappo_agent.device
    agent_names = list(env.possible_agents)
    n_turbines = len(agent_names)
    # Same scaling the environment applies to actions; 25 degrees unless the
    # environment advertises another limit.
    max_yaw_deg = float(getattr(env, "max_yaw_deg", 25.0))

    # Put the policy networks in evaluation mode
    for model_dict in mappo_agent.models.values():
        model_dict["policy"].eval()

    def farm_power_kw(wind, yaw_deg):
        """Run FLORIS for one wind condition / yaw setting; return summed power / 1e3."""
        env.fmodel.set(
            wind_directions=[float(wind[0])],
            wind_speeds=[float(wind[1])],
            turbulence_intensities=[float(wind[2])],
            yaw_angles=np.asarray(yaw_deg, dtype=float).reshape(1, -1),
        )
        env.fmodel.run()
        return float(np.sum(env.fmodel.get_turbine_powers())) / 1e3

    print(f"Starting MAPPO Evaluation over {n_episodes} episodes...")

    for i in range(n_episodes):
        # 1. Reset environment
        obs_dict, _ = env.reset()

        # 2. Query the policy on the reset observation
        with torch.no_grad():
            torch_obs = {
                a: torch.as_tensor(obs, device=device, dtype=torch.float32).reshape(1, -1)
                for a, obs in obs_dict.items()
            }
            actions, _, outputs = mappo_agent.act(torch_obs, timestep=0, timesteps=0)

            unit_actions = []
            for name in agent_names:
                chosen = actions[name]
                if deterministic and isinstance(outputs, dict):
                    extra = outputs.get(name)
                    if isinstance(extra, dict) and "mean_actions" in extra:
                        chosen = extra["mean_actions"]
                unit_actions.append(float(torch.as_tensor(chosen).reshape(-1)[0].cpu()))

        # Keep the commands inside the [-1, 1] action box before scaling.
        yaws = np.clip(np.array(unit_actions), -1.0, 1.0) * max_yaw_deg

        # 3. Wind conditions the policy actually observed. The FLORIS model
        #    keeps whatever conditions were set last, so they are set
        #    explicitly for both runs instead of relying on reset().
        wind = torch_obs[agent_names[0]].detach().cpu().numpy().reshape(-1)

        # 4. Baseline (all yaw = 0) and MAPPO runs under identical wind
        base_power = farm_power_kw(wind, np.zeros(n_turbines))
        mappo_power = farm_power_kw(wind, yaws)

        if base_power != 0:
            gain_pct = 100 * (mappo_power - base_power) / base_power
        else:
            gain_pct = float("nan")

        results.append({
            "Episode": i,
            "Base_kW": base_power,
            "MAPPO_kW": mappo_power,
            "Gain_%": gain_pct,
        })

    print("Evaluation Complete.")
    return pd.DataFrame(results)

def plot_mappo_performance(df_results):
    """Draw a two-panel summary of an evaluation DataFrame and show it.

    Left panel: box plot with overlaid points of the ``Gain_%`` column and a
    dashed reference line at 0. Right panel: ``Base_kW`` and ``MAPPO_kW``
    plotted against ``Episode``.
    """
    # Clean, academic-looking theme
    sns.set_theme(style="whitegrid")
    _fig, panels = plt.subplots(1, 2, figsize=(14, 5))
    gain_ax, power_ax = panels[0], panels[1]

    # --- 1. Statistical box plot (the gain distribution) ---
    gains = df_results["Gain_%"]
    sns.boxplot(y=gains, ax=gain_ax, color="#5da5da", width=0.4)
    sns.stripplot(y=gains, ax=gain_ax, color="black", alpha=0.3)

    gain_ax.axhline(0, color='red', linestyle='--', linewidth=1.2)
    gain_ax.set_title("Distribution of Power Gains (%)", fontsize=12, fontweight='bold')
    gain_ax.set_ylabel("Gain over Greedy Baseline (%)")

    # --- 2. Episode comparison (baseline vs MAPPO) ---
    episodes = df_results["Episode"]
    curves = (
        ("Base_kW", dict(label="Baseline (0° Yaw)", marker='o', linestyle='-',
                         color='gray', alpha=0.6)),
        ("MAPPO_kW", dict(label="MAPPO Strategy", marker='s', linestyle='-',
                          color='#ee6677')),
    )
    for column, line_style in curves:
        power_ax.plot(episodes, df_results[column], **line_style)

    power_ax.set_title("Power Output Comparison", fontsize=12, fontweight='bold')
    power_ax.set_xlabel("Episode")
    power_ax.set_ylabel("Total Farm Power (kW)")
    power_ax.legend()

    plt.tight_layout()
    plt.show()

# --- EXECUTE ---
# Evaluate the trained agent, print summary statistics of the percentage gain,
# then plot the comparison.
df_mappo_results = evaluate_mappo_vs_baseline(agent, env, n_episodes=20)
gain_summary = df_mappo_results["Gain_%"].describe()
print(gain_summary)
plot_mappo_performance(df_mappo_results)

HYPERPARAMETER TUNING

In [None]:
# OPTUNA

import optuna
from skrl.memories.torch import RandomMemory
from skrl.multi_agents.torch.mappo import MAPPO, MAPPO_DEFAULT_CONFIG
from skrl.envs.wrappers.torch import wrap_env


def objective(trial):
    """Optuna objective: train MAPPO briefly and score it on one episode.

    Samples a learning rate, an entropy-loss scale and a mini-batch count,
    trains a fresh MAPPO agent for 5000 timesteps on a new environment, then
    rolls the policy out for up to 100 steps.

    Args:
        trial: The Optuna trial used to draw hyperparameters.

    Returns:
        float: Sum over the evaluation steps of the per-step reward averaged
        across agents.
    """
    # Local imports so this cell does not depend on imports made in other cells.
    import copy
    from skrl.trainers.torch import SequentialTrainer

    # 1. Hyperparameters to tune
    lr = trial.suggest_float("lr", 1e-4, 1e-3, log=True)
    entropy_scale = trial.suggest_float("entropy_scale", 0.001, 0.05)
    mini_batches = trial.suggest_categorical("mini_batches", [2, 4, 8])

    # 2. Env setup
    env = FlorisMultiAgentEnv("data_generation/farm_types/gch.yaml")  # Ensure path is correct
    env = wrap_env(env, wrapper="pettingzoo")

    # 3. Memories & models: one memory and one actor/critic pair per turbine.
    #    The memory holds exactly one rollout, so its size is also used for
    #    cfg["rollouts"] below.
    rollouts = 1000
    memories = {
        agent_name: RandomMemory(memory_size=rollouts, num_envs=env.num_envs, device=env.device)
        for agent_name in env.possible_agents
    }
    models = {
        agent_name: {
            "policy": Actor(env.observation_spaces[agent_name], env.action_spaces[agent_name], env.device),
            "value": Critic(env.shared_observation_spaces[agent_name], env.action_spaces[agent_name], env.device),
        }
        for agent_name in env.possible_agents
    }

    # 4. Agent config. Deep copy: the nested "experiment" dict is modified
    #    below and must not leak into MAPPO_DEFAULT_CONFIG for other trials.
    cfg = copy.deepcopy(MAPPO_DEFAULT_CONFIG)
    cfg["rollouts"] = rollouts
    cfg["learning_rate"] = lr
    cfg["entropy_loss_scale"] = entropy_scale
    cfg["mini_batches"] = mini_batches
    cfg["experiment"]["write_interval"] = 0       # Faster tuning without logs
    cfg["experiment"]["checkpoint_interval"] = 0  # ... and without checkpoints

    agent = MAPPO(possible_agents=env.possible_agents, models=models, memories=memories,
                  cfg=cfg, observation_spaces=env.observation_spaces,
                  action_spaces=env.action_spaces, device=env.device,
                  shared_observation_spaces=env.shared_observation_spaces)

    # 5. Train (short run for tuning). Training is driven by a trainer; the
    #    agent itself has no train(timesteps=...) entry point.
    trainer = SequentialTrainer(
        env=env,
        agents=agent,
        cfg={"timesteps": 5000, "headless": True, "disable_progressbar": True},
    )
    trainer.train()

    # 6. Evaluate for 1 episode
    total_reward = 0.0
    obs, _ = env.reset()
    with torch.no_grad():
        for _ in range(100):
            actions = agent.act(obs, timestep=0, timesteps=0)[0]
            obs, rewards, terminated, truncated, _ = env.step(actions)
            # Rewards come back as tensors from the wrapped env; reduce them
            # to a plain float so Optuna receives a number.
            step_rewards = [
                float(torch.as_tensor(r, dtype=torch.float32).mean()) for r in rewards.values()
            ]
            total_reward += sum(step_rewards) / len(step_rewards)
            done_flags = list(terminated.values()) + list(truncated.values())
            if any(bool(torch.as_tensor(flag).any()) for flag in done_flags):
                break

    return total_reward

# Run study: maximise the objective over 10 trials and report the best
# hyperparameter set found.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)
best_params = study.best_params
print("Best Params:", best_params)