# Installations & Imports

In [None]:
!pip install deap
!pip install gymnasium
!pip install ray[rllib]

Collecting deap
  Downloading deap-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (135 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/135.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.4/135.4 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: deap
Successfully installed deap-1.4.1
Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1
Collecting ray[rllib]
  Downloading ray-2.9.3-cp310-cp310-manylinux2014_x86_64.whl (64.9 MB)

In [None]:
import os
import math
import random
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
from typing import Any, Callable, Dict, List, Optional, Tuple
from deap import base, creator, tools, algorithms
import ray
from ray.rllib.algorithms.algorithm import Algorithm
from ray.rllib.algorithms.ppo import PPOConfig
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

  if (distutils.version.LooseVersion(tf.__version__) <


# Classes & Helpers

## Data

In [70]:
class Data():
    def __init__(self,
                 x_min, x_max,
                 t_min, t_max,
                 test_dir,
                 eps=1e-5,
                 device='cpu',
                 dtype=torch.float32):

        self.x_min = x_min
        self.x_max = x_max
        self.t_min = t_min
        self.t_max = t_max
        self.test_dir = test_dir
        self.eps = eps
        self.device = device
        self.dtype = dtype


    def _generate_random_numbers(self, min_, max_, N):
        return min_ + (max_ - min_) * torch.rand(size=(N,), dtype=self.dtype)


    # *** Create in-domain points ***
    def sample_domain(self, N_domain, x_min, x_max, t_min, t_max):
        # Random Grid
        x_domain = self._generate_random_numbers(x_min, x_max, N_domain)
        t_domain = self._generate_random_numbers(t_min, t_max, N_domain)
        domain_data = torch.stack((x_domain, t_domain), dim=1)
        return torch.tensor(domain_data, dtype=self.dtype, device=self.device, requires_grad=True)


    # *** Boundary Conditions ***
    def sample_boundary(self, Nt_bound):
        # Random boundary points
        t_bound = self._generate_random_numbers(self.t_min, self.t_max, Nt_bound)
        x_left = - torch.ones(1, dtype=self.dtype)
        x_right = torch.ones(1, dtype=self.dtype)

        bound_data_left = torch.stack(torch.meshgrid(x_left, t_bound)).view(2, -1).permute(1, 0)
        bound_data_right = torch.stack(torch.meshgrid(x_right, t_bound)).view(2, -1).permute(1, 0)
        bound_data = torch.cat([bound_data_left, bound_data_right]).requires_grad_(True).to(self.device)

        u_bound = torch.zeros(len(bound_data), 1, dtype=self.dtype, device=self.device)

        return bound_data, u_bound


    # *** Initial Condition ***
    def sample_initial(self, Nx_init, x_min, x_max):
        # Random initial points
        x_init = self._generate_random_numbers(x_min, x_max, Nx_init)
        t_init = torch.zeros(1, dtype=self.dtype)
        init_data = torch.stack(torch.meshgrid(x_init, t_init)).view(2, -1).permute(1, 0).requires_grad_(True).to(self.device)

        u_init = - torch.sin(math.pi * x_init)

        return init_data, u_init

    # *** Test set ***
    def sample_test(self):
        test_data = pd.read_csv(self.test_dir).to_numpy()
        return torch.tensor(test_data, dtype=self.dtype, device=self.device, requires_grad=True)

## Network

In [71]:
class MLP(nn.Module):
    def __init__(self, layers, activation=nn.Tanh(), weight_init=None, bias_init=None, device='cpu'):
        super().__init__()
        self.n_layers = len(layers) - 1
        self.layers = layers
        self.activation = activation
        self.weight_init = weight_init
        self.bias_init = bias_init

        dense_layers = [
            self.dense_layer(in_features=self.layers[i], out_features=self.layers[i + 1])
            for i in range(self.n_layers - 1)]
        dense_layers.append(nn.Linear(in_features=self.layers[-2], out_features=self.layers[-1]))

        self.mlp = nn.Sequential(*dense_layers).to(device)

    def dense_layer(self, in_features, out_features):
        dense_layer = nn.Sequential(
            nn.Linear(in_features=in_features, out_features=out_features),
        )

        if self.weight_init is not None:
            self.weight_init(dense_layer[0].weight)

        if self.bias_init is not None:
            self.bias_init(dense_layer[0].bias)

        dense_layer.add_module("activation", self.activation)
        return dense_layer

    def forward(self, x):
        return self.mlp(x)

## PINN-Base

In [95]:
class PINNBase():
    def __init__(self,
                 layers,
                 activation,
                 device):

        self.v = 0.01 / math.pi

        # Define the model
        self.model = MLP(layers=layers,
                         activation=activation,
                         weight_init=lambda m: nn.init.xavier_normal_(m.data, nn.init.calculate_gain('tanh')),
                         bias_init=lambda m: nn.init.zeros_(m.data),
                         device=device)

        # Set the optimizers
        adam = torch.optim.Adam(self.model.parameters())
        lbfgs = torch.optim.LBFGS(self.model.parameters(),
                                  lr=1,
                                  max_iter=5,
                                #   max_iter=5000,
                                  max_eval=None,
                                  tolerance_grad=1e-07,
                                  tolerance_change=1e-09,
                                  history_size=100,
                                  line_search_fn='strong_wolfe')

        self.optimizers = {"adam": adam, "lbfgs": lbfgs}

        # Set the Loss function
        self.criterion = nn.MSELoss()

        # Set the MAE criterion for test data only
        self.l1_loss = nn.L1Loss()


    def forward(self, x):
        return self.model(x)


    def grad(self, output, input):
        return torch.autograd.grad(
                    output, input,
                    grad_outputs=torch.ones_like(output),
                    retain_graph=True,
                    create_graph=True
                )[0]


    def calculate_pde_residual(self, x):
        # Forward pass
        u = self.forward(x)

        # Calculate 1st and 2nd derivatives
        du_dX = self.grad(u, x)
        du_dXX = self.grad(du_dX, x)

        # Retrieve the partial gradients
        du_dt = du_dX[:, 1].flatten()
        du_dx = du_dX[:, 0].flatten()
        du_dxx = du_dXX[:, 0].flatten()

        pde_res = du_dt + u.flatten() * du_dx - self.v * du_dxx

        return u, pde_res


    def calculate_pde_loss(self, data):
        # Calculate the domain loss
        _, self.pde_res = self.calculate_pde_residual(data)
        pde_target = torch.zeros_like(self.pde_res)
        return self.criterion(self.pde_res, pde_target)


    def calculate_total_loss(self, data):
        # Calculate boundary loss
        loss_b = self.criterion(
            self.forward(data["bound_data"]).flatten(),
            data["u_bound"].flatten()
        )

        # Calculate initial loss
        loss_i = self.criterion(
            self.forward(data["init_data"]).flatten(),
            data["u_init"].flatten()
        )

        # Calculate the domain loss
        domain_data = torch.cat((data["domain_data"], data["anchors"]), dim=0)
        loss_pde = self.calculate_pde_loss(domain_data)

        # Calculate total discriminator loss
        return loss_b + loss_i + loss_pde


    def evaluate_pinn(self, test_data):
        _, pde_res = self.calculate_pde_residual(test_data)
        pde_target = torch.zeros_like(pde_res)
        return self.l1_loss(pde_res, pde_target)


    def train_step(self, data):
        loss = self.calculate_total_loss(data)
        loss.backward()
        return loss


    def closure(self):
        self.lbfgs_optimizer.zero_grad()
        return self.train_step(self.data)

## PINN

In [103]:
class PINN():
    def __init__(self,
                 x_min, x_max,
                 t_min, t_max,
                 N_domain,
                 Nx_init,
                 Nt_bound,
                 test_dir,
                 general_max_episode_steps,
                 layers, activation,
                 checkpoint_path,
                 eps=1e-5,
                 device='cpu',
                 dtype=torch.float32):

        # Constants
        self.checkpoint_path = checkpoint_path
        self.device = device
        self.dtype = dtype
        self.x_min = x_min
        self.x_max = x_max
        self.t_min = t_min
        self.t_max = t_max
        self.general_max_episode_steps = general_max_episode_steps
        self.data = {}

        # Create real data
        self.data_init = Data(x_min, x_max,
                              t_min, t_max,
                              test_dir,
                              eps,
                              device,
                              dtype)

        # Create train data
        self.data["domain_data"] = self.data_init.sample_domain(N_domain, self.x_min, self.x_max, self.t_min, self.t_max)
        self.data["bound_data"], self.data["u_bound"] = self.data_init.sample_boundary(Nt_bound)
        self.data["init_data"], self.data["u_init"] = self.data_init.sample_initial(Nx_init, self.x_min, self.x_max)

        # Create test data
        self.test_data = self.data_init.sample_test()

        # Create base PINN
        self.base_pinn = PINNBase(layers, activation, device)


    def add_anchors(self, step, point):
        if step == 0:
            self.data["anchors"] = torch.empty(self.general_max_episode_steps, 2)
        else:
            new_anchors = torch.tensor(point, dtype=self.dtype, device=self.device, requires_grad=True).view(-1, 2)
            self.data["anchors"] = torch.cat((self.data["anchors"], new_anchors), dim=0)


    def train_with_adam(self, N_adam, data):
        optimizer = self.base_pinn.optimizers['adam']

        for epoch in range(1, N_adam + 1):
            optimizer.zero_grad()
            loss = self.base_pinn.train_step(data)
            optimizer.step()

        return loss


    def train_with_lbfgs(self, N_lbfgs, data):
        self.base_pinn.lbfgs_optimizer = self.base_pinn.optimizers["lbfgs"]
        self.base_pinn.data = data

        for epoch in range(1, N_lbfgs + 1):
            loss = self.base_pinn.lbfgs_optimizer.step(self.base_pinn.closure)

        return loss


    def checkpoint(self):
        torch.save({
            "model": self.base_pinn.model.state_dict()
        }, self.checkpoint_path)


    def format_loss(self, loss):
        if loss == 0:
            return "0.0e+00"

        # Calculate the exponent part
        exponent = int(math.log10(abs(loss)))

        # Determine the format based on the value of the loss
        if abs(loss) < 1:
            formatted_loss = f"{loss:.2e}"
        else:
            # Adjust the sign of the formatted loss
            sign = "-" if loss < 0 else ""

            # Calculate the number of decimal places
            decimal_places = 2 - exponent

            # Ensure at least two decimal places
            decimal_places = max(decimal_places, 2)

            # Format the loss with the correct sign
            formatted_loss = f"{sign}{abs(loss):.{decimal_places}e}"

        return formatted_loss


    def keep_checkpoints_and_print_losses(self, iter, patience, print_every, loss, loss_test):

        loss_str = self.format_loss(loss)
        loss_test_str = self.format_loss(loss_test)

        if iter == 1:
            self.best_val_loss = loss_test
            self.best_epoch = -1
            self.checkpoint()
            self.flag = 1
            print(f"Iteration: {iter} | loss: {loss_str} | test_mae: {loss_test_str} - *Checkpoint*")
        else:
            if loss_test < self.best_val_loss:
                self.best_val_loss = loss_test
                self.best_epoch = iter
                self.checkpoint()
                self.flag = 1
                if iter % print_every == 0:
                    print(f"Iteration: {iter} | loss: {loss_str} | test_mae: {loss_test_str} - *Checkpoint*")
            elif iter - self.best_epoch > patience:
                if iter % print_every == 0:
                    self.early_stopping_applied = 1
                    print(f"Iteration: {iter} | loss: {loss_str} | test_mae: {loss_test_str}")
                return

        if (self.flag == 0) and (iter % print_every == 0):
            print(f"Iteration: {iter} | loss: {loss_str} | test_mae: {loss_test_str}")


    def train(self, iters, patience, print_every, N_adam, N_lbfgs):
        print(f"PINN: {iters} iterations")
        print(f"a. PINN: {N_adam} epochs --> Adam")
        print(f"b. PINN: {N_lbfgs} epochs --> L-BFGS")

        for iter in tqdm(range(1, iters + 1)):
            self.flag = 0
            self.early_stopping_applied = 0

            # Train with adam
            print(f"\nTraining with ADAM...")
            loss = self.train_with_adam(N_adam, self.data)

            # Train with L-BFGS
            print(f"\nTraining with L-BFGS...")
            loss = self.train_with_lbfgs(N_lbfgs, self.data)

            # Evaluate on test
            loss_test = self.base_pinn.evaluate_pinn(self.test_data)

            # Keep check points and print losses
            self.keep_checkpoints_and_print_losses(iter, patience, print_every, loss, loss_test)
            if self.early_stopping_applied:
                print(f"\nEarly stopping applied at epoch {iter}.")
                break

        return loss_test.detach().cpu().numpy()

## Environment

In [104]:
class GeneralEnv(gym.Env):
    def __init__(self, env_config: Optional[Dict] = None):
        super().__init__()

        # PINN
        self.pinn = env_config["pinn"]
        self.iterations = env_config["iterations"]
        self.patience = env_config["patience"]
        self.print_every = env_config["print_every"]
        self.num_epochs_adam = env_config["num_epochs_adam"]
        self.num_epochs_lbfgs = env_config["num_epochs_lbfgs"]

        # PPO Output: One 2D Point
        self.action_space = gym.spaces.Box(
            low=np.float32([x_min, t_min]),
            high=np.float32([x_max, t_max]),
            dtype=np.float32,
            shape=(2,)
        )

        # PPO Input: Generated 2D Point, u, pde, step
        self.observation_space = gym.spaces.Box(
            low=np.float32([x_min, t_min, -np.inf, -np.inf, 0.0]),
            high=np.float32([x_max, t_max, np.inf, np.inf, 1.0]),
            dtype=np.float32,
            shape=(5,)
        )

        # Agent
        self.max_anchors = env_config['max_anchors']
        self.sampled_points = []
        self._step_counter = 0
        self._trial = -1
        self._eval_error = None

        # Initialize GENERAL history log
        self._general_history_fp = 'general_error_history.csv'
        self._initialize_log_file()


    def _initialize_log_file(self):
        with open(file=self._general_history_fp, mode='w', newline='\n') as file:
            s = 'trial'
            for i in range(self.max_anchors):
                s += f',x_{i + 1},t_{i + 1}'
            log = f'{s}\n'
            file.write(log)


    def _store_trial_to_log(self):
        num_sampled = len(self.sampled_points)
        assert self._eval_error is not None and num_sampled > 0
        assert num_sampled == self.max_anchors

        with open(file=self._general_history_fp, mode='a', newline='\n') as file:
            s = f'{self._trial}'
            for point in self.sampled_points:
                s += f',{point[0]},{point[1]}'
            log = f'{s}\n'
            file.write(log)


    def _construct_observation(self, action: Optional[np.ndarray]=None):
        # If action is None, then it randomly samples a 2D point
        if action is None:
            x_rand = np.random.uniform(low=x_min, high=x_max, size=(1, 1))
            t_rand = np.random.uniform(low=t_min, high=t_max, size=(1, 1))
            action = np.hstack((x_rand, t_rand))
        else:
            assert action.shape == (2,), f'Action should be 1D array with 2 values, got {action.shape}'

            action = np.float32([action])

        assert action.shape == (1, 2), f'Action should be 2D array with one 2D point, got {action.shape}'

        action_torch = torch.tensor(action, dtype=self.pinn.dtype, device=self.pinn.device, requires_grad=True)
        u, pde_res = self.pinn.base_pinn.calculate_pde_residual(x=action_torch)
        u = u.detach().cpu().numpy()
        pde_res = np.expand_dims(pde_res.detach().cpu().numpy(), axis=-1)
        normalized_step = np.float32([[self._step_counter/self.max_anchors]])

        assert u.shape == pde_res.shape == (1, 1), f'PINN output is expected to be (1,1) for a single 2D point, got {u.shape} and {pde_res.shape}'

        new_obs = np.squeeze(np.hstack((action, u, pde_res, normalized_step)))

        return np.squeeze(new_obs)


    def reset(
            self,
            *,
            seed: int | None = None,
            options: dict[str, Any] | None = None,
    ) -> Tuple[np.ndarray, dict[str, Any]]:

        # Compute PINN error & Store it to history log file
        self._step_counter = 0
        self._trial += 1

        # Initialize an empty anchors tensor
        self.pinn.add_anchors(step=self._step_counter, point=None)
        self.sampled_points = []

        return self._construct_observation(action=None)


    def step(self, action: np.ndarray) -> tuple[np.ndarray, float, bool, bool, dict[str, Any]]:
        self._step_counter += 1

        # Add sampled points to training data
        self.pinn.add_anchors(self._step_counter, action)
        self.sampled_points.append(action)

        # Generating next input
        next_obs = self._construct_observation(action=action)

        # If batch is completed Then train, reward the agent and restart episode (sampling process). Finally store log
        # Else continue episode (sampling process) and no reward is provided
        if self._step_counter < self.max_anchors:
            reward = 0.0
            done = False
        else:
            self._eval_error = self.pinn.train(self.iterations, self.patience, self.print_every, self.num_epochs_adam, self.num_epochs_lbfgs)
            self._store_trial_to_log()
            reward = float(self._eval_error)
            done = True

        # Return transition tuple: next observation, reward, done, truncate=False, info=None
        return next_obs, reward, done, {}


    def render(self):
        raise NotImplementedError('Render function is not supported')

# Configuration

In [105]:
# Data
x_min, x_max = -1, 1
t_min, t_max = 0, 1
N_domain = 2_500        # Number of domain training points
Nt_bound = 20           # Number of training points for x=-1 and x=1
Nx_init = 10            # Number of training points for t=0
test_dir = 'test_data.csv'

# Model
N_layers = 3
N_neurons = 20
layers = [2] + N_layers * [N_neurons] + [1]
hidden_activation = nn.Tanh()

# Other
dtype = torch.float32
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Training
iterations = 2
patience = iterations
print_every = 1
num_epochs_adam = 3
num_epochs_lbfgs = 1

In [None]:
trials = 10

# GENERAL Config
ray.shutdown()
ray.init()
general_batch_mode = 'complete_episodes'
general_episode_steps = 16
general_train_epochs = 10
general_critic = True
general_gae = True
general_lambda = 0.95
general_gamma = 0.99
general_sgd_minibatch_size = general_episode_steps
general_train_batch_size = general_episode_steps
general_shuffle_sequences = True
general_clip_param = 0.2
general_vf_loss_coeff = 0.5
general_learning_rate = 0.0005

# Checkpoint Filepaths
fcnet_directory = 'burger/checkpoints/fcnet'
baseline_directory = 'burger/checkpoints/models/baseline'
random_resampling_direcotry = 'burger/checkpoints/models/random-resampling'
rar_directory = 'burger/checkpoints/models/rar'
ms_rar_directory = 'burger/checkpoints/models/ms-rar'
genesis_directory = 'burger/checkpoints/models/genesis'
epsilon_greedy_directory = 'burger/checkpoints/models/epsilon-greedy'
ganpoint_directory = 'burger/checkpoints/models/ganpoint'
general_directory = f'burger/checkpoints/models/general'
general_rl_directory = f'{general_directory}/agent'

# Set seeds
seed = 0
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

2024-03-20 00:29:33,637	INFO worker.py:1724 -- Started a local Ray instance.


<torch._C.Generator at 0x7c29acf8b190>

# MAIN

In [106]:
# Initialize a PINN object based on the above configuration
pinn = PINN(
    x_min, x_max,
    t_min, t_max,
    N_domain,
    Nx_init,
    Nt_bound,
    test_dir,
    general_episode_steps,
    layers,
    hidden_activation,
    "pinn_model.pth"    # general_directory
)

agent_config = PPOConfig()
agent_config.model.update({
    'use_lstm': True,
    'vf_share_layers': True,
    'max_seq_len': general_episode_steps,
    'lstm_cell_size': 128,
})
agent_config.rollouts(
    num_rollout_workers=1,
    batch_mode=general_batch_mode,
    rollout_fragment_length=general_episode_steps
)
agent_config.use_critic = general_critic
agent_config.use_gae = general_gae
agent_config.clip_param = general_clip_param
agent_config.sgd_minibatch_size = general_sgd_minibatch_size
agent_config.shuffle_sequences = general_shuffle_sequences
agent_config.train_batch_size = general_episode_steps
agent_config.vf_loss_coeff = general_vf_loss_coeff
agent_config.seed = seed
agent_config.gamma = general_gamma
agent_config.lr = general_learning_rate
agent_config.num_gpus = 1
agent_config.environment(disable_env_checking=True)
agent_config.framework('torch')

env_config = {
    'pinn': pinn,
    'max_anchors': general_episode_steps,
    'iterations': iterations,
    'patience': patience,
    'print_every': print_every,
    'num_epochs_adam': num_epochs_adam,
    'num_epochs_lbfgs': num_epochs_lbfgs,
    'device': device,
}

agent = agent_config.environment(env=GeneralEnv, env_config=env_config).build()

for trial in range(trials):
    print(f"\n\n******************** trial = {trial} ********************")
    agent.train()

[36m(pid=20995)[0m Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
[36m(pid=20995)[0m   declare_namespace(pkg)
[36m(pid=20995)[0m Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
[36m(pid=20995)[0m   declare_namespace(pkg)
[36m(pid=20995)[0m Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
[36m(pid=20995)[0m   declare_namespace(pkg)
[36m(pid=20995)[0m Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa



******************** trial = 0 ********************
[36m(RolloutWorker pid=20995)[0m PINN: 2 iterations
[36m(RolloutWorker pid=20995)[0m a. PINN: 3 epochs --> Adam
[36m(RolloutWorker pid=20995)[0m b. PINN: 1 epochs --> L-BFGS
[36m(RolloutWorker pid=20995)[0m 
[36m(RolloutWorker pid=20995)[0m Training with ADAM...


RayTaskError(AttributeError): [36mray::RolloutWorker.apply()[39m (pid=20995, ip=172.28.0.12, actor_id=81d79b49fb28c0d2420d252c01000000, repr=<ray.rllib.evaluation.rollout_worker.RolloutWorker object at 0x7978486ea8c0>)
  File "/usr/local/lib/python3.10/dist-packages/ray/rllib/utils/actor_manager.py", line 189, in apply
    raise e
  File "/usr/local/lib/python3.10/dist-packages/ray/rllib/utils/actor_manager.py", line 178, in apply
    return func(self, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/rllib/execution/rollout_ops.py", line 84, in <lambda>
    lambda w: w.sample(), local_worker=False, healthy_only=True
  File "/usr/local/lib/python3.10/dist-packages/ray/rllib/evaluation/rollout_worker.py", line 694, in sample
    batches = [self.input_reader.next()]
  File "/usr/local/lib/python3.10/dist-packages/ray/rllib/evaluation/sampler.py", line 91, in next
    batches = [self.get_data()]
  File "/usr/local/lib/python3.10/dist-packages/ray/rllib/evaluation/sampler.py", line 276, in get_data
    item = next(self._env_runner)
  File "/usr/local/lib/python3.10/dist-packages/ray/rllib/evaluation/env_runner_v2.py", line 344, in run
    outputs = self.step()
  File "/usr/local/lib/python3.10/dist-packages/ray/rllib/evaluation/env_runner_v2.py", line 400, in step
    self._base_env.send_actions(actions_to_send)
  File "/usr/local/lib/python3.10/dist-packages/ray/rllib/env/vector_env.py", line 464, in send_actions
    ) = self.vector_env.vector_step(action_vector)
  File "/usr/local/lib/python3.10/dist-packages/ray/rllib/env/vector_env.py", line 360, in vector_step
    raise e
  File "/usr/local/lib/python3.10/dist-packages/ray/rllib/env/vector_env.py", line 353, in vector_step
    results = self.envs[i].step(actions[i])
  File "/usr/local/lib/python3.10/dist-packages/gymnasium/wrappers/compatibility.py", line 111, in step
    obs, reward, done, info = self.env.step(action)
  File "<ipython-input-104-4417b20f4c5a>", line 123, in step
  File "<ipython-input-103-77a156cb2eaf>", line 152, in train
  File "<ipython-input-103-77a156cb2eaf>", line 71, in train_with_lbfgs
  File "/usr/local/lib/python3.10/dist-packages/torch/optim/optimizer.py", line 385, in wrapper
    out = func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/optim/lbfgs.py", line 309, in step
    state = self.state[self._params[0]]
AttributeError: 'LBFGS' object has no attribute '_params'

[36m(RolloutWorker pid=20995)[0m 
[36m(RolloutWorker pid=20995)[0m Training with L-BFGS...


[36m(RolloutWorker pid=20995)[0m   0%|          | 0/2 [00:00<?, ?it/s]
