
# The Adaptive Scientist: A Reinforcement Learning Proof-of-Concept



# 1. Environment Setup & CFD Baseline

In [6]:

# Install OpenFOAM using the official script.
# This may take several minutes.
print("Installing OpenFOAM and dependencies using the official script...")
!wget -q -O - https://dl.openfoam.org/add-debian-repo.sh | sudo bash > /dev/null 2>&1
!sudo apt-get -y update > /dev/null 2>&1
!sudo apt-get -y install openfoam9 > /dev/null 2>&1
print("OpenFOAM installation command finished.")


import os
import sys
import subprocess
import shutil

# Define the path to the OpenFOAM environment setup script.
openfoam_bash_path = "/opt/openfoam9/etc/bashrc"
if not os.path.exists(openfoam_bash_path):
    print("\nFATAL ERROR: OpenFOAM bashrc file not found at the expected path!")
    print("This likely means the OpenFOAM installation failed.")
    sys.exit("Halting script due to failed OpenFOAM setup.")
else:
    print(f"Using OpenFOAM bashrc at: {openfoam_bash_path}")

# --- Robust Path and Directory Management ---
content_path = '/content'
repo_path = os.path.join(content_path, 'PPO-PyTorch')

# Always start from the content directory
%cd {content_path}

# Clone PPO Repository
print("\nCloning PPO repository...")
if os.path.exists(repo_path):
    !rm -rf {repo_path}
!git clone -q https://github.com/nikhilbarhate99/PPO-PyTorch.git
print(f"Repository cloned to {repo_path}.")

# --- Create the CFD Case from Scratch ---
print("\nCreating self-contained 'cavity' CFD case...")
case_name = "cavity"
case_path = os.path.join(repo_path, case_name)
os.makedirs(os.path.join(case_path, 'system'), exist_ok=True)
os.makedirs(os.path.join(case_path, 'constant'), exist_ok=True)
os.makedirs(os.path.join(case_path, '0'), exist_ok=True)

# --- Define File Contents ---

blockMeshDict_content = """
FoamFile { format ascii; class dictionary; object blockMeshDict; }
convertToMeters 1;
vertices (
    (0 0 0) (1 0 0) (1 1 0) (0 1 0)
    (0 0 0.1) (1 0 0.1) (1 1 0.1) (0 1 0.1)
);
blocks ( hex (0 1 2 3 4 5 6 7) (20 20 1) simpleGrading (1 1 1) );
edges ();
boundary (
    movingWall { type wall; faces ((3 7 6 2)); }
    fixedWalls { type wall; faces ((0 4 7 3) (2 6 5 1) (1 5 4 0)); }
    frontAndBack { type empty; faces ((0 3 2 1) (4 5 6 7)); }
);
mergePatchPairs ();
"""

controlDict_content = """
FoamFile { format ascii; class dictionary; location "system"; object controlDict; }
application     simpleFoam;
startFrom       latestTime;
startTime       0;
stopAt          endTime;
endTime         1000;
deltaT          1;
writeControl    timeStep;
writeInterval   50;
purgeWrite      0;
writeFormat     ascii;
writePrecision  6;
writeCompression off;
timeFormat      general;
timePrecision   6;
runTimeModifiable true;
"""

fvSchemes_content = """
FoamFile { format ascii; class dictionary; location "system"; object fvSchemes; }
ddtSchemes { default steadyState; }
gradSchemes { default Gauss linear; }
divSchemes {
    default         none;
    div(phi,U)      Gauss upwind;
    div((nuEff*dev2(T(grad(U))))) Gauss linear;
}
laplacianSchemes { default Gauss linear orthogonal; }
interpolationSchemes { default linear; }
snGradSchemes { default orthogonal; }
"""

fvSolution_content = """
FoamFile { format ascii; class dictionary; location "system"; object fvSolution; }
solvers {
    p { solver PCG; preconditioner DIC; tolerance 1e-06; relTol 0.1; }
    U { solver PBiCGStab; preconditioner DILU; tolerance 1e-08; relTol 0; }
}
SIMPLE {
    nNonOrthogonalCorrectors 0;
    pRefCell 0;
    pRefValue 0;
}
relaxationFactors {
    fields { p 0.3; }
    equations { U 0.7; }
}
"""

transportProperties_content = """
FoamFile { format ascii; class dictionary; location "constant"; object transportProperties; }
transportModel Newtonian;
nu [0 2 -1 0 0 0 0] 0.01;
"""

momentumTransport_content = """
FoamFile { format ascii; class dictionary; location "constant"; object momentumTransport; }
simulationType laminar;
"""

U_content = """
FoamFile { format ascii; class volVectorField; object U; }
dimensions [0 1 -1 0 0 0 0];
internalField uniform (0 0 0);
boundaryField {
    movingWall { type fixedValue; value uniform (1 0 0); }
    fixedWalls { type noSlip; }
    frontAndBack { type empty; }
}
"""

p_content = """
FoamFile { format ascii; class volScalarField; object p; }
dimensions [0 2 -2 0 0 0 0];
internalField uniform 0;
boundaryField {
    movingWall { type zeroGradient; }
    fixedWalls { type zeroGradient; }
    frontAndBack { type empty; }
}
"""

# --- Write files ---
with open(os.path.join(case_path, "system", "blockMeshDict"), 'w') as f: f.write(blockMeshDict_content)
with open(os.path.join(case_path, "system", "controlDict"), 'w') as f: f.write(controlDict_content)
with open(os.path.join(case_path, "system", "fvSchemes"), 'w') as f: f.write(fvSchemes_content)
with open(os.path.join(case_path, "system", "fvSolution"), 'w') as f: f.write(fvSolution_content)
with open(os.path.join(case_path, "constant", "transportProperties"), 'w') as f: f.write(transportProperties_content)
with open(os.path.join(case_path, "constant", "momentumTransport"), 'w') as f: f.write(momentumTransport_content)
with open(os.path.join(case_path, "0", "U"), 'w') as f: f.write(U_content)
with open(os.path.join(case_path, "0", "p"), 'w') as f: f.write(p_content)

print("Case files created successfully.")


# --- Function to run OpenFOAM commands safely ---
def run_foam_command(command, log_filename):
    full_cmd = f"source {openfoam_bash_path} && {command}"
    try:
        with open(os.path.join(case_path, log_filename), 'w') as log_file:
            subprocess.run(full_cmd, shell=True, check=True, executable='/bin/bash', cwd=case_path, stdout=log_file, stderr=subprocess.STDOUT)
        print(f"{command} successful.")
    except subprocess.CalledProcessError:
        print(f"\nFATAL ERROR: {command} failed.")
        log_file_path = os.path.join(case_path, log_filename)
        if os.path.exists(log_file_path):
            with open(log_file_path, "r") as f:
                print(f"--- {log_filename} ---")
                print(f.read())
        sys.exit(f"Halting script due to {command} error.")

# --- Run Baseline Simulation ---
run_foam_command("blockMesh", "log.blockMesh")
run_foam_command("simpleFoam", "log.baseline")

print("Baseline simulation complete.")

# Establish Baseline
print("\n================================================================")
print("ACTION REQUIRED: Establish Baseline Performance")
print(f"From the file browser on the left, navigate to '{case_path}' and open the 'log.baseline' file.")
print("Scroll to the end and note the number of 'Time =' steps. This is the iteration count your RL agent will try to beat.")
print("================================================================\n")

# Change to the repo directory for the subsequent cells
%cd {repo_path}


Installing OpenFOAM and dependencies using the official script...
OpenFOAM installation command finished.
Using OpenFOAM bashrc at: /opt/openfoam9/etc/bashrc
/content

Cloning PPO repository...
Repository cloned to /content/PPO-PyTorch.

Creating self-contained 'cavity' CFD case...
Case files created successfully.
blockMesh successful.
simpleFoam successful.
Baseline simulation complete.

ACTION REQUIRED: Establish Baseline Performance
From the file browser on the left, navigate to '/content/PPO-PyTorch/cavity' and open the 'log.baseline' file.
Scroll to the end and note the number of 'Time =' steps. This is the iteration count your RL agent will try to beat.

/content/PPO-PyTorch


In [7]:
print(os.environ.get('HOME'))

/root


# 2. Dependencies and Custom Environment

In [8]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import re
import torch
import torch.nn as nn
from torch.distributions import MultivariateNormal
import time
import glob # Import glob to find time directories

print("\nLibraries imported. Defining the custom CFD environment...")

class CFDEnv(gym.Env):
    def __init__(self, openfoam_bash):
        super(CFDEnv, self).__init__()
        self.openfoam_bash = openfoam_bash
        self.repo_path = '/content/PPO-PyTorch'
        self.base_case_path = os.path.join(self.repo_path, "cavity")

        # Unique instance path to avoid conflicts
        self.instance_id = f"instance_{time.time()}_{np.random.randint(1000)}"
        self.case_path = os.path.join("/tmp", self.instance_id)

        # File paths
        self.fv_solution_path = os.path.join(self.case_path, 'system', 'fvSolution')
        self.control_dict_path = os.path.join(self.case_path, 'system', 'controlDict')
        self.log_path = os.path.join(self.case_path, 'log.simpleFoam')

        self.action_space = spaces.Box(low=0.1, high=1.0, shape=(2,), dtype=np.float32)
        self.observation_space = spaces.Box(low=0, high=np.inf, shape=(10,), dtype=np.float32)

        self.residual_history = np.zeros(10, dtype=np.float32)
        self.step_count = 0
        self.max_steps = 100 # Max iterations per episode

    def _get_latest_time(self):
        """Finds the latest time step directory in the case."""
        time_dirs = glob.glob(os.path.join(self.case_path, '[0-9]*'))
        if not time_dirs:
            return 0
        return max([int(os.path.basename(d)) for d in time_dirs if os.path.basename(d).isdigit()])

    def _set_control_dict(self, start_time, end_time):
        """Dynamically sets the start and end time in controlDict."""
        with open(self.control_dict_path, 'r') as f:
            content = f.read()
        content = re.sub(r'startFrom\s+\w+;', f'startFrom\tstartTime;', content)
        content = re.sub(r'startTime\s+[\d\.]+;', f'startTime\t{start_time};', content)
        content = re.sub(r'endTime\s+[\d\.]+;', f'endTime\t{end_time};', content)
        with open(self.control_dict_path, 'w') as f:
            f.write(content)

    def _run_openfoam_command(self, command):
        """Runs an OpenFOAM command."""
        full_cmd = f"source {self.openfoam_bash} && {command}"
        try:
            # Use a single log file, overwriting it for each command
            with open(self.log_path, 'w') as log_file:
                subprocess.run(
                    full_cmd, shell=True, check=True, cwd=self.case_path,
                    executable='/bin/bash', stdout=log_file, stderr=subprocess.STDOUT
                )
            return True
        except subprocess.CalledProcessError:
            return False

    def _parse_residuals(self):
        """Parses the final residuals from the log file."""
        residuals = {'Ux': None, 'p': None}
        try:
            with open(self.log_path, 'r') as f:
                log_content = f.read()
            # Find the last occurrence of 'Solving for' blocks
            u_match = re.findall(r'Solving for Ux.*Initial residual = ([\d.e-]+)', log_content)
            p_match = re.findall(r'Solving for p.*Initial residual = ([\d.e-]+)', log_content)
            if u_match: residuals['Ux'] = float(u_match[-1])
            if p_match: residuals['p'] = float(p_match[-1])
        except (FileNotFoundError, IndexError):
            return None, True
        if residuals['Ux'] is None or residuals['p'] is None or np.isnan(residuals['Ux']) or np.isnan(residuals['p']):
            return None, True # Diverged if residuals are bad
        return np.array([residuals['Ux'], residuals['p']]), False

    def _update_relaxation_factors(self, action):
        """Updates relaxation factors in fvSolution."""
        u_urf, p_urf = np.clip(action[0], 0.1, 1.0), np.clip(action[1], 0.1, 1.0)
        with open(self.fv_solution_path, 'r') as f:
            content = f.read()
        content = re.sub(r'(U\s+)[0-9\.]+', fr'\g<1>{u_urf:.3f}', content)
        content = re.sub(r'(p\s+)[0-9\.]+', fr'\g<1>{p_urf:.3f}', content)
        with open(self.fv_solution_path, 'w') as f:
            f.write(content)

    def step(self, action):
        self.step_count += 1
        self._update_relaxation_factors(action)

        # Get current time and set controlDict to run for ONE step
        current_time = self._get_latest_time()
        self._set_control_dict(start_time=current_time, end_time=current_time + 1)

        command_successful = self._run_openfoam_command('simpleFoam')
        new_residuals, diverged = (None, True) if not command_successful else self._parse_residuals()

        info, terminated, truncated = {}, False, False
        if diverged:
            reward = -200.0
            terminated = True
        else:
            # Reward is negative log of pressure residual (we want to minimize it)
            reward = -np.log10(new_residuals[1] + 1e-10)
            self.residual_history = np.roll(self.residual_history, 2)
            self.residual_history[0:2] = new_residuals

            # Check for convergence
            if new_residuals[1] < 1e-5:
                reward += 100.0 # Large bonus for converging
                terminated = True

        if self.step_count >= self.max_steps:
             truncated = True

        return self.residual_history, reward, terminated, truncated, info

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.step_count = 0
        self.residual_history.fill(0)

        # Create a unique, clean case directory for this episode
        if os.path.exists(self.case_path):
            shutil.rmtree(self.case_path)
        shutil.copytree(self.base_case_path, self.case_path)

        # We need a meshed case but don't run the solver here.
        # The first step will handle the first iteration.
        self._run_openfoam_command('blockMesh')

        # The initial state is a vector of zeros, as no simulation has run.
        info = {}
        return self.residual_history, info

    def close(self):
        if os.path.exists(self.case_path):
            shutil.rmtree(self.case_path)

print("CFDEnv class defined.")


Libraries imported. Defining the custom CFD environment...
CFDEnv class defined.


# 3. PPO Agent Implementation

In [9]:

class RolloutBuffer:
    def __init__(self):
        self.actions = []
        self.states = []
        self.logprobs = []
        self.rewards = []
        self.state_values = []
        self.is_terminals = []

    def clear(self):
        del self.actions[:]
        del self.states[:]
        del self.logprobs[:]
        del self.rewards[:]
        del self.state_values[:]
        del self.is_terminals[:]

class ActorCritic(nn.Module):
    def __init__(self, state_dim, action_dim, action_std_init):
        super(ActorCritic, self).__init__()
        self.action_dim = action_dim
        self.action_var = torch.full((action_dim,), action_std_init * action_std_init)

        # actor
        self.actor = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.Tanh(),
            nn.Linear(64, 64),
            nn.Tanh(),
            nn.Linear(64, action_dim),
            nn.Tanh() # Use Sigmoid to ensure output is between 0 and 1
        )
        # critic
        self.critic = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.Tanh(),
            nn.Linear(64, 64),
            nn.Tanh(),
            nn.Linear(64, 1)
        )

    def set_action_std(self, new_action_std):
        self.action_var = torch.full((self.action_dim,), new_action_std * new_action_std)

    def forward(self):
        raise NotImplementedError

    def act(self, state):
        action_mean = self.actor(state)
        cov_mat = torch.diag(self.action_var).unsqueeze(dim=0)
        dist = MultivariateNormal(action_mean, cov_mat)
        action = dist.sample()
        # Clip action to be between 0 and 1
        action = torch.clamp(action, 0, 1)
        action_logprob = dist.log_prob(action)
        state_val = self.critic(state)
        return action.detach(), action_logprob.detach(), state_val.detach()

    def evaluate(self, state, action):
        action_mean = self.actor(state)
        action_var = self.action_var.expand_as(action_mean)
        cov_mat = torch.diag_embed(action_var)
        dist = MultivariateNormal(action_mean, cov_mat)
        if self.action_dim == 1:
            action = action.reshape(-1, self.action_dim)
        action_logprobs = dist.log_prob(action)
        dist_entropy = dist.entropy()
        state_values = self.critic(state)
        return action_logprobs, state_values, dist_entropy

class PPO:
    def __init__(self, state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, action_std_init=0.6):
        self.action_std = action_std_init
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs
        self.buffer = RolloutBuffer()
        self.policy = ActorCritic(state_dim, action_dim, action_std_init)
        self.optimizer = torch.optim.Adam([
            {'params': self.policy.actor.parameters(), 'lr': lr_actor},
            {'params': self.policy.critic.parameters(), 'lr': lr_critic}
        ])
        self.policy_old = ActorCritic(state_dim, action_dim, action_std_init)
        self.policy_old.load_state_dict(self.policy.state_dict())
        self.MseLoss = nn.MSELoss()

    def set_action_std(self, new_action_std):
        self.action_std = new_action_std
        self.policy.set_action_std(new_action_std)
        self.policy_old.set_action_std(new_action_std)

    def decay_action_std(self, action_std_decay_rate, min_action_std):
        self.action_std = self.action_std - action_std_decay_rate
        self.action_std = round(self.action_std, 4)
        if (self.action_std <= min_action_std):
            self.action_std = min_action_std
        self.set_action_std(self.action_std)



    def select_action(self, state):
        with torch.no_grad():
            state = torch.FloatTensor(state)
            action, action_logprob, state_val = self.policy_old.act(state)

        # The action from the policy is now in [-1, 1].
        # First, scale it to [0, 1]: (action + 1) / 2
        # Then, scale it to [0.1, 1.0] for the environment: ... * 0.9 + 0.1
        env_action = ((action + 1) / 2) * 0.9 + 0.1

        self.buffer.states.append(state)
        self.buffer.actions.append(action) # Store original action [-1, 1]
        self.buffer.logprobs.append(action_logprob)
        self.buffer.state_values.append(state_val)

        return env_action.numpy().flatten()



    def update(self):
        # Monte Carlo estimate of returns
        rewards = []
        discounted_reward = 0
        for reward, is_terminal in zip(reversed(self.buffer.rewards), reversed(self.buffer.is_terminals)):
            if is_terminal:
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            rewards.insert(0, discounted_reward)

        # Normalizing the rewards
        rewards = torch.tensor(rewards, dtype=torch.float32)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-7)

        # convert list to tensor
        old_states = torch.squeeze(torch.stack(self.buffer.states, dim=0)).detach()
        old_actions = torch.squeeze(torch.stack(self.buffer.actions, dim=0)).detach()
        old_logprobs = torch.squeeze(torch.stack(self.buffer.logprobs, dim=0)).detach()
        old_state_values = torch.squeeze(torch.stack(self.buffer.state_values, dim=0)).detach()

        # calculate advantages
        advantages = rewards.detach() - old_state_values.detach()

        # Optimize policy for K epochs
        for _ in range(self.K_epochs):
            # Evaluating old actions and values
            logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions)
            state_values = torch.squeeze(state_values)
            ratios = torch.exp(logprobs - old_logprobs.detach())

            # Finding the Surrogate Loss
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages

            # final loss of clipped objective PPO
            loss = -torch.min(surr1, surr2) + 0.5 * self.MseLoss(state_values, rewards) - 0.01 * dist_entropy
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        # Copy new weights into old policy
        self.policy_old.load_state_dict(self.policy.state_dict())
        self.buffer.clear()

    def save(self, checkpoint_path):
        torch.save(self.policy_old.state_dict(), checkpoint_path)

    def load(self, checkpoint_path):
        self.policy_old.load_state_dict(torch.load(checkpoint_path, map_location=lambda storage, loc: storage))
        self.policy.load_state_dict(torch.load(checkpoint_path, map_location=lambda storage, loc: storage))

print("PPO agent classes defined.")



PPO agent classes defined.


# CELL 4: Training the Agent

In [10]:

# Instantiate the custom environment and the PPO agent, then run the
# main training loop with parameters adjusted for the slow CFD environment.


print("\nStarting training setup...")

################## Hyperparameters ##################
# Using a very small training budget for this proof-of-concept.
# A full run might take over an hour.
max_training_timesteps = 5000   # REDUCED for quick POC
update_timestep = 100           # update policy every n timesteps
K_epochs = 20                   # update policy for K epochs
eps_clip = 0.2                  # clip parameter for PPO
gamma = 0.99                    # discount factor
lr_actor = 0.0001               # learning rate for actor
lr_critic = 0.001               # learning rate for critic
action_std = 0.1                # starting std for action distribution
#####################################################

# Create environment, passing the verified OpenFOAM path
env = CFDEnv(openfoam_bash=openfoam_bash_path)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

# Create PPO agent
ppo_agent = PPO(state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, action_std)

print(f"State Dimensions: {state_dim}")
print(f"Action Dimensions: {action_dim}")
print("PPO Agent created. Starting training loop...")
print("This will take a while, please be patient...")

# Training loop
start_time = time.time()
time_step = 0
i_episode = 0

while time_step <= max_training_timesteps:
    state, _ = env.reset()
    current_ep_reward = 0

    for t in range(1, env.max_steps + 2):
        # Select action with policy
        action = ppo_agent.select_action(state)
        state, reward, terminated, truncated, _ = env.step(action)

        # Saving reward and is_terminals
        ppo_agent.buffer.rewards.append(reward)
        ppo_agent.buffer.is_terminals.append(terminated or truncated)

        time_step += 1
        current_ep_reward += reward

        # update PPO agent
        if time_step % update_timestep == 0:
            ppo_agent.update()

        if terminated or truncated:
            break

    i_episode += 1
    # Only print episode summary every 10 episodes to reduce log spam
    if i_episode % 10 == 0:
        print(f"Episode: {i_episode}, Timesteps: {time_step}, Reward: {current_ep_reward:.2f}")

env.close()
print(f"\nTraining finished in {time.time() - start_time:.2f} seconds.")
ppo_agent.save("PPO_CFDEnv.pth")
print("Model saved as PPO_CFDEnv.pth")



Starting training setup...
State Dimensions: 10
Action Dimensions: 2
PPO Agent created. Starting training loop...
This will take a while, please be patient...
Episode: 10, Timesteps: 1000, Reward: 143.97
Episode: 20, Timesteps: 2000, Reward: 150.96
Episode: 30, Timesteps: 3000, Reward: 162.85
Episode: 40, Timesteps: 4000, Reward: 148.86
Episode: 50, Timesteps: 5000, Reward: 151.55

Training finished in 1554.48 seconds.
Model saved as PPO_CFDEnv.pth


# CELL 5: Evaluation

In [11]:

# Load the trained agent and run it for one full episode to observe its
# learned policy and compare its performance against the baseline.


print("\nStarting evaluation...")

# Create a new environment for evaluation
eval_env = CFDEnv(openfoam_bash=openfoam_bash_path)
# Load the trained agent
eval_agent = PPO(state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, action_std)
try:
    eval_agent.load("PPO_CFDEnv.pth")
    print("Loaded trained model for evaluation.")
except FileNotFoundError:
    print("Could not find trained model 'PPO_CFDEnv.pth'. Evaluation will use an untrained agent.")


state, _ = eval_env.reset()
done = False
total_reward = 0
eval_step = 0

print("\n| Step |      Action (U, p)      |  Reward  | Converged/Done |")
print("|------|-------------------------|----------|----------------|")

while not done and eval_step < eval_env.max_steps:
    eval_step += 1
    # Use the deterministic action for evaluation by taking the mean
    state_tensor = torch.FloatTensor(state)
    with torch.no_grad():
        # Action is [0,1], scale it to [0.1, 1.0] for env
        action_raw = eval_agent.policy_old.actor(state_tensor).numpy()
        action = action_raw * 0.9 + 0.1

    state, reward, terminated, truncated, _ = eval_env.step(action)
    total_reward += reward
    done = terminated or truncated

    action_str = f"({action[0]:.3f}, {action[1]:.3f})"
    print(f"| {eval_step:<4} | {action_str:<23} | {reward:<8.2f} | {str(done):<14} |")

print("\n====================== Evaluation Summary ======================")
print(f"The agent took {eval_step} steps to solve the environment.")
print(f"Total reward accumulated: {total_reward:.2f}")
print("\nCompare the number of steps above with the baseline number you recorded from 'log.baseline'.")
print("Even a small improvement demonstrates a successful proof-of-concept!")
print("================================================================")

eval_env.close()




Starting evaluation...
Loaded trained model for evaluation.

| Step |      Action (U, p)      |  Reward  | Converged/Done |
|------|-------------------------|----------|----------------|
| 1    | (0.193, 0.270)          | 0.42     | False          |
| 2    | (0.182, 0.276)          | 0.40     | False          |
| 3    | (0.169, 0.276)          | 0.37     | False          |
| 4    | (0.132, 0.255)          | 0.30     | False          |
| 5    | (0.101, 0.253)          | 0.23     | False          |
| 6    | (0.067, 0.251)          | 0.23     | False          |
| 7    | (0.054, 0.246)          | 0.23     | False          |
| 8    | (0.040, 0.241)          | 0.23     | False          |
| 9    | (0.030, 0.240)          | 0.23     | False          |
| 10   | (0.025, 0.239)          | 0.23     | False          |
| 11   | (0.025, 0.239)          | 0.23     | False          |
| 12   | (0.025, 0.239)          | 0.23     | False          |
| 13   | (0.025, 0.239)          | 0.23     | False     