# The environment is the updated one, where the negative reward is given first; also using SAC?


In [5]:

import gymnasium as gym
from gymnasium import spaces
import numpy as np
import random
import matplotlib.pyplot as plt
from stable_baselines3 import SAC
from stable_baselines3.common.vec_env import SubprocVecEnv


In [6]:

class SDEEnv_train_3(gym.Env):
    """Two-state controlled dynamical system with a fixed maximum episode length.

    State:  ``[y1, y2]``, both starting at 0.1.
    Action: ``[u1, u2]``, each component in ``[0, 10]``.

    Every step advances the state by one explicit Euler step of size ``dt``::

        dy1 = (-(u1 + 0.5 * u1**2) * y1 + 0.5 * u2 * y2 / (y1 + y2)) * dt
        dy2 = (u1 * y1 - 0.7 * u2 * y1) * dt

    The diffusion (noise) term of the second equation is currently disabled,
    so the dynamics are deterministic.

    Reward: ``5 * y2``, with an extra ``-1`` penalty while ``y2 < 0.2``.
    If either state becomes negative (or non-finite) the reward is ``-100``
    and the episode terminates.  Reaching ``episode_length`` steps truncates
    the episode.
    """

    def __init__(self):
        super().__init__()
        # State is [y1, y2]
        self.observation_space = spaces.Box(low=0, high=np.inf, shape=(2,), dtype=np.float32)

        # Actions are [u1, u2], both in the control range [0, 10]
        self.action_space = spaces.Box(low=0, high=10, shape=(2,), dtype=np.float32)

        # Time step for numerical integration
        self.dt = 0.1

        # Initial values for state variables y1 and y2 (kept in float64 internally)
        self.state = np.array([0.1, 0.1])
        self.episode_length = 100000  # Maximum episode length
        self.current_step = 0

    def _observation(self):
        """Return the current state cast to the dtype declared by ``observation_space``."""
        return self.state.astype(np.float32)

    def reset(self, seed=None, options=None):
        """Reset the state to ``[0.1, 0.1]`` and return ``(observation, info)``.

        The Gymnasium API requires ``reset`` to return a 2-tuple; ``seed`` is
        forwarded to the base class so that ``self.np_random`` is seeded.
        """
        super().reset(seed=seed)
        self.state = np.array([0.1, 0.1])
        self.current_step = 0
        return self._observation(), {}

    def step(self, action):
        """Advance the system by one Euler step.

        Returns:
            ``(observation, reward, terminated, truncated, info)`` where
            ``terminated`` is True when a state left the valid region and
            ``truncated`` is True when ``episode_length`` steps have elapsed.
        """
        u1, u2 = action
        y1, y2 = self.state
        dt = self.dt

        # Drift of the first equation
        dy1 = (-1 * (u1 + 0.5 * u1**2) * y1 + 0.5 * u2 * y2 / (y1 + y2)) * dt

        # Drift of the second equation.  Its diffusion term
        # (0.1 * sqrt(y1) * dW with dW ~ N(0, dt)) is disabled, so no random
        # increment is drawn here.
        dy2 = (u1 * y1 - 0.7 * u2 * y1) * dt

        # Update states
        y1 += dy1
        y2 += dy2
        self.state = np.array([y1, y2], dtype=np.float64)

        # Reward grows with y2; an extra penalty applies while y2 is small
        reward = float(y2 * 5)
        if y2 < 0.2:
            reward -= 1.0

        # Terminate when a state goes negative.  A non-finite state (e.g. from
        # y1 + y2 == 0 in the division above) is treated the same way, since
        # NaN comparisons would otherwise never trigger termination.
        out_of_bounds = bool(y1 < 0 or y2 < 0) or not bool(np.all(np.isfinite(self.state)))
        terminated = out_of_bounds
        if terminated:
            reward = -100.0

        self.current_step += 1

        # Hitting the step limit is a truncation, not a true terminal state,
        # so that learners may still bootstrap from the final observation.
        truncated = self.current_step >= self.episode_length

        return self._observation(), reward, terminated, truncated, {}

    def render(self):
        """Print the current state (optional visualization aid)."""
        print(f"State: y1={self.state[0]}, y2={self.state[1]}")


Training the model


In [7]:
# Single (non-vectorized) instance of the training environment.
env = SDEEnv_train_3()

#the model

In [8]:

# Soft Actor-Critic agent with an MLP policy, trained on `env`.
Sac_model_v2 = SAC(
    policy="MlpPolicy",
    env=env,
    # Optimisation
    learning_rate=3e-4,      # step size for all networks
    batch_size=512,          # minibatch size sampled from the replay buffer
    buffer_size=1_000_000,   # replay buffer capacity (transitions)
    # Update schedule: every 5 environment steps, run 3 gradient steps
    train_freq=5,
    gradient_steps=3,
    # Objective
    gamma=0.9,               # discount factor; lower than the usual 0.99, so near-term rewards weigh more
    tau=0.005,               # soft (Polyak) update coefficient for the target networks
    ent_coef="auto",         # learn the entropy coefficient automatically
)



#Code for training

In [9]:

# Phase 1: train for 1,000,000 steps, then checkpoint the model and replay buffer.
Sac_model_v2.learn(total_timesteps=1000000, log_interval=4, tb_log_name="sac", progress_bar=True)
Sac_model_v2.save("sac_mode_V3_NONSTPCHASTOCITY")
Sac_model_v2.save_replay_buffer("sac_replay_buffer_V3_NONSTPCHASTOCITY")

# Phase 2: continue training the same model for 500,000 more steps.
# reset_num_timesteps=False keeps the timestep counter running, so the
# random-action warm-up is not repeated and logging continues from phase 1.
Sac_model_v2.learn(
    total_timesteps=500000,
    log_interval=4,
    tb_log_name="sac",
    progress_bar=True,
    reset_num_timesteps=False,
)
Sac_model_v2.save("sac_mode_V4_NONSTPCHASTOCITY")
Sac_model_v2.save_replay_buffer("sac_replay_buffer_V4_NONSTPCHASTOCITY")


Output()

Output()