In [6]:

import gymnasium as gym
from gymnasium import spaces
import numpy as np
import random
import matplotlib.pyplot as plt
from sb3_contrib import  RecurrentPPO
from stable_baselines3.common.vec_env import SubprocVecEnv


In [2]:
# %pip install -q sb3-contrib==2.3.0  # one-time setup; %pip installs into the running kernel's environment

Collecting sb3-contrib
  Obtaining dependency information for sb3-contrib from https://files.pythonhosted.org/packages/40/60/58163d23588509c8c5415f5ad15164dee08e390f2e96802b8d8828f3904e/sb3_contrib-2.3.0-py3-none-any.whl.metadata
  Downloading sb3_contrib-2.3.0-py3-none-any.whl.metadata (3.6 kB)
Collecting stable-baselines3<3.0,>=2.3.0 (from sb3-contrib)
  Obtaining dependency information for stable-baselines3<3.0,>=2.3.0 from https://files.pythonhosted.org/packages/06/6a/c3098a78a63b5a48e18c11d80b8c532f8b7785d6abb1329cfe3034572161/stable_baselines3-2.3.2-py3-none-any.whl.metadata
  Downloading stable_baselines3-2.3.2-py3-none-any.whl.metadata (5.1 kB)
Downloading sb3_contrib-2.3.0-py3-none-any.whl (80 kB)
   ---------------------------------------- 0.0/80.3 kB ? eta -:--:--
   ----- ---------------------------------- 10.2/80.3 kB ? eta -:--:--
   -------------------- ------------------- 41.0/80.3 kB 653.6 kB/s eta 0:00:01
   ---------------------------------------- 80.3/80.3 kB 898.3 


[notice] A new release of pip is available: 23.2.1 -> 24.2
[notice] To update, run: C:\Users\defaultuser0.LAPTOP-LRB3T941\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [7]:

class SDEEnv_train_3(gym.Env):
    """Two-state controlled system with a fixed maximum episode length.

    The state ``[y1, y2]`` is advanced with an explicit Euler step of size
    ``dt`` under the action ``[u1, u2]``::

        dy1 = (-(u1 + 0.5*u1**2) * y1 + 0.5*u2*y2 / (y1 + y2)) * dt
        dy2 = (u1*y1 - 0.7*u2*y1) * dt + noise_scale * sqrt(y1) * dW

    with ``dW ~ Normal(0, sqrt(dt))``. The per-step reward is the updated
    ``y2``; if either state becomes negative the episode ends with a reward
    of -100. The episode also ends after ``episode_length`` steps.

    Args:
        noise_scale: Multiplier of the stochastic term in ``dy2``. The default
            ``0.0`` disables the noise entirely (fully deterministic dynamics)
            and no random numbers are drawn.
    """

    def __init__(self, noise_scale=0.0):
        super().__init__()
        # Observation is [y1, y2].
        self.observation_space = spaces.Box(low=0, high=np.inf, shape=(2,), dtype=np.float32)

        # Action is [u1, u2], each component limited to [0, 10].
        self.action_space = spaces.Box(low=0, high=10, shape=(2,), dtype=np.float32)

        # Time step for the Euler integration.
        self.dt = 0.1

        # Strength of the optional noise term in dy2 (0.0 = deterministic).
        self.noise_scale = noise_scale

        # Internal state is kept in float64; observations are cast on the way out.
        self.state = np.array([0.1, 0.1])
        self.episode_length = 100000  # Maximum number of steps per episode
        self.current_step = 0

    def _get_obs(self):
        """Return a copy of the state cast to the observation space dtype."""
        return self.state.astype(self.observation_space.dtype)

    def reset(self, seed=None, options=None):
        """Restore the initial state ``[0.1, 0.1]`` and the step counter.

        Args:
            seed: Optional seed forwarded to ``gym.Env.reset`` so that
                ``self.np_random`` is (re)seeded.
            options: Accepted for API compatibility; not used.

        Returns:
            ``(observation, info)`` as required by the Gymnasium API, where
            ``info`` is an empty dict.
        """
        # Seeds self.np_random, which the optional noise term in step() uses.
        super().reset(seed=seed)
        self.state = np.array([0.1, 0.1])
        self.current_step = 0
        return self._get_obs(), {}

    def step(self, action):
        """Advance the system by one Euler step.

        Args:
            action: Sequence ``[u1, u2]`` of control inputs.

        Returns:
            ``(observation, reward, terminated, truncated, info)``.
            ``truncated`` is always ``False`` and ``info`` is always empty.
        """
        u1, u2 = action
        y1, y2 = self.state
        dt = self.dt

        # Drift of the first state.
        dy1 = (-1 * (u1 + 0.5 * u1**2) * y1 + 0.5 * u2 * y2 / (y1 + y2)) * dt

        # Drift of the second state, plus the optional Wiener increment.
        dy2 = (u1 * y1 - 0.7 * u2 * y1) * dt
        if self.noise_scale:
            # Use the env's own seeded generator rather than the global np.random,
            # so reset(seed=...) makes rollouts reproducible.
            dW = self.np_random.normal(0.0, np.sqrt(dt))
            # max() keeps sqrt real if step() is called on an already-negative y1.
            dy2 += self.noise_scale * np.sqrt(max(y1, 0.0)) * dW

        y1 += dy1
        y2 += dy2
        self.state = np.array([y1, y2], dtype=np.float64)

        # Reward is the updated y2 unless the state left the non-negative region.
        reward = y2
        done = False
        if y1 < 0 or y2 < 0:
            reward = -100
            done = True

        self.current_step += 1

        # NOTE(review): the step limit is reported as termination (not via the
        # `truncated` flag) so that loops checking only `terminated` still end.
        if self.current_step >= self.episode_length:
            done = True

        return self._get_obs(), float(reward), done, False, {}

    def render(self):
        """Print the current state (optional, for quick inspection)."""
        print(f"State: y1={self.state[0]}, y2={self.state[1]}")


In [9]:
# Build the environment and a recurrent (LSTM-policy) PPO agent on top of it.
enviorment = SDEEnv_train_3()
model_RPPO = RecurrentPPO(
    "MlpLstmPolicy",
    enviorment,
    verbose=0,
    # Relative to the notebook's working directory; a leading "/" would put the
    # logs at the filesystem (or drive) root.
    tensorboard_log="./ppo_recurrent_tensorboard/",
)

In [None]:
# Phase 1: train for 500k steps and checkpoint.
model_RPPO.learn(total_timesteps=500000, progress_bar=True)
model_RPPO.save("RecurrentPPO_V1.zip")

# Phase 2: continue training the same model for another 500k steps.
# reset_num_timesteps=False keeps the timestep counter running, so the logs
# form one continuous run instead of restarting from 0.
model_RPPO.learn(total_timesteps=500000, progress_bar=True, reset_num_timesteps=False)
model_RPPO.save("RecurrentPPO_V2.zip")


Output()