In [1]:
import os

import gymnasium as gym
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from gymnasium import spaces
from xgboost import XGBClassifier

In [138]:
class CleaveEnv(gym.Env):
    """Gymnasium environment for adjusting cleave tension against two surrogate models.

    Every episode samples one row of the CSV (its ``CleaveTension``, ``Diameter`` and
    one-hot encoded ``FiberType``) as a fixed context. At each step the agent shifts
    the tension by ``action * max_tension_change`` and both surrogates are asked to
    classify the resulting setting:

    * ``cnn_surrogate`` -- a prediction of ``1`` is treated as success (shown as
      "GOOD" by ``render``), anything else as "BAD".
    * ``xgb_surrogate`` -- predictions ``0``/``1``/``2`` are shown by ``render`` as
      "LOWER"/"SAME"/"RAISE".

    The episode terminates when both surrogates predict ``1`` and is truncated after
    ``max_steps`` steps.

    Observation (``float32``): ``[current_tension, *context_row]`` where
    ``context_row`` is ``CleaveTension`` (the sampled starting value, never updated),
    ``Diameter`` and the one-hot fiber-type columns.

    Action: a single value in ``[-1, 1]``.

    Args:
        csv_path: CSV file with at least ``CleaveTension``, ``Diameter`` and
            ``FiberType`` columns.
        cnn_path: joblib file holding the first surrogate; it must expose
            ``feature_names_in_`` and ``predict``.
        xgb_path: joblib file holding the second surrogate; it must expose ``predict``.
        render_mode: ``None`` (silent) or ``"human"`` (print a line per step).

    Raises:
        ValueError: if ``render_mode`` is unsupported, or if the surrogate expects a
            feature that is not part of the context columns.
    """

    metadata = {'render_modes': ['human']}

    # Hard limits the tension is clipped to after every adjustment.
    MIN_TENSION = 50
    MAX_TENSION = 2000
    # Prefix pandas.get_dummies puts in front of each one-hot fiber-type column.
    FIBER_PREFIX = 'FiberType_'

    def __init__(self, csv_path, cnn_path, xgb_path, render_mode=None):
        super().__init__()

        if render_mode is not None and render_mode not in self.metadata['render_modes']:
            raise ValueError(
                f"Unsupported render_mode {render_mode!r}; "
                f"expected None or one of {self.metadata['render_modes']}"
            )

        # NOTE(security): joblib.load unpickles arbitrary objects and can execute code;
        # only point these paths at trusted files.
        self.cnn_surrogate = joblib.load(cnn_path)
        self.xgb_surrogate = joblib.load(xgb_path)

        raw_df = pd.read_csv(csv_path)
        self.df = pd.get_dummies(raw_df, columns=['FiberType'], dtype=np.int32)

        # Pick exactly the columns get_dummies created. Counting unique() values and
        # slicing the last N columns over-counts when FiberType contains NaN (unique()
        # reports NaN, get_dummies emits no column for it) and would then drag an
        # unrelated column into the context.
        original_columns = set(raw_df.columns)
        fiber_columns = [col for col in self.df.columns if col not in original_columns]

        self.model_features = self.cnn_surrogate.feature_names_in_

        self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float32)
        # Largest tension change a single action (|action| == 1) can request.
        self.max_tension_change = 10.0

        self.context_df = pd.concat(
            [self.df[['CleaveTension', 'Diameter']], self.df[fiber_columns]],
            axis=1,
        )

        # step() indexes the sampled context by the surrogate's feature names, so a
        # mismatch would otherwise only surface as a KeyError in the first step.
        missing = [name for name in self.model_features if name not in self.context_df.columns]
        if missing:
            raise ValueError(f"Surrogate expects features missing from the context columns: {missing}")

        # One slot for the live tension plus the whole context row.
        observations_total = 1 + len(self.context_df.columns)
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(observations_total,), dtype=np.float32
        )

        self.max_steps = 15
        self.current_step = 0
        self.current_context = None
        self.current_tension = 0
        self.render_mode = render_mode

    def reset(self, seed=None, options=None):
        """Start a new episode from a randomly sampled context row.

        Args:
            seed: forwarded to ``gym.Env.reset`` to seed ``self.np_random``.
            options: accepted for API compatibility; not used.

        Returns:
            ``(observation, info)`` where ``info`` is an empty dict.
        """
        super().reset(seed=seed)

        # Sampling with the env's own generator keeps episodes reproducible per seed.
        self.current_context = self.context_df.sample(n=1, random_state=self.np_random)
        self.current_tension = float(self.current_context['CleaveTension'].iloc[0])
        self.current_step = 0

        observation = self._create_observation()

        if self.render_mode == "human":
            print("\n---------------EPISODE RESET----------------------")
            print(f"New Scenario: Fiber = {self._get_current_fiber_type()} Start Tension = {self.current_tension:.0f}")

        return observation, {}

    def step(self, action):
        """Apply one tension adjustment and score it with both surrogates.

        Args:
            action: value (scalar or array-like) in ``[-1, 1]``; the first element is
                used and values outside the range are clipped.

        Returns:
            ``(observation, reward, terminated, truncated, info)``; ``info`` is an
            empty dict.

        Raises:
            RuntimeError: if called before ``reset``.
        """
        if self.current_context is None:
            raise RuntimeError("CleaveEnv.step() called before reset()")

        # Accept scalars, lists and arrays alike, and enforce the declared action bounds.
        action_value = np.clip(np.asarray(action).reshape(-1)[0], -1.0, 1.0)
        delta_tension = float(action_value * self.max_tension_change)

        self.current_tension = float(
            np.clip(self.current_tension + delta_tension, self.MIN_TENSION, self.MAX_TENSION)
        )
        self.current_step += 1

        # Query the surrogates with the sampled context, but at the adjusted tension.
        model_inputs = self.current_context.copy()
        model_inputs['CleaveTension'] = self.current_tension
        model_inputs = model_inputs[self.model_features]

        cnn_pred = self.cnn_surrogate.predict(model_inputs)[0]
        xgb_pred = self.xgb_surrogate.predict(model_inputs)[0]

        # The reward is based on the requested delta, not on the change that is left
        # after clipping the tension to [MIN_TENSION, MAX_TENSION].
        reward, terminated = self._compute_reward(cnn_pred, xgb_pred, delta_tension)

        truncated = self.current_step >= self.max_steps
        if truncated and not terminated:
            # Running out of steps without a success costs extra.
            reward -= 25.0

        if self.render_mode == "human":
            self.render(action, cnn_pred, xgb_pred, reward)
        observation = self._create_observation()
        return observation, reward, terminated, truncated, {}

    def _compute_reward(self, cnn_pred, xgb_pred, delta_tension):
        """Return ``(reward, terminated)`` for one step, before any truncation penalty."""
        terminated = False
        if cnn_pred == 1 and xgb_pred == 1:
            reward = 100.0
            terminated = True
        elif cnn_pred == 1:
            reward = 50.0
        else:
            reward = -3.0

        move_size = abs(delta_tension)

        # NOTE(review): moves smaller than 1.0 are penalised here, yet the same moves
        # earn the bonus below when xgb_pred == 1 -- confirm this interplay is intended.
        if move_size < 1.0:
            reward -= 2.0

        SAFE_DELTA_THRESHOLD = 5.0

        if move_size <= SAFE_DELTA_THRESHOLD:
            reward += 1.5
        else:
            reward -= 0.25 * (move_size - SAFE_DELTA_THRESHOLD)

        # Bonus when the move agrees with the xgb prediction: 0 with a decrease,
        # 2 with an increase, 1 with (almost) no change.
        follows_xgb = (
            (xgb_pred == 0 and delta_tension < 0)
            or (xgb_pred == 2 and delta_tension > 0)
            or (xgb_pred == 1 and move_size <= 1.0)
        )
        if follows_xgb:
            reward += 1.0

        return reward, terminated

    def _get_current_fiber_type(self):
        """Return the fiber name whose one-hot column is set in the current context, or "Unknown"."""
        prefix = self.FIBER_PREFIX
        for col_name in self.current_context.columns:
            if col_name.startswith(prefix) and self.current_context[col_name].iloc[0] == 1.0:
                # Strip only the leading prefix so fiber names containing it stay intact.
                return col_name[len(prefix):]
        return "Unknown"

    def _create_observation(self):
        """Build the float32 observation: live tension followed by the context row."""
        return np.concatenate([
            [self.current_tension],
            self.current_context.values[0]
        ]).astype(np.float32)

    def render(self, action, cnn_pred, xgb_pred, reward):
        """Print a one-line summary of the step that was just taken.

        Args:
            action: the action passed to ``step`` (its first element is shown).
            cnn_pred: prediction of the first surrogate; ``1`` prints as "GOOD".
            xgb_pred: prediction of the second surrogate; ``0``/``1``/``2`` print as
                "LOWER"/"SAME"/"RAISE", anything else as "????".
            reward: reward assigned to the step.
        """
        action_str = f"{np.asarray(action).reshape(-1)[0]:+.2f}"
        cnn_str = "GOOD" if cnn_pred == 1 else "BAD"
        xgb_map = {0: "LOWER", 1: "SAME", 2: "RAISE"}
        xgb_str = xgb_map.get(xgb_pred, "????")

        print(f"Step {self.current_step:2d} Tension: {self.current_tension:6.1f} (Action: {action_str:6s}) -> CNN: {cnn_str:4s}, XGB: {xgb_str:5s} | Reward: {reward:6.1f}")
    

In [139]:
# Directory holding the dataset and the pickled surrogates. It defaults to the
# original location; set the CLEAVE_RL_DIR environment variable to run the
# notebook from a different machine or folder without editing this cell.
rl_dir = os.environ.get("CLEAVE_RL_DIR", "C:\\Users\\clombardi\\RL")

csv_path = os.path.join(rl_dir, "data_updated.csv")
cnn_path = os.path.join(rl_dir, "cnn_surrogate4.pkl")
xgb_path = os.path.join(rl_dir, "xgb_surrogate4.pkl")

# Training environment.
env = CleaveEnv(csv_path=csv_path, cnn_path=cnn_path, xgb_path=xgb_path)

In [140]:
import os

from stable_baselines3 import SAC
from stable_baselines3.common.env_checker import check_env

# Validate the custom environment against the Gymnasium API before training on it.
check_env(env, warn=True)

In [141]:
# Soft Actor-Critic agent with an MLP policy trained on the CleaveEnv instance.
agent = SAC(
    "MlpPolicy",
    env,
    device="cuda",
    verbose=0,
    buffer_size=1000000,
    ent_coef='auto',        # entropy coefficient is tuned automatically
    learning_rate=1e-4,
    batch_size=256,
    tau=0.1,
    seed=42,                # fixed seed so a fresh kernel run is repeatable
)

In [142]:
# Train the agent for 5000 environment steps, showing a progress bar.
agent.learn(
    progress_bar=True,
    total_timesteps=5_000,
)

Output()

<stable_baselines3.sac.sac.SAC at 0x25deab2fac0>

In [143]:
# Save the trained agent. The target directory defaults to the original location
# and can be redirected with the CLEAVE_RL_DIR environment variable.
agent_save_path = os.path.join(
    os.environ.get("CLEAVE_RL_DIR", "C:\\Users\\clombardi\\RL"),
    "agent7",
)
agent.save(agent_save_path)

In [144]:
# Reload the saved agent from disk rather than reusing the in-memory one.
trained_agent = SAC.load(agent_save_path)

# Separate environment instance for evaluation, built from the same files.
eval_env = CleaveEnv(
    csv_path=csv_path,
    cnn_path=cnn_path,
    xgb_path=xgb_path,
)
# "human" makes CleaveEnv print every reset and step to the console.
eval_env.render_mode = "human"

In [149]:
# Roll out the trained policy deterministically for five episodes and report
# the summed reward of each one.
for episode in range(5):
    obs, info = eval_env.reset()
    episode_reward = 0
    done = False

    while not done:
        # deterministic=True asks the agent for its deterministic action.
        action, _ = trained_agent.predict(obs, deterministic=True)

        step_result = eval_env.step(action)
        obs, reward, terminated, truncated, info = step_result

        episode_reward = episode_reward + reward
        # An episode ends on success (terminated) or on the step limit (truncated).
        done = terminated or truncated

    print(f"Episode {episode + 1} finished with a total reward of: {episode_reward:.2f}")

eval_env.close()


---------------EPISODE RESET----------------------
New Scenario: Fiber = 400LA Start Tension = 1156
Step  1 Tension: 1165.9 (Action: +0.99 ) -> CNN: BAD , XGB: SAME  | Reward:   -4.2
Step  2 Tension: 1169.6 (Action: +0.36 ) -> CNN: BAD , XGB: SAME  | Reward:   -1.5
Step  3 Tension: 1164.9 (Action: -0.47 ) -> CNN: BAD , XGB: SAME  | Reward:   -1.5
Step  4 Tension: 1170.5 (Action: +0.56 ) -> CNN: BAD , XGB: SAME  | Reward:   -3.2
Step  5 Tension: 1164.2 (Action: -0.63 ) -> CNN: BAD , XGB: SAME  | Reward:   -3.3
Step  6 Tension: 1170.9 (Action: +0.66 ) -> CNN: BAD , XGB: SAME  | Reward:   -3.4
Step  7 Tension: 1164.1 (Action: -0.68 ) -> CNN: BAD , XGB: SAME  | Reward:   -3.4
Step  8 Tension: 1170.9 (Action: +0.68 ) -> CNN: BAD , XGB: SAME  | Reward:   -3.5
Step  9 Tension: 1164.1 (Action: -0.68 ) -> CNN: BAD , XGB: SAME  | Reward:   -3.5
Step 10 Tension: 1170.9 (Action: +0.68 ) -> CNN: BAD , XGB: SAME  | Reward:   -3.5
Step 11 Tension: 1164.1 (Action: -0.68 ) -> CNN: BAD , XGB: SAME  | R