In [1]:
from mini_env4 import CustomEnv
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.vec_env import DummyVecEnv
from custom_callback import SaveOnStepCallback
import os
import pandas as pd
import plotly.express as px
import numpy as np

def render(history, prices, x_max=96):
    """Show one episode as two Plotly figures.

    The first figure is a horizontal bar chart built from ``history``; the
    second is a step-shaped ("hv") line chart of ``prices``. Both figures use
    the same x-axis range ``[0, x_max]``.

    Args:
        history: Row-like records with four fields each, in the order
            ``duration``, ``machine``, ``size``, ``reward``.
        prices: Data handed unchanged to ``plotly.express.line``.
        x_max: Upper bound of the x-axis of both figures. Defaults to 96,
            the value that was previously hard-coded.

    Returns:
        None. The figures are displayed via ``Figure.show()``.
    """
    x_range = [0, x_max]

    # Bars: one per history record, coloured by reward, hover shows the size.
    df = pd.DataFrame(history, columns=["duration", "machine", "size", "reward"])
    fig1 = px.bar(df, x="duration", y="machine", color="reward", orientation="h", hover_name="size")
    fig1.update_xaxes(range=x_range)

    # Prices: horizontal-then-vertical steps between consecutive points.
    fig2 = px.line(prices, line_shape="hv")
    fig2.update_xaxes(range=x_range)

    fig1.show()
    fig2.show()

In [4]:
# Random-policy baseline: play a single episode with uniformly sampled actions
# and report the accumulated reward and the number of completed jobs.
env = CustomEnv(w1=0.8, w2=3.5, w3=1)

# `step` below is unpacked into five values (Gymnasium-style API); with that
# API `reset` returns an `(observation, info)` pair, not a bare observation.
obs, _ = env.reset()

total_rew = 0
steps = 0
while True:
    action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(action)
    total_rew += reward
    steps += 1
    # An episode can end by termination or by truncation; honouring both
    # prevents stepping past the end of a truncated episode forever.
    if terminated or truncated:
        break

# NOTE(review): `env` is reused by later cells (check_env / PPO / DummyVecEnv);
# confirm that CustomEnv.close() leaves the instance usable.
env.close()
print("total_rew",total_rew)

# A history record counts as a completed job when its third field is positive.
history = np.array(info["history"])
job_counter = sum(1 for his in history if his[2] > 0)

# NOTE(review): the "/40" denominator is hard-coded; confirm it matches the
# number of jobs the environment generates.
print(f"completed jobs: {job_counter}/40")

render(info["history"], info["prices"])

total_rew 371.5764346764346
completed jobs: 40/40


In [5]:
# Output folder for this run; created up front if it does not exist yet.
log_dir = "test_4_ppo_checkpoints/rew2_1000k"
os.makedirs(name=log_dir, exist_ok=True)

# Callback that is later passed to `model.learn`.
# NOTE(review): `save_freq` is presumably the number of steps between saves
# into `save_path` -- confirm against `custom_callback.SaveOnStepCallback`.
save_callback = SaveOnStepCallback(
    save_path=log_dir,
    save_freq=100_000,
)

In [6]:
# Let Stable-Baselines3 validate the environment before it is used for training.
check_env(env)


# PPO agent using the multi-input policy; TensorBoard logs go to `log_dir`.
model = PPO(
    policy='MultiInputPolicy',
    env=env,
    tensorboard_log=log_dir,
    verbose=1,
)

# Vectorised (single-env) wrapper around the same `env` instance.
vec_env = DummyVecEnv(env_fns=[lambda: env])


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [105]:
# Run PPO training for 500k timesteps, handing `save_callback` to the
# training loop.
# NOTE(review): the evaluation cell loads "model_1000k_steps" while this call
# trains for 500k timesteps -- confirm how that checkpoint is produced.
model.learn(
    callback=save_callback,
    total_timesteps=500_000,
)

Logging to test_4_ppo_checkpoints/rew2_1000k/PPO_3
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 86.8     |
|    ep_rew_mean     | 518      |
| time/              |          |
|    fps             | 2570     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 86.3         |
|    ep_rew_mean          | 515          |
| time/                   |              |
|    fps                  | 1780         |
|    iterations           | 2            |
|    time_elapsed         | 2            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0030124427 |
|    clip_fraction        | 0.0305       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.196       |
|    ex

<stable_baselines3.ppo.ppo.PPO at 0x7f9e7f399e80>

In [142]:
# Evaluation setup: an environment constructed with `test=True`, its
# single-env vectorised wrapper, and the saved policy to evaluate.
# NOTE(review): the training env passes w1/w2/w3 explicitly while this one
# relies on CustomEnv's defaults -- confirm the two use comparable rewards.
env_test = CustomEnv(test=True)
vec_env_test = DummyVecEnv(env_fns=[lambda: env_test])

checkpoint_path = os.path.join(log_dir, "model_1000k_steps")
test_model = PPO.load(checkpoint_path)

In [143]:

# Evaluate the loaded policy deterministically for one episode on the test
# environment and then on the training environment; print and plot each run.
for env_to_test in [vec_env_test, vec_env]:
    obs = env_to_test.reset()

    total_rew = 0.0
    while True:
        action, _ = test_model.predict(obs, deterministic=True)
        # A VecEnv step returns batched values: arrays of rewards and done
        # flags plus a list of info dicts, one entry per wrapped environment.
        obs, rewards, dones, infos = env_to_test.step(action)
        # Only one environment is wrapped, so take element 0 and accumulate a
        # plain float (printed as a scalar, like the random-rollout cell).
        total_rew += float(rewards[0])
        if dones[0]:
            print(f"Done: {dones}")
            break

    env_to_test.close()
    info = infos[0]  # info dict of the single wrapped environment
    print("total_rew",total_rew)

    # A history record counts as a completed job when its third field is positive.
    history = np.array(info["history"])
    job_counter = sum(1 for his in history if his[2] > 0)

    # NOTE(review): the "/40" denominator is hard-coded; confirm it matches
    # the number of jobs the environment generates.
    print(f"completed jobs: {job_counter}/40")
    render(info["history"], info["prices"])


Done: [ True]
total_rew [307.9947]
completed jobs: 40/40


Done: [ True]
total_rew [519.5155]
completed jobs: 40/40


In [144]:
import plotly.graph_objects as go

# Animate the episode stored in `info["history"]`: frame k shows the first k
# history records as horizontal bars.
schedule_df = pd.DataFrame(
    info["history"], columns=["duration", "machine", "size", "reward"]
)


def _bars_up_to(n_rows):
    """Return a horizontal bar trace for the first ``n_rows`` history records."""
    shown = schedule_df.iloc[:n_rows]
    return go.Bar(x=shown["duration"], y=shown["machine"], orientation="h")


# One frame per record, ending with the complete history. (Previously the
# range stopped one short, so the final record never appeared, and the first
# frame was empty.)
frames = [
    go.Frame(data=[_bars_up_to(n_rows)], name=f"frame {n_rows}")
    for n_rows in range(1, len(schedule_df) + 1)
]

# Play / Pause buttons driving the frame animation.
play_button = {
    'args': [None, {'frame': {'duration': 500, 'redraw': True}, 'fromcurrent': True}],
    'label': 'Play',
    'method': 'animate'
}
pause_button = {
    'args': [[None], {'frame': {'duration': 0, 'redraw': False}, 'mode': 'immediate', 'transition': {'duration': 0}}],
    'label': 'Pause',
    'method': 'animate'
}

# Define the layout
layout = go.Layout(
    title="Animated Bar Chart",
    # Stack bars that share a machine instead of overlaying them; this is the
    # default bar mode of `plotly.express.bar`, which `render` uses.
    barmode="relative",
    xaxis=dict(range=[0, 96]),  # Set the x-axis range
    yaxis=dict(range=[-0.5, 2.5]),  # Set the y-axis range
    updatemenus=[{
        'buttons': [play_button, pause_button],
        'direction': 'left',
        'pad': {'r': 10, 't': 87},
        'showactive': False,
        'type': 'buttons',
        'x': 0.1,
        'xanchor': 'right',
        'y': 0,
        'yanchor': 'top'
    }]
)

# The initial view shows the first record only (an empty trace when the
# history is empty), so pressing Play walks through the whole episode. It no
# longer depends on a variable leaked from the frame-building loop.
_fig = go.Figure(
    data=[_bars_up_to(1)],
    layout=layout,
    frames=frames
)

_fig.show()
