# Production System Assignment

## Production System with a Pre-Shop Pool (PSP) and RL Agent Optimization

In [1]:
from __future__ import annotations

import random
import statistics
from collections.abc import Callable, Sequence

import numpy as np
import simpy
from scipy import stats
from simpy.events import ProcessGenerator
from lib.server import Server
from lib.job import Job
from matplotlib import pyplot as plt
from lib.config import SEEDS
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import DQN
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import VecNormalize
from stable_baselines3.common.running_mean_std import RunningMeanStd
import os

In [2]:
# Number of machines (work centres) in the simulated production system.
NUM_MACHINES = 6

In [5]:
class SystemEnv(gym.Env):
    """Gymnasium environment for order release in a simulated job shop.

    Jobs arrive into a pre-shop pool (PSP). Once per ``release_interval`` of
    simulated time the agent decides whether to release the job at the head
    of the pool to the shop floor; the SimPy simulation is then advanced by
    one interval and the agent is rewarded from the throughput, tardiness and
    time-averaged WIP observed during that interval.

    Observation (``4 + num_machines`` float32 values):
        0. 1.0 if the pool is non-empty, else 0.0
        1. head job's process time, added once per routing step
        2. head job's due date minus the current simulation time
        3. number of released, not yet completed jobs (WIP)
        4+. one workload figure per machine: remaining time of the job being
            processed plus the process times of the jobs waiting in its queue

    Actions: ``0`` = do not release, ``1`` = release the head job of the pool.
    """

    def __init__(
        self,
        inter_arrival_time_distribution: Callable[[], float],
        processing_time_per_family_distribution: list[Callable[[], float]],
        families_distribution: Callable[[], float],
        due_dates_distribution: Callable[[], float],
        routing_distribution: dict[int, list[Callable[[], float]]],
        routing_prob: dict[int, list[float]],
        release_interval: float = 5.0,
        episode_duration: float = 60 * 24 * 7,
        reward_weights: dict[str, float] | None = None,
        num_machines: int = NUM_MACHINES,
    ) -> None:
        """Store the stochastic model of the shop and define the spaces.

        Args:
            inter_arrival_time_distribution: Sampler for the time between two
                consecutive job arrivals.
            processing_time_per_family_distribution: One process-time sampler
                per job family, indexed by ``family - 1``.
            families_distribution: Sampler whose value is compared against
                the 0.1 / 0.62 thresholds to pick family 1, 2 or 3.
            due_dates_distribution: Sampler for the offset added to the
                arrival time to obtain the due date.
            routing_distribution: Per family, one sampler per machine; the
                machine is part of the routing when the sample is
                ``<=`` the matching entry of ``routing_prob``.
            routing_prob: Per family, one inclusion threshold per machine.
            release_interval: Simulated time advanced by every ``step``.
            episode_duration: Simulated time after which the episode ends.
            reward_weights: Weights for the ``'throughput'``,
                ``'wip_penalty'`` and ``'tardiness_penalty'`` reward terms.
                Defaults to ``1.0``, ``-1.0`` and ``-1.0``.
            num_machines: Number of machines created on ``reset``; also sets
                the length of the per-machine part of the observation.
        """
        super().__init__()
        self.env: simpy.Environment | None = None
        self.inter_arrival_time_distribution = inter_arrival_time_distribution
        self.processing_time_per_family_distribution = processing_time_per_family_distribution
        self.families_distribution = families_distribution
        self.due_dates_distribution = due_dates_distribution
        self.routing_distribution = routing_distribution
        self.routing_prob = routing_prob
        self.release_interval = release_interval
        self.episode_duration = episode_duration
        self.num_machines = num_machines

        # Running statistics used to standardise the three reward terms.
        # They are deliberately NOT cleared in reset(), so the baseline is
        # shared by all episodes played on this instance.
        self.running_throughput = RunningMeanStd()
        self.running_tardiness = RunningMeanStd()
        self.running_wip = RunningMeanStd()

        self.reward_weights = reward_weights if reward_weights is not None else {
            'throughput': 1.0,
            'wip_penalty': -1.0,
            'tardiness_penalty': -1.0
        }

        # Bounds follow the observation layout documented on the class:
        # four job/WIP features followed by one workload per machine.
        low_bounds = np.array(
            [0.0, 0.0, -5000.0, 0.0] + [0.0] * num_machines,
            dtype=np.float32,
        )
        high_bounds = np.array(
            [1.0, 500.0, 5000.0, 1000.0] + [10000.0] * num_machines,
            dtype=np.float32,
        )
        self.observation_space = spaces.Box(low=low_bounds, high=high_bounds, dtype=np.float32)

        # action space: 0 = Don't Release, 1 = Release
        self.action_space = spaces.Discrete(2)

        self.machines: list[Server] = []
        self.jobs: list[Job] = []
        self.pre_shop_pool: list[Job] = []
        self._reset_episode_state()

    def _reset_episode_state(self) -> None:
        """Clear the job lists, counters and WIP-area accumulators."""
        self.jobs = []
        self.pre_shop_pool = []

        self.idx_counter = 0

        self.jobs_completed_this_episode = 0
        self.total_tardiness_this_episode = 0.0
        self.current_wip = 0

        # Time-integrals of WIP: one over the whole episode, one over the
        # current reward interval, each with the time it was last brought
        # up to date.
        self.last_wip_update_time = 0.0
        self.cumulative_wip_area = 0.0
        self.current_interval_wip_area = 0.0
        self.last_interval_wip_update_time = 0.0

    def _set_seed(self, seed: int | None = None):
        """Seed the global ``random`` and ``numpy.random`` generators.

        Does nothing when ``seed`` is ``None``.
        """
        if seed is not None:
            random.seed(seed)
            np.random.seed(seed)

    def reset(self, seed: int | None = None, options: dict | None = None) -> tuple[np.ndarray, dict]:
        """Start a new episode with a fresh simulation and empty shop.

        Args:
            seed: Optional seed, forwarded to ``gym.Env.reset`` and to
                ``_set_seed``.
            options: Accepted for API compatibility; not used.

        Returns:
            The initial observation and the info dictionary.
        """
        super().reset(seed=seed)
        self._set_seed(seed)

        self.env = simpy.Environment()
        self.machines = [
            Server(self.env, 1, f"WC{i+1}") for i in range(self.num_machines)
        ]
        self._reset_episode_state()

        self.env.process(self._run_job_arrivals())

        observation = self._get_obs()
        info = self._get_info()

        return observation, info

    def step(self, action: int) -> tuple[np.ndarray, float, bool, bool, dict]:
        """Apply the release decision and simulate one release interval.

        Args:
            action: ``1`` releases the head job of the pre-shop pool (if
                any); ``0`` releases nothing.

        Returns:
            ``(observation, reward, terminated, truncated, info)``. On the
            final step ``info`` additionally carries
            ``"average_wip_for_episode"`` and
            ``"total_simulation_time_minutes"``.
        """
        assert self.action_space.contains(action), f"Non valid action: {action}"

        jobs_completed_before_step = self.jobs_completed_this_episode
        tardiness_before_step = self.total_tardiness_this_episode

        # Bring the WIP integrals up to date, then open a new reward interval.
        self._accumulate_wip_area()
        self.current_interval_wip_area = 0.0

        if action == 1:
            self._release_job_from_psp()

        self.env.run(until=self.env.now + self.release_interval)

        # Account for the stretch between the last release/completion event
        # and the end of the interval.
        self._accumulate_wip_area()

        avg_wip_for_reward_interval = (
            self.current_interval_wip_area / self.release_interval
            if self.release_interval > 0
            else 0.0
        )

        jobs_completed_in_interval = self.jobs_completed_this_episode - jobs_completed_before_step
        tardiness_in_interval = self.total_tardiness_this_episode - tardiness_before_step

        # The running statistics are updated first, so the current sample is
        # part of the baseline it is standardised against.
        self.running_throughput.update(np.array([[jobs_completed_in_interval]]))
        self.running_tardiness.update(np.array([[tardiness_in_interval]]))
        self.running_wip.update(np.array([[avg_wip_for_reward_interval]]))

        reward = self._calculate_reward(
            jobs_completed_in_interval,
            tardiness_in_interval,
            avg_wip_for_reward_interval
        )

        # NOTE(review): the episode ends on a time limit but is reported as
        # `terminated`, not `truncated`. Kept as is because switching changes
        # how learners bootstrap from the last state; confirm the intent.
        terminated = self.env.now >= self.episode_duration
        truncated = False

        observation = self._get_obs()
        info = self._get_info()

        if terminated:
            # The cumulative area has just been integrated up to env.now, so
            # env.now is the matching horizon for the time average. This also
            # holds when episode_duration is not a multiple of
            # release_interval and the last step overshoots it.
            simulated_time = self.env.now
            average_wip_for_episode = (
                self.cumulative_wip_area / simulated_time if simulated_time > 0 else 0
            )
            info["average_wip_for_episode"] = average_wip_for_episode
            info["total_simulation_time_minutes"] = self.env.now

        return observation, reward, terminated, truncated, info

    def _get_obs(self) -> np.ndarray:
        """Build the observation vector described in the class docstring."""
        presence_job_top = 1.0 if self.pre_shop_pool else 0.0

        job_top_proc_time = 0.0
        job_top_urgency = 0.0
        if self.pre_shop_pool:
            job_top = self.pre_shop_pool[0]
            # The job's single process_time is counted once per routing step.
            for _ in job_top.routing:
                job_top_proc_time += job_top.process_time
            job_top_urgency = job_top.due_date - self.env.now

        machine_workloads = []
        for machine in self.machines:
            current_machine_workload = 0.0

            running_job = machine.job_on_machine
            if running_job is not None:
                time_spent_on_op = self.env.now - machine.job_start_time
                remaining_op_time = running_job.process_time - time_spent_on_op
                current_machine_workload += max(0, remaining_op_time)

            for request in machine.queue:
                current_machine_workload += request.associated_job.process_time

            machine_workloads.append(current_machine_workload)

        return np.array([
            presence_job_top,
            job_top_proc_time,
            job_top_urgency,
            float(self.current_wip),
            *machine_workloads
        ], dtype=np.float32)

    def _run_job_arrivals(self) -> ProcessGenerator:
        """SimPy process that keeps generating jobs into the pre-shop pool.

        All attributes of the next job are sampled before waiting for its
        inter-arrival time; the job is created and pooled once that time has
        elapsed.
        """
        while True:
            timeout_inter_arrival = self.inter_arrival_time_distribution()

            weight = self.families_distribution()
            if weight <= 0.1:
                family = 1
            elif weight <= 0.62:
                family = 2
            else:
                family = 3
            processing_time = self.processing_time_per_family_distribution[family-1]()
            due_date_offset = self.due_dates_distribution()

            family_routing_distr = self.routing_distribution[family]
            family_routing_prob = self.routing_prob[family]

            # Samplers are called in machine order, one draw per machine.
            job_routing = [
                machine
                for machine, sampler, threshold in zip(
                    self.machines, family_routing_distr, family_routing_prob
                )
                if sampler() <= threshold
            ]

            yield self.env.timeout(timeout_inter_arrival)

            job = Job(
                env=self.env,
                routing=job_routing,
                arrival_time=self.env.now,
                process_time=processing_time,
                due_date=(self.env.now + due_date_offset),
                idx=self.idx_counter,
                family=f"F{family}",
                completion_callback=self._job_completed_callback
            )

            self.idx_counter += 1
            self.jobs.append(job)
            self.pre_shop_pool.append(job)

    def _accumulate_wip_area(self) -> None:
        """Integrate the current WIP level up to the present simulation time.

        Updates both the episode-wide and the per-interval WIP areas and
        moves their "last updated" timestamps to ``env.now``. It must be
        called before ``current_wip`` changes so that the elapsed stretch is
        weighted with the WIP level that was actually in force.
        """
        now = self.env.now

        time_diff_total = now - self.last_wip_update_time
        if time_diff_total > 0:
            self.cumulative_wip_area += self.current_wip * time_diff_total
        self.last_wip_update_time = now

        time_diff_interval = now - self.last_interval_wip_update_time
        if time_diff_interval > 0:
            self.current_interval_wip_area += self.current_wip * time_diff_interval
        self.last_interval_wip_update_time = now

    def _release_job_from_psp(self):
        """Release the oldest pooled job to the shop floor, if there is one."""
        if not self.pre_shop_pool:
            return

        job_to_release = self.pre_shop_pool.pop(0)

        self._accumulate_wip_area()

        job_to_release.in_system = True
        self.current_wip += 1
        self.env.process(job_to_release.main())

    def _job_completed_callback(self, job: Job):
        """Record a finished job; passed to every ``Job`` as its callback."""
        self._accumulate_wip_area()

        job.in_system = False
        self.current_wip -= 1
        self.jobs_completed_this_episode += 1
        self.total_tardiness_this_episode += job.tardiness

    @staticmethod
    def _standardise(value: float, running: RunningMeanStd) -> float:
        """Return ``value`` as a z-score against ``running``, clipped to +/-10."""
        z_score = (value - running.mean[0]) / (np.sqrt(running.var[0]) + 1e-8)
        return np.clip(z_score, -10, 10)

    def _calculate_reward(self, jobs_completed_in_interval: int, tardiness_in_interval: float, avg_wip_for_reward_interval: float) -> float:
        """Combine the standardised interval metrics into a scalar reward.

        Args:
            jobs_completed_in_interval: Jobs finished during the interval.
            tardiness_in_interval: Tardiness added during the interval.
            avg_wip_for_reward_interval: Time-averaged WIP of the interval.

        Returns:
            The weighted sum of the three clipped z-scores, using
            ``self.reward_weights``.
        """
        norm_throughput = self._standardise(jobs_completed_in_interval, self.running_throughput)
        norm_tardiness = self._standardise(tardiness_in_interval, self.running_tardiness)
        norm_wip = self._standardise(avg_wip_for_reward_interval, self.running_wip)

        reward = (self.reward_weights['throughput'] * norm_throughput) + \
                 (self.reward_weights['tardiness_penalty'] * norm_tardiness) + \
                 (self.reward_weights['wip_penalty'] * norm_wip)
        return float(reward)

    def _get_info(self) -> dict:
        """Return the diagnostic counters reported alongside observations."""
        return {
            "current_time": self.env.now,
            "jobs_in_psp": len(self.pre_shop_pool),
            "jobs_in_system_wip": self.current_wip,
            "jobs_completed_episode": self.jobs_completed_this_episode,
            "total_tardiness_episode": self.total_tardiness_this_episode
        }


In [6]:
def make_env():
    """Create a ``SystemEnv`` with the arrival, processing and routing model.

    Every sampler draws from the global ``random`` module. Routing is decided
    per machine by comparing a uniform draw with the family's threshold in
    ``routing_prob`` (six thresholds per family).
    """

    def uniform_01() -> float:
        # Looked up at call time, exactly like a `lambda: random.random()`.
        return random.random()

    family_ids = (1, 2, 3)
    machines_per_family = 6

    # Gamma(shape, scale) process-time samplers for families 1, 2 and 3.
    processing_samplers = [
        lambda: random.gammavariate(2,2),
        lambda: random.gammavariate(4,0.5),
        lambda: random.gammavariate(6,1/6)
    ]

    # One uniform sampler per machine for every family.
    routing_samplers = {
        family_id: [uniform_01 for _ in range(machines_per_family)]
        for family_id in family_ids
    }

    routing_thresholds = {
        1: [1,1,0,1,1,1],
        2: [0.8, 0.8, 1, 0.8, 0.8, 0.75],
        3: [0,0,1,0,0,0.75]
    }

    weights = {
        'throughput': 1.0,
        'wip_penalty': -0.5,
        'tardiness_penalty': -0.5
    }

    return SystemEnv(
        inter_arrival_time_distribution=lambda: random.expovariate(lambd=0.65),
        processing_time_per_family_distribution=processing_samplers,
        families_distribution=uniform_01,
        routing_distribution=routing_samplers,
        routing_prob=routing_thresholds,
        due_dates_distribution=lambda: random.uniform(30,50),
        reward_weights=weights
    )

# --- Training setup -------------------------------------------------------
num_envs = 4
seed = 42

# Parallel copies of the environment, wrapped so that observations and
# rewards are normalised (and clipped to +/-10) before reaching the agent.
vec_env = VecNormalize(
    make_vec_env(make_env, n_envs=num_envs, seed=seed),
    norm_obs=True,
    norm_reward=True,
    clip_obs=10,
    clip_reward=10,
    gamma=0.99
)

# DQN hyperparameters, kept together so they are easy to review and tweak.
dqn_hyperparameters = {
    "verbose": 1,
    "learning_rate": 1e-4,
    "buffer_size": 50000,
    "learning_starts": 1000,
    "batch_size": 32,
    "tau": 1.0,
    "gamma": 0.99,
    "train_freq": (1, "step"),
    "gradient_steps": 1,
    "exploration_fraction": 0.1,
    "exploration_initial_eps": 1.0,
    "exploration_final_eps": 0.05,
    "target_update_interval": 1000,
    "seed": seed,
}

model = DQN("MlpPolicy", vec_env, **dqn_hyperparameters)

# --- Training -------------------------------------------------------------
print("Starting DQN training...")
total_timesteps = 1000000
model.learn(total_timesteps=total_timesteps, log_interval=10)
print("DQN training finished")

# Persist both the agent and the normalisation statistics: the evaluation
# phase needs the same statistics to feed the policy comparable inputs.
model.save("production_system_agent_dqn_final")
vec_env.save("vec_normalize_stats_dqn_final.pkl")

Using cpu device
Starting DQN training...
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.02e+03 |
|    ep_rew_mean      | -234     |
|    exploration_rate | 0.77     |
| time/               |          |
|    episodes         | 10       |
|    fps              | 1030     |
|    time_elapsed     | 23       |
|    total_timesteps  | 24192    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000573 |
|    n_updates        | 5797     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.02e+03 |
|    ep_rew_mean      | -133     |
|    exploration_rate | 0.617    |
| time/               |          |
|    episodes         | 20       |
|    fps              | 923      |
|    time_elapsed     | 43       |
|    total_timesteps  | 40320    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss    

In [7]:
print("\nStarting evaluation phase...")

# Single evaluation environment with a seed different from training.
eval_env = make_vec_env(make_env, n_envs=1, seed=seed + 1)

# Reuse the normalisation statistics saved after training.
eval_env = VecNormalize.load("vec_normalize_stats_dqn_final.pkl", eval_env)

# Freeze the statistics and report rewards as emitted by the environment.
eval_env.training = False
eval_env.norm_reward = False

model = DQN.load("production_system_agent_dqn_final", env=eval_env)

num_eval_episodes = 10
episode_rewards = []
episode_tardiness = []
episode_throughput = []
episode_avg_wip = []

episode_hourly_throughput = []
episode_hourly_tardiness = []
episode_hourly_wip = []

for episode in range(num_eval_episodes):
    obs = eval_env.reset()
    print(f"Type of reset_result: {type(obs)}")
    print(f"Value of reset_result: {obs}")

    done = False
    total_reward = 0
    while not done:
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done_status_array, info = eval_env.step(action)
        print(obs, reward, done_status_array, info)
        total_reward += reward[0]

        # Only one environment is evaluated, so index 0 is the episode.
        done = bool(done_status_array[0])
        if not done:
            continue

        final_info = info[0]
        episode_rewards.append(total_reward)
        episode_tardiness.append(final_info["total_tardiness_episode"])
        episode_throughput.append(final_info["jobs_completed_episode"])

        # The episode-average WIP is only present on the terminal step; look
        # it up once so both the per-episode and the "hourly" lists treat a
        # missing value the same way instead of raising KeyError later.
        average_wip = final_info.get("average_wip_for_episode")
        if average_wip is not None:
            episode_avg_wip.append(average_wip)
        else:
            print("Warning: 'average_wip_for_episode' not found in info dict for this episode.")

        current_simulation_time_minutes = final_info.get("total_simulation_time_minutes", 60 * 24 * 7)
        episode_duration_hours = current_simulation_time_minutes / 60.0
        if episode_duration_hours > 0:
            hourly_throughput = final_info["jobs_completed_episode"] / episode_duration_hours
            episode_hourly_throughput.append(hourly_throughput)

            hourly_tardiness = final_info["total_tardiness_episode"] / episode_duration_hours
            episode_hourly_tardiness.append(hourly_tardiness)

            # WIP is already a time average, so it is not divided by hours.
            if average_wip is not None:
                episode_hourly_wip.append(average_wip)
        else:
            episode_hourly_throughput.append(0.0)
            episode_hourly_tardiness.append(0.0)
            episode_hourly_wip.append(0.0)

print(f"\nEvaluation on {num_eval_episodes} episodes:")
print(f"Average Reward per episode: {np.mean(episode_rewards):.2f}")
print(f"Average total tardiness per episode: {np.mean(episode_tardiness):.2f}")
print(f"Average Throughput per episode: {np.mean(episode_throughput):.2f}")
if episode_avg_wip:
    print(f"Average WIP per episode: {np.mean(episode_avg_wip):.2f}")

print("\n--- Hourly Metrics (Averages per Episode) ---")
if episode_hourly_throughput:
    print(f"Average Hourly Throughput: {np.mean(episode_hourly_throughput):.2f} jobs/hour")
else:
    print("No data for Average Hourly Throughput.")

if episode_hourly_tardiness:
    print(f"Average Hourly Tardiness: {np.mean(episode_hourly_tardiness):.2f} units/hour")
else:
    print("No data for Average Hourly Tardiness.")

if episode_hourly_wip:
    print(f"Average Hourly WIP (mean jobs in system): {np.mean(episode_hourly_wip):.2f} jobs")
else:
    print("No data for Average Hourly WIP.")

eval_env.close()
vec_env.close()



Starting evaluation phase...
Type of reset_result: <class 'numpy.ndarray'>
Value of reset_result: [[-10.          -0.94595146   1.6204917   -0.4733806   -0.06213495
   -0.13555998  -0.15689595  -0.16275582  -0.16103365  -0.15972446]]
[[ 0.02304929  0.16904654  1.6353685  -0.4733806  -0.06213495 -0.13555998
  -0.15689595 -0.16275582 -0.16103365 -0.15972446]] [0.] [False] [{'current_time': 5.0, 'jobs_in_psp': 5, 'jobs_in_system_wip': 0, 'jobs_completed_episode': 0, 'total_tardiness_episode': 0.0, 'TimeLimit.truncated': False}]
[[ 0.02304929 -0.07322188  1.6325777   1.4366674  -0.06213495 -0.13555998
  -0.15689595  2.9610655  -0.16103365 -0.15972446]] [-0.499975] [False] [{'current_time': 10.0, 'jobs_in_psp': 9, 'jobs_in_system_wip': 1, 'jobs_completed_episode': 0, 'total_tardiness_episode': 0.0, 'TimeLimit.truncated': False}]
[[ 0.02304929 -0.07322188  1.6307765   1.4366674  -0.06213495 -0.13555998
  -0.15689595 -0.16275582 -0.16103365  2.098206  ]] [-0.35354453] [False] [{'current_time