# 1. 패키지 설치

In [None]:
# Install the Stable-Baselines3 package, which provides the PPO implementation imported below.
pip install stable_baselines3

In [None]:
import numpy as np
import gymnasium
from gymnasium import Env
from stable_baselines3 import PPO
from gymnasium.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete
import os

# 2. 환경 정의

In [None]:
# Problem size: jobs per batch, number of machines, and number of batches
# (one batch is scheduled per environment step).
n_jobs = 5
n_machines = 3
prosses_num = 5

# One list of random integer processing times (1..49) per batch.
processing_times = [
    np.random.randint(1, 50, size=n_jobs).tolist()
    for _ in range(prosses_num)
]

In [None]:
class JobSchedulingEnv(Env):
    """Gymnasium environment that assigns batches of jobs to parallel machines.

    Each step presents one batch of ``n_jobs`` processing times, read from the
    module-level ``processing_times`` list. The action chooses a machine for
    every job in the batch; each chosen machine accumulates the job's
    processing time, and the reward is the negative makespan (the largest
    accumulated machine time so far). An episode ends after ``prosses_num``
    batches have been scheduled.

    Args:
        n_jobs: Number of jobs in each batch (length of one observation).
        n_machines: Number of machines a job can be assigned to.
    """

    def __init__(self, n_jobs, n_machines):
        super().__init__()
        self.n_jobs = n_jobs
        self.n_machines = n_machines
        # Observation space: one processing time per job. The lower bound is 0
        # because the terminal observation is all zeros.
        self.observation_space = Box(
            low=0.0,
            high=float(np.max(processing_times)),
            shape=(n_jobs,),
            dtype=np.float32,
        )
        # Action space: one machine index per job.
        self.action_space = MultiDiscrete([n_machines] * n_jobs)
        self._start_episode()

    def _start_episode(self):
        """Restore the per-episode state to the first batch and idle machines."""
        self.processing_times = processing_times[0]
        # Index of the batch that will be shown after the next step.
        self.current_time = 1
        self.machine_times = np.zeros(self.n_machines)

    def _observation(self):
        """Return the current batch as an array matching ``observation_space``."""
        return np.asarray(self.processing_times, dtype=np.float32)

    def reset(self, seed=None, options=None):
        """Start a new episode and return ``(observation, info)``.

        Args:
            seed: Optional seed forwarded to the base class reset.
            options: Accepted for Gymnasium API compatibility; unused.
        """
        super().reset(seed=seed)
        self._start_episode()
        return self._observation(), {}

    def step(self, action):
        """Apply one machine assignment per job and advance to the next batch.

        Args:
            action: Sequence of length ``n_jobs``; ``action[j]`` is the machine
                index that job ``j`` of the current batch is assigned to.

        Returns:
            ``(next_state, reward, done, truncated, info)`` where ``reward`` is
            the negative makespan and ``truncated`` is always ``False``.
        """
        # Add every job's processing time to the machine it was assigned to.
        for job, machine in enumerate(action):
            self.machine_times[machine] += self.processing_times[job]

        # Makespan: total work time of the machine that finishes last.
        makespan = np.max(self.machine_times)

        # Minimising the makespan is expressed as maximising its negative.
        reward = -float(makespan)

        # The episode is over once the last batch has been scheduled.
        done = self.current_time == prosses_num
        if done:
            # Terminal observation: zeros with the same length as a batch.
            self.processing_times = [0.0] * self.n_jobs
        else:
            self.processing_times = processing_times[self.current_time]

        self.current_time += 1
        truncated = False
        info = {}
        next_state = self._observation()

        return next_state, reward, done, truncated, info

    def render(self, mode='human'):
        """No-op; the schedule is rendered separately from the environment."""
        pass

In [None]:
import plotly.express as px
import pandas as pd
from datetime import datetime, timedelta

def render_schedule(schedule, job_times, n_machines):
    """Draw a Gantt chart of the jobs assigned to each machine.

    Jobs are laid out back to back on their assigned machine, starting from a
    fixed reference timestamp, with each processing time treated as seconds.

    Args:
        schedule: One entry per step; each entry lists the machine index
            chosen for every job of that step.
        job_times: One entry per step; each entry lists the processing time
            of every job of that step.
        n_machines: Number of machines.
    """
    origin = datetime(2024, 8, 29, 0, 0, 0)
    # Moment at which each machine becomes free again.
    machine_free_at = [origin for _ in range(n_machines)]

    rows = []
    for step_idx, assignment in enumerate(schedule):
        for job_idx, machine_idx in enumerate(assignment):
            begin = machine_free_at[machine_idx]
            finish = begin + timedelta(seconds=job_times[step_idx][job_idx])
            rows.append(
                {
                    "Task": f"Machine {machine_idx + 1}",
                    "Start": begin,
                    "Finish": finish,
                    "Job": f"Job {job_idx + 1}",
                    "Index": step_idx,
                }
            )
            # The machine is busy until this job finishes.
            machine_free_at[machine_idx] = finish

    frame = pd.DataFrame(rows)
    chart = px.timeline(
        frame,
        x_start="Start",
        x_end="Finish",
        y="Job",
        color="Task",
        title="Gantt Chart",
        hover_data=["Index"],
    )
    chart.update_yaxes(categoryorder="category ascending")
    chart.show()

# 3. 환경 테스트

In [None]:
# Run one episode with randomly sampled actions to sanity-check the environment.
env = JobSchedulingEnv(n_jobs=n_jobs, n_machines=n_machines)
obs, _ = env.reset()
schedule, job_times = [], []
score = 0
done = False
while not done:
    action = env.action_space.sample()
    obs, reward, done, _, info = env.step(action)
    schedule += [action.tolist()]
    job_times += [obs]
    score = score + reward

# Show the batches and the sampled assignments, then draw the Gantt chart.
print(processing_times,"job_times")
print(schedule,"schedule")
render_schedule(schedule, processing_times, n_machines)

[[22, 31, 14, 18, 4], [10, 37, 19, 36, 29], [41, 49, 31, 22, 27], [4, 38, 19, 32, 9], [9, 42, 16, 12, 39]] job_times
[[0, 2, 1, 1, 2], [2, 0, 2, 0, 1], [2, 2, 1, 2, 1], [2, 0, 1, 2, 1], [1, 2, 1, 2, 2]] schedule


# 4. 강화학습 모델 정의 및 학습



In [None]:
# Fresh environment instance for training.
env = JobSchedulingEnv(n_machines=n_machines, n_jobs=n_jobs)
# Directory that receives the TensorBoard logs.
log_path = os.path.join(
    'Training',
    'Logs',
    'PPO',
)

In [None]:
# Build a PPO agent with an MLP policy, logging to TensorBoard, and train it.
model = PPO(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log=log_path,
)
model.learn(total_timesteps=200_000)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to Training/Logs/PPO/PPO_11
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 5        |
|    ep_rew_mean     | -862     |
| time/              |          |
|    fps             | 473      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5           |
|    ep_rew_mean          | -861        |
| time/                   |             |
|    fps                  | 367         |
|    iterations           | 2           |
|    time_elapsed         | 11          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.007303755 |
|    clip_fraction        | 0.0379      |
|    clip_range           

<stable_baselines3.ppo.ppo.PPO at 0x78a8a104d5d0>

# 5. 학습 과정 시각화

In [None]:
# Load the TensorBoard notebook extension.
%load_ext tensorboard
#%reload_ext tensorboard
# log_path is the directory that was passed to the model as tensorboard_log.
training_log_path = os.path.join(log_path)
# training_log_path = os.path.join(log_path,'PPO_2')
# Launch TensorBoard on port 6009, reading from training_log_path.
%tensorboard --logdir={training_log_path} --port=6009

# 6. 학습된 강화학습 모델 평가

In [None]:
# Run one episode with the trained policy and visualise the resulting schedule.
observation, _ = env.reset()
done = False
schedule = []
job_times = []
score = 0
while not done:
    action = model.predict(observation)[0]
    # Feed the new observation back into the policy on the next iteration;
    # otherwise every prediction would be based on the first batch only.
    observation, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated
    schedule.append(action.tolist())
    score += reward

# Show the batches and the chosen assignments, then draw the Gantt chart.
print(processing_times,"job_times")
print(schedule,"schedule")
render_schedule(schedule, processing_times, n_machines)

[[22, 31, 14, 18, 4], [10, 37, 19, 36, 29], [41, 49, 31, 22, 27], [4, 38, 19, 32, 9], [9, 42, 16, 12, 39]] job_times
[[2, 1, 0, 0, 2], [2, 1, 0, 0, 2], [2, 1, 0, 0, 1], [2, 1, 0, 0, 2], [2, 1, 0, 0, 2]] schedule


# 7. 모델 저장 후 재학습

In [None]:
# Destination directory and file name for the saved model.
PPO_path = os.path.join(
    'Training',
    'Saved Models',
    'PPO_model',
)
model.save(PPO_path)

In [None]:
# Reload the model from the same path it was saved to (PPO_path), and attach
# the environment so training can continue.
model = PPO.load(PPO_path, env=env)

In [None]:
# Continue training the loaded model, logging under the 'result' run name
# and keeping the existing timestep counter.
model.learn(
    total_timesteps=5000,
    tb_log_name='result',
    reset_num_timesteps=False,
)

# 8. 다른 강화학습 알고리즘 사용


In [None]:
# DQN in Stable-Baselines3 only supports Discrete action spaces, while this
# environment uses a MultiDiscrete action space, so A2C (which supports
# MultiDiscrete) is used as the alternative algorithm.
from stable_baselines3 import A2C

model = A2C('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

In [None]:
# Train the newly created model for 100,000 environment steps.
model.learn(
    total_timesteps=100_000,
)