stable baseline3 에 훈련 결과를 차트로 뽑아주는 기능이 있는지 살펴보고

In [4]:
# https://stable-baselines3.readthedocs.io/en/master/guide/algos.html

import gymnasium as gym
from gymnasium import spaces
import numpy as np

size = 4
spaces.MultiDiscrete([size+1] * (size**2)).sample()

array([4, 1, 0, 1, 2, 2, 4, 3, 4, 4, 1, 2, 2, 1, 0, 2])

In [61]:

class SudokuEnv(gym.Env):
    def __init__(self, size=4):
        self.size = size
        self.half_size = size // 2
        self.action_space = spaces.Discrete(size**3)
        self.observation_space = spaces.MultiDiscrete([size+1] * (size**2))  # 4x4 Sudoku 게임판을 1차원 배열로 표현
        self.actions = np.array([
            [[(x, y, v) for v in range(1, size+1)] for y in range(size)] for x in range(size)
        ]).reshape(-1, 3)

    def reset(self, seed=None, options=None):
        super().reset(seed=seed, options=options)
        self.board = np.zeros(self.size**2, dtype=np.int32)
        return self.board, {}

    def step(self, action):
        x, y, num = self.actions[action]

        # 해당 위치에 숫자를 채우고 유효성을 검사
        reward = 0
        done = False
        info = {}

        # 유효성 검사
        if self.is_valid_move(x, y, num):
            self.board[x * 4 + y] = num
            if self.is_game_over():
                done = True
                reward = 1  # 승리 보상
        else:
            reward = -10  # 잘못된 행동에 대한 패널티

        return self.board, reward, done, done, info

    def is_valid_move(self, x, y, num):
        # 행, 열, 2x2 블록에 중복된 숫자가 있는지 확인
        if self.board[x * self.size + y] != 0:
            return False
        
        row_start = (x // self.half_size) * self.half_size
        col_start = (y // self.half_size) * self.half_size
        temp = self.board.reshape(self.size, self.size)

        # print(num, row_start, col_start)
        # print(self.board[x*self.size : (x+1)*self.size])
        # print(self.board[y::self.size])
        # print(temp)
        # print(temp[row_start : row_start+self.half_size, col_start : col_start+self.half_size])
        # print()

        if (num in self.board[x*self.size : (x+1)*self.size]) or \
           (num in self.board[y::self.size]):
            return False
        if  (num in temp[row_start : row_start+self.half_size,
                col_start : col_start+self.half_size]):
            return False
        
        return True

    def is_game_over(self):
        # 게임이 종료되었는지 확인
        return 0 not in self.board

    def render(self):
        # 현재 게임판 상태 출력
        # print(np.array(self.board).reshape(self.size, self.size))
        for i in range(self.size):
            print(self.board[i*self.size:i*self.size+self.size])
        print()

# 환경 테스트
env = SudokuEnv()
observation = env.reset()
env.render()
done = False
for _ in range(10):
    action = env.action_space.sample()  # 무작위 행동 선택
    observation, reward, done, _, _ = env.step(action)
    env.render()


[0 0 0 0]
[0 0 0 0]
[0 0 0 0]
[0 0 0 0]

[0 0 0 4]
[0 0 0 0]
[0 0 0 0]
[0 0 0 0]

[0 0 0 4]
[0 0 0 1]
[0 0 0 0]
[0 0 0 0]

[0 0 0 4]
[0 0 0 1]
[0 0 0 0]
[0 0 0 0]

[0 0 0 4]
[0 0 0 1]
[0 1 0 0]
[0 0 0 0]

[0 0 0 4]
[0 0 0 1]
[0 1 0 0]
[0 0 0 0]

[0 0 0 4]
[0 0 0 1]
[0 1 3 0]
[0 0 0 0]

[0 0 0 4]
[0 0 0 1]
[0 1 3 0]
[0 0 0 0]

[0 0 0 4]
[0 0 0 1]
[0 1 3 0]
[0 0 0 0]

[0 0 0 4]
[0 0 0 1]
[0 1 3 0]
[0 0 0 0]

[0 3 0 4]
[0 0 0 1]
[0 1 3 0]
[0 0 0 0]



In [62]:
from stable_baselines3.common.env_checker import check_env

env = SudokuEnv()
# If the environment don't follow the interface, an error will be thrown
check_env(env, warn=True)

In [67]:
from stable_baselines3 import PPO, A2C, DQN, SAC
from stable_baselines3.common.env_util import make_vec_env

# Instantiate the env
vec_env = make_vec_env(SudokuEnv, n_envs=1, 
                       env_kwargs=dict(size=4))

# Train the agent
model = PPO("MlpPolicy", env, verbose=1).learn(500000)
# 10000 -> 40
# 100000 -> 480
# 500000 -> 2440 (약 6분 소요)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-----------------------------
| time/              |      |
|    fps             | 1840 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 1505         |
|    iterations           | 2            |
|    time_elapsed         | 2            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0055992883 |
|    clip_fraction        | 0.0197       |
|    clip_range           | 0.2          |
|    entropy_loss         | -4.16        |
|    explained_variance   | 0.00307      |
|    learning_rate        | 0.0003       |
|    loss                 | 1.19e+04     |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.00976     |
|    va

In [68]:
# Test the trained agent
# using the vecenv
obs = vec_env.reset()
done = False
step = 0
while not done:
    action, _ = model.predict(obs) #, deterministic=True)
    step += 1
    obs, reward, done, info = vec_env.step(action)
    
    vec_env.render()
    if step % 100 == 0:
        print(f"Step {step} and action={action}")
        print("obs=", obs, "reward=", reward, "done=", done)
    if done:        
        print("Goal reached!", "reward=", reward, "step=", step)
        break




Step 100 and action=[39]
obs= [[0 0 0 0 0 0 0 0 0 4 1 0 0 0 0 0]] reward= [-10.] done= [False]
Step 200 and action=[39]
obs= [[0 0 0 0 0 0 0 0 0 4 1 0 0 0 0 0]] reward= [-10.] done= [False]
Step 300 and action=[39]
obs= [[0 0 0 0 0 0 0 0 0 4 1 0 0 3 0 0]] reward= [-10.] done= [False]
Step 400 and action=[39]
obs= [[0 0 0 0 0 0 0 0 0 4 1 0 0 3 0 0]] reward= [-10.] done= [False]
Step 500 and action=[39]
obs= [[0 0 0 0 0 0 0 0 0 4 1 0 0 3 0 0]] reward= [-10.] done= [False]
Step 600 and action=[39]
obs= [[0 0 0 0 0 0 0 0 0 4 1 0 0 3 0 0]] reward= [-10.] done= [False]
Step 700 and action=[39]
obs= [[0 0 0 0 0 0 2 0 0 4 1 0 0 3 4 0]] reward= [-10.] done= [False]
Step 800 and action=[39]
obs= [[0 0 0 0 0 0 2 0 0 4 1 0 0 3 4 0]] reward= [-10.] done= [False]
Step 900 and action=[39]
obs= [[1 0 0 0 0 0 2 0 0 4 1 0 0 3 4 0]] reward= [-10.] done= [False]
Step 1000 and action=[39]
obs= [[1 0 0 0 4 0 2 0 2 4 1 0 0 3 4 0]] reward= [-10.] done= [False]
Step 1100 and action=[39]
obs= [[1 0 0 0 4 0 2 0 