stable baseline3 에 훈련 결과를 차트로 뽑아주는 기능이 있는지 살펴보고
-> wandb callback

In [1]:
# https://stable-baselines3.readthedocs.io/en/master/guide/algos.html

import gymnasium as gym
from gymnasium import spaces
import numpy as np

size = 4
spaces.MultiDiscrete([size+1] * (size**2)).sample()

array([1, 0, 4, 3, 1, 3, 2, 0, 4, 2, 0, 3, 0, 3, 0, 0])

In [2]:

class SudokuEnv(gym.Env):
    def __init__(self, size=4):
        self.size = size
        self.half_size = size // 2
        self.action_space = spaces.Discrete(size**3)
        self.observation_space = spaces.MultiDiscrete([size+1] * (size**2))  # 4x4 Sudoku 게임판을 1차원 배열로 표현
        self.actions = np.array([
            [[(x, y, v) for v in range(1, size+1)] for y in range(size)] for x in range(size)
        ]).reshape(-1, 3)

    def reset(self, seed=None, options=None):
        super().reset(seed=seed, options=options)
        self.board = np.zeros(self.size**2, dtype=np.int32)
        return self.board, {}

    def step(self, action):
        x, y, num = self.actions[action]

        # 해당 위치에 숫자를 채우고 유효성을 검사
        reward = 0
        done = False
        info = {}

        # 유효성 검사
        if self.is_valid_move(x, y, num):
            self.board[x * 4 + y] = num
            if self.is_game_over():
                done = True
                reward = 1  # 승리 보상
            else:
                reward = -10
        else:
            reward = -10  # 잘못된 행동에 대한 패널티

        return self.board, reward, done, done, info

    def is_valid_move(self, x, y, num):
        # 행, 열, 2x2 블록에 중복된 숫자가 있는지 확인
        # if self.board[x * self.size + y] != 0:
        #     return False
        
        row_start = (x // self.half_size) * self.half_size
        col_start = (y // self.half_size) * self.half_size
        temp = self.board.reshape(self.size, self.size)

        # print(num, row_start, col_start)
        # print(self.board[x*self.size : (x+1)*self.size])
        # print(self.board[y::self.size])
        # print(temp)
        # print(temp[row_start : row_start+self.half_size, col_start : col_start+self.half_size])
        # print()

        if (num in self.board[x*self.size : (x+1)*self.size]) or \
           (num in self.board[y::self.size]):
            return False
        if  (num in temp[row_start : row_start+self.half_size,
                col_start : col_start+self.half_size]):
            return False
        
        return True

    def is_game_over(self):
        # 게임이 종료되었는지 확인
        return 0 not in self.board

    def render(self):
        # 현재 게임판 상태 출력
        # print(np.array(self.board).reshape(self.size, self.size))
        for i in range(self.size):
            print(self.board[i*self.size:i*self.size+self.size])
        print()

# 환경 테스트
env = SudokuEnv()
observation = env.reset()
env.render()
done = False
for i in range(400):
    action = env.action_space.sample()  # 무작위 행동 선택
    observation, reward, done, _, _ = env.step(action)
    if i % 100 == 0:
        env.render()


[0 0 0 0]
[0 0 0 0]
[0 0 0 0]
[0 0 0 0]

[0 0 0 1]
[0 0 0 0]
[0 0 0 0]
[0 0 0 0]

[4 2 3 1]
[0 1 4 2]
[0 3 2 4]
[2 4 1 3]

[4 2 3 1]
[0 1 4 2]
[1 3 2 4]
[2 4 1 3]

[4 2 3 1]
[3 1 4 2]
[1 3 2 4]
[2 4 1 3]



In [3]:
from stable_baselines3.common.env_checker import check_env

env = SudokuEnv()
# If the environment don't follow the interface, an error will be thrown
check_env(env, warn=True)

In [21]:
# from stable_baselines3 import TRPO
from sb3_contrib import TRPO
from stable_baselines3.common.env_util import make_vec_env
# from stable_baselines3.common.callbacks import EpsilonGreedyCallback

NUM_ENVS = 1024 
# 늘리면 조금씩 더 나은 결과가 나오지만 
# 그냥 학습 횟수를 늘리는 게 더 나음
# GPU 사용량도 변화 없음

# Instantiate the env
vec_env = make_vec_env(SudokuEnv, n_envs=NUM_ENVS, 
                       env_kwargs=dict(size=4))

# Train the agent
model = TRPO("MlpPolicy", env, verbose=1).learn(300000)
# 10000 -> 4 n_updates
# 100000 -> 48
# 500000 -> 2440 (약 6분 소요)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 282       |
|    ep_rew_mean     | -2.81e+03 |
| time/              |           |
|    fps             | 1801      |
|    iterations      | 1         |
|    time_elapsed    | 1         |
|    total_timesteps | 2048      |
----------------------------------
-----------------------------------------
| rollout/                  |           |
|    ep_len_mean            | 300       |
|    ep_rew_mean            | -2.99e+03 |
| time/                     |           |
|    fps                    | 1715      |
|    iterations             | 2         |
|    time_elapsed           | 2         |
|    total_timesteps        | 4096      |
| train/                    |           |
|    explained_variance     | 0.0028    |
|    is_line_search_success | 1         |
|    kl_divergence_loss     | 0.00285   |
|    lear

In [22]:
model.save("TRPO_Sudoku")
model = TRPO.load("TRPO_Sudoku", verbose=1)

In [31]:
# Test the trained agent
# using the vecenv
obs = env.reset()[0]
done = False
step = 0
while not done:
    action, _ = model.predict(obs) #, deterministic=True)
    step += 1
    obs, reward, done, _, info = env.step(action)

    if step % 50 == 0:
        print(f"Step {step} and action={action}")
        print("obs=", obs, "action=", env.actions[action],
              "reward=", reward, "done=", done)
    if done:        
        print("Goal reached!", "reward=", reward, "step=", step, "obs=", obs)
        break


Step 50 and action=29
obs= [4 0 1 2 1 2 3 4 2 4 0 0 0 3 4 0] action= [1 3 2] reward= -10 done= False
Step 100 and action=32
obs= [4 0 1 2 1 2 3 4 2 4 0 1 0 3 4 0] action= [2 0 1] reward= -10 done= False
Step 150 and action=12
obs= [4 0 1 2 1 2 3 4 2 1 0 3 0 3 2 1] action= [0 3 1] reward= -10 done= False
Step 200 and action=32
obs= [4 0 1 2 1 2 3 4 2 1 0 3 0 3 2 1] action= [2 0 1] reward= -10 done= False
Goal reached! reward= 1 step= 243 obs= [4 3 1 2 1 2 3 4 2 1 4 3 3 4 2 1]
