In [None]:
!pip install gymnasium stable-baselines3 numpy

In [None]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np

class TicTacToeEnv(gym.Env):
    """Self-play Tic-Tac-Toe environment following the Gymnasium ``Env`` API.

    A single agent places the marks for both players in alternation:
    player 1 moves first, then player 2, and so on.

    Actions:
        ``Discrete(9)`` -- index of the cell to mark, in row-major order
        (``row, col = divmod(action, 3)``).

    Observations:
        A ``(3, 3)`` ``int32`` array where ``0`` is an empty cell, ``1`` is a
        player-1 mark and ``2`` is a player-2 mark.

    Rewards:
        ``1`` for the move that completes a line (for whichever player is
        moving), ``0`` for a draw or a non-terminal move, and ``-10`` for
        choosing an occupied cell, which also ends the episode.
    """

    def __init__(self):
        """Create the spaces and an empty board with player 1 to move."""
        super().__init__()
        # Define the action space: 9 discrete actions corresponding to each cell in the 3x3 grid
        self.action_space = spaces.Discrete(9)

        # Observation space: 3x3 grid, where each cell can have one of three values
        # (0: empty, 1: player 1, 2: player 2)
        self.observation_space = spaces.Box(low=0, high=2, shape=(3, 3), dtype=np.int32)

        # Initialize the board state
        self.state = np.zeros((3, 3), dtype=np.int32)

        # Set the current player (1 for player 1, 2 for player 2)
        self.current_player = 1

    def reset(self, seed=None, options=None):
        """Clear the board and give the first move to player 1.

        Args:
            seed: Optional seed forwarded to ``gym.Env.reset`` so that
                ``self.np_random`` is seeded.
            options: Accepted for API compatibility; not used.

        Returns:
            A ``(observation, info)`` tuple: a copy of the empty board and an
            empty info dict.
        """
        # Gymnasium expects subclasses to forward the seed to the base class,
        # even when the environment itself uses no randomness.
        super().reset(seed=seed)

        self.state = np.zeros((3, 3), dtype=np.int32)
        self.current_player = 1
        return self.state.copy(), {}

    def step(self, action):
        """Place the current player's mark on the cell selected by ``action``.

        Args:
            action: Cell index in row-major order (0-8).

        Returns:
            A ``(observation, reward, terminated, truncated, info)`` tuple.
            The observation is a copy of the board, so later moves do not
            modify observations the caller has kept. ``truncated`` is always
            ``False``. ``info`` is ``{"invalid": True}`` when an occupied
            cell was chosen and ``{}`` otherwise.
        """
        # Convert action (0-8) to 2D grid position
        row, col = divmod(action, 3)

        # An occupied cell is an invalid move: end the episode with a penalty
        if self.state[row, col] != 0:
            return self.state.copy(), -10, True, False, {"invalid": True}

        # Place the current player's mark on the board
        self.state[row, col] = self.current_player

        # The win check must come before the draw check: the ninth move can
        # both fill the board and complete a line.
        if self._check_win(self.current_player):
            return self.state.copy(), 1, True, False, {}  # Winning move

        if self._check_draw():
            return self.state.copy(), 0, True, False, {}  # Draw

        # Switch players
        self.current_player = 2 if self.current_player == 1 else 1
        return self.state.copy(), 0, False, False, {}  # Continue the game

    def render(self, mode="human"):
        """Print the board to the console (``X`` = player 1, ``O`` = player 2).

        Args:
            mode: Accepted for backward compatibility; not used.
        """
        symbols = {0: " ", 1: "X", 2: "O"}
        print("\n".join(["|".join([symbols[cell] for cell in row]) for row in self.state]))
        print()

    def close(self):
        """Release resources; this environment holds none."""
        pass

    def _check_win(self, player):
        """Return True if ``player`` fills a whole row, column or diagonal."""
        for i in range(3):
            if np.all(self.state[i, :] == player) or np.all(self.state[:, i] == player):
                return True
        if np.all(np.diag(self.state) == player) or np.all(np.diag(np.fliplr(self.state)) == player):
            return True
        return False

    def _check_draw(self):
        """Return True if every cell is filled.

        This does not look for a winner itself; ``step`` only calls it after
        the win check has failed.
        """
        return bool(np.all(self.state != 0))


In [None]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env

# Create the environment
env = TicTacToeEnv()

# Check if the environment follows the Gymnasium API
check_env(env, warn=True)

# Create and train the PPO model
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=1000000)

# Test the trained agent by letting it play a single game against itself
obs, _ = env.reset()
for _move in range(9):  # Maximum 9 moves in a Tic Tac Toe game
    # Evaluate greedily: without deterministic=True, predict() samples from
    # the policy distribution instead of taking its most likely action.
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, terminated, truncated, _info = env.step(action)
    env.render()
    # An episode ends on either flag in the Gymnasium step API
    if terminated or truncated:
        if reward == 1:
            print("The agent won!")
        elif reward == 0:
            print("It's a draw!")
        else:
            print("The agent made an invalid move.")
        break