In [29]:
import gym
from gym import error, spaces, utils
from gym.spaces import space
from gym.utils import seeding

import random

class TicTacToeEnv(gym.Env):
  metadata = {'render.modes': ['human']}

  def __init__(self):
    self.state = [
        ["-","-","-"],
        ["-","-","-"],
        ["-","-","-"]
        ]
    self.observation_space = spaces.Box(low=0, high=2, shape=(3, 3), dtype=np.int)
    self.action_space = spaces.Discrete(9)


  def hash(self):
    return "".join([item for sublist in self.state for item in sublist])


  def available_actions(self):
    return [i for i, x in enumerate(self.hash()) if x == "-"]

  def available_states(self, player):
    states = []
    actions = self.available_actions()
    for action in actions:
      state_list = list(self.hash())
      state_list[action] = player
      state = "".join(state_list)
      _, reward = self.check_done(state)
      states.append((state, reward))
    return states


  def check_done(self, state):
    winner = ""
    for player in ["X", "O"]:
        if (state[0:3] == 3*player):
            winner = player
        elif (state[3:6] == 3*player):
            winner = player
        elif (state[6:9] == 3*player):
            winner = player
        elif (state[0] == player and state[3] == player and state[6] == player):
            winner = player
        elif (state[1] == player and state[4] == player and state[7] == player):
            winner = player
        elif (state[2] == player and state[5] == player and state[8] == player):
            winner = player
        elif (state[0] == player and state[4] == player and state[8] == player):
            winner = player
        elif (state[2] == player and state[4] == player and state[6] == player):
            winner = player

    if (winner == "X"):
      return True, 10
    elif (winner == "O"):
      return True, -10
    elif "-" not in state:
        return True, 0
    else:
        return False, 0




  def step(self, action):
    # Determine which player's turn it is based on the total number of "X" and "O" in the state
    x_count = sum(row.count("X") for row in self.state)
    o_count = sum(row.count("O") for row in self.state)
    player = "X" if x_count <= o_count else "O"

    self.state[action // 3][action % 3] = player

    done, reward = self.check_done(self.hash())
    obs = np.array(self.state_to_int())  # Convert the state to an array

    return obs, reward, done, {}

  def reset(self):
    self.state = [
        ["-","-","-"],
        ["-","-","-"],
        ["-","-","-"]
        ]
    return np.array(self.state_to_int())
  def state_to_int(self):
    # Convert the state to integers (0 for empty, 1 for "X", 2 for "O")
    return [[0 if cell == "-" else (1 if cell == "X" else 2) for cell in row] for row in self.state]



  def render(self, mode='human'):
    print("Board")
    for row in self.state:
      print(row)

In [34]:
!pip install stable_baselines3
!pip install 'shimmy>=0.2.1'



In [35]:
import gym
import numpy as np
from stable_baselines3 import DQN

env = TicTacToeEnv()

# Create a DQN model
model = DQN("MlpPolicy", env, verbose=1)

# Train the agent
total_timesteps = 100000
model.learn(total_timesteps=total_timesteps)

# Save the trained model
model.save("dqn_tictactoe")

# Test the trained agent
obs = env.reset()
for _ in range(10):
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)  # Pass action as a single value, not in a list
    env.render()
    if done:
        obs = env.reset()

# Close the environment
env.close()


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 8.25     |
|    ep_rew_mean      | 0        |
|    exploration_rate | 0.997    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 10857    |
|    time_elapsed     | 0        |
|    total_timesteps  | 33       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 8.62     |
|    ep_rew_mean      | 5        |
|    exploration_rate | 0.993    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 7116     |
|    time_elapsed     | 0        |
|    total_timesteps  | 69       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 9        |
|    ep_rew_mean      | 3.33   

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  self.observation_space = spaces.Box(low=0, high=2, shape=(3, 3), dtype=np.int)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 12.3     |
|    ep_rew_mean      | 2.3      |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 4032     |
|    fps              | 4066     |
|    time_elapsed     | 11       |
|    total_timesteps  | 48749    |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 12.4     |
|    ep_rew_mean      | 2.2      |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 4036     |
|    fps              | 4068     |
|    time_elapsed     | 11       |
|    total_timesteps  | 48809    |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 12.4     |
|    ep_rew_mean      | 2

In [36]:
import gym
import numpy as np
from stable_baselines3 import PPO

env = TicTacToeEnv()

# Create a DQN model
model = PPO("MlpPolicy", env, verbose=1)

# Train the agent
total_timesteps = 100000
model.learn(total_timesteps=total_timesteps)

# Save the trained model
model.save("ppo_tictactoe")

# Test the trained agent
obs = env.reset()
for _ in range(10):
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)  # Pass action as a single value, not in a list
    env.render()
    if done:
        obs = env.reset()

# Close the environment
env.close()


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  self.observation_space = spaces.Box(low=0, high=2, shape=(3, 3), dtype=np.int)


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 11.7     |
|    ep_rew_mean     | 3.2      |
| time/              |          |
|    fps             | 1583     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 11.5         |
|    ep_rew_mean          | 1.5          |
| time/                   |              |
|    fps                  | 1207         |
|    iterations           | 2            |
|    time_elapsed         | 3            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0065145735 |
|    clip_fraction        | 0.0291       |
|    clip_range           | 0.2          |
|    entropy_loss         | -2.19        |
|    explained_variance   | -0.0046      |
|    learning_r

In [37]:
import gym
import numpy as np
from stable_baselines3 import A2C

env = TicTacToeEnv()

# Create a DQN model
model = A2C("MlpPolicy", env, verbose=1)

# Train the agent
total_timesteps = 100000
model.learn(total_timesteps=total_timesteps)

# Save the trained model
model.save("a2c_tictactoe")

# Test the trained agent
obs = env.reset()
for _ in range(10):
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)  # Pass action as a single value, not in a list
    env.render()
    if done:
        obs = env.reset()

# Close the environment
env.close()


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  self.observation_space = spaces.Box(low=0, high=2, shape=(3, 3), dtype=np.int)


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 11.8     |
|    ep_rew_mean        | 5.61     |
| time/                 |          |
|    fps                | 968      |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -2.16    |
|    explained_variance | 0.228    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -0.531   |
|    value_loss         | 0.0652   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 12.3     |
|    ep_rew_mean        | 4.44     |
| time/                 |          |
|    fps                | 996      |
|    iterations         | 200      |
|    time_elapsed       | 1        |
|    total_timesteps    | 1000     |
| train/                |          |
|

In [None]:
import gym
import numpy as np
from stable_baselines3 import A2C, PPO , DQN
from stable_baselines3.common.callbacks import EvalCallback

env = TicTacToeEnv()

# Create a DQN model
# model_a2c = A2C("MlpPolicy", env, verbose=1)
# model_ppo = PPO("MlpPolicy", env, verbose=1)
model_dqn = DQN("MlpPolicy", env, verbose=1 )
eval_callback = EvalCallback(env, best_model_save_path='./logs/',
                             log_path='./logs/', eval_freq=100,
                             deterministic=True, render=False)

# Train the agent
total_timesteps = 100000
# model_a2c.learn(total_timesteps=total_timesteps, callback=eval_callback)
# model_ppo.learn(total_timesteps=total_timesteps, callback=eval_callback)
model_dqn.learn(total_timesteps=total_timesteps, callback=eval_callback)


# # Save the trained model
# model.save("a2c_tictactoe")

# # Test the trained agent
# obs = env.reset()
# for _ in range(10):
#     action, _ = model.predict(obs)
#     obs, reward, done, _ = env.step(action)  # Pass action as a single value, not in a list
#     env.render()
#     if done:
#         obs = env.reset()

# # Close the environment
# env.close()


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 13       |
|    ep_rew_mean      | 5        |
|    exploration_rate | 0.995    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 8963     |
|    time_elapsed     | 0        |
|    total_timesteps  | 52       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 12.1     |
|    ep_rew_mean      | 7.5      |
|    exploration_rate | 0.991    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 7163     |
|    time_elapsed     | 0        |
|    total_timesteps  | 97       |
----------------------------------


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  self.observation_space = spaces.Box(low=0, high=2, shape=(3, 3), dtype=np.int)


In [None]:
model_dqn.save(DQN_path)

In [None]:
DQN_path

In [None]:
model = DQN.load(DQN_path , env = env)

In [None]:
from stable_baselines3.common.evaluation import evaluate_policy
model__dqn = DQN.load()
evaluate_policy(model__a2c, env , n_eval_episodes= 10, render= True)

In [None]:
%load_ext tensorboard
%tensorboard --logdir=./logs


In [None]:
obs = env.reset()
done = False

while not done:
    env.render()
    player_action = int(input("Enter your move (0-8): "))  # Human player's move
    obs, _, done, _ = env.step(player_action)  # Human player makes a move

    if done:
        env.render()
        break

    print("AI's turn:")
    ai_action, _ = model.predict(obs)  # AI agent's move
    obs, _, done, _ = env.step(ai_action)  # AI agent makes a move

    if done:
        env.render()
        break

# Close the environment
env.close()