In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader
import random
import pickle
import numpy as np


In [10]:
class TicTacToe:
    def __init__(self):
        self.board = np.zeros((3, 3))
        self.current_player = 1
    
    def reset(self):
        self.board.fill(0)
        self.current_player = 1
    
    def make_move(self, row, col):
        if self.board[row, col] == 0:
            self.board[row, col] = self.current_player
            return True
        return False
    
    def check_winner(self):
        for player in [-1, 1]:
            if any(np.all(self.board[i, :] == player) for i in range(3)) or \
               any(np.all(self.board[:, i] == player) for i in range(3)) or \
               np.all(np.diag(self.board) == player) or \
               np.all(np.diag(np.fliplr(self.board)) == player):
                return player
        if np.all(self.board != 0):
            return 0 
        return None

    def switch_player(self):
        self.current_player = -1*self.current_player
    
    def print_board(self):
        symbols = {0: ".", 1: "X", -1: "O"}
        print("\n".join(" ".join(symbols[cell] for cell in row) for row in self.board))

In [11]:

class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(9, 18)  # Input layer (9 for the board) to hidden layer
        self.fc2 = nn.Linear(18, 9)   # Hidden layer to output layer (9 possible moves)
        

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        return self.fc2(x)


In [15]:

def train_model(model, games, optimizer, criterion, episodes=100, epsilon=0.1):
    for episode in range(episodes):
        for state, action, reward in games:
            state_tensor = torch.FloatTensor(state).view(1, -1)
            target = model(state_tensor).detach()
            target[0][action] = reward  # Update Q-value for the taken action

            optimizer.zero_grad()
            output = model(state_tensor)
            loss = criterion(output, target)
            print(f"Loss in episode {episode} is {loss}")
            loss.backward()
            optimizer.step()


In [16]:

# Self-play logic
def self_play(model, num_games=30):
    games = []
    for _ in range(num_games):
        game = TicTacToe()
        while True:
            state = game.board.flatten().tolist()
            if game.current_player == 1:
                q_values = model(torch.FloatTensor(state)).detach().numpy()
                action = np.argmax(q_values) if random.random() > 0.1 else random.choice(np.flatnonzero(game.board.flatten() == 0))
            else:
                # Random move for player 2 for simplicity
                action = random.choice(np.flatnonzero(game.board.flatten() == 0))

            row, col = divmod(action, 3)
            game.make_move(row, col)
            reward = game.check_winner()
            if reward is not None:
                if reward == 1:  # Player 1 wins
                    games.append((state, action, 1))
                elif reward == 2:  # Player 2 wins
                    games.append((state, action, -1))
                else:  # Draw
                    games.append((state, action, 0))
                break

            game.switch_player()
            # Store the state-action pair
            games.append((state, action, 0))  # Temporary reward until the game ends
    return games

In [17]:
model = MLP()
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()
    
games = self_play(model, num_games=30)
train_model(model, games, optimizer, criterion, episodes=10)

# Test the trained model
game = TicTacToe()
while True:
    game.print_board()
    if game.current_player == 1:
        state = game.board.flatten().tolist()
        q_values = model(torch.FloatTensor(state)).detach().numpy()
        action = np.argmax(q_values)
        row, col = divmod(action, 3)
    else:
        action = random.choice(np.flatnonzero(game.board.flatten() == 0))
        row, col = divmod(action, 3)

    game.make_move(row, col)
    if game.check_winner() is not None:
        game.print_board()
        print("Winner:", game.check_winner())
        break
    game.switch_player()


Loss in episode 0 is 0.006645164918154478
Loss in episode 0 is 0.00022522496874444187
Loss in episode 0 is 0.011672320775687695
Loss in episode 0 is 0.0004374891286715865
Loss in episode 0 is 0.013893859460949898
Loss in episode 0 is 0.00376448524184525
Loss in episode 0 is 0.009990954771637917
Loss in episode 0 is 0.008385694585740566
Loss in episode 0 is 0.005515183787792921
Loss in episode 0 is 0.00013974460307508707
Loss in episode 0 is 0.015904057770967484
Loss in episode 0 is 0.0015389396576210856
Loss in episode 0 is 0.006316233891993761
Loss in episode 0 is 0.006508302874863148
Loss in episode 0 is 0.005640496499836445
Loss in episode 0 is 0.00034117919858545065
Loss in episode 0 is 0.004071825183928013
Loss in episode 0 is 0.012234684079885483
Loss in episode 0 is 0.041189536452293396
Loss in episode 0 is 0.004503183998167515
Loss in episode 0 is 0.004355971235781908
Loss in episode 0 is 0.0016633798368275166
Loss in episode 0 is 0.0192495658993721
Loss in episode 0 is 0.00064

In [30]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random

# Set up the Tic-Tac-Toe environment
class TicTacToe:
    def __init__(self):
        self.board = np.zeros((3, 3), dtype=int)  # 3x3 board
        self.current_player = 1  # Player 1 starts

    def reset(self):
        self.board = np.zeros((3, 3), dtype=int)
        self.current_player = 1
        return self.board.flatten()

    def step(self, action):
        row, col = divmod(action, 3)
        if self.board[row, col] == 0:
            self.board[row, col] = self.current_player
            reward, done = self.check_game_state()
            self.current_player = -self.current_player  # Switch player
            return self.board.flatten(), reward, done
        else:
            return self.board.flatten(), -10, False  # Invalid move penalty

    def check_game_state(self):
        # Check rows, columns, diagonals for a win
        for i in range(3):
            if abs(sum(self.board[i, :])) == 3 or abs(sum(self.board[:, i])) == 3:
                return 1, True
        if abs(self.board.trace()) == 3 or abs(np.fliplr(self.board).trace()) == 3:
            return 1, True
        if not np.any(self.board == 0):  # Draw
            return 0.5, True
        return 0, False  # Continue

# Define the MLP model for the RL agent
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(9, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 9)
        )

    def forward(self, x):
        return self.layers(x)

# Q-learning with self-play
class QLearningAgent:
    def __init__(self, model, lr=0.001, gamma=0.9, epsilon=0.1):
        self.model = model
        self.optimizer = optim.Adam(model.parameters(), lr=lr)
        self.loss_fn = nn.MSELoss()
        self.gamma = gamma
        self.epsilon = epsilon

    def choose_action(self, state):
        if random.random() < self.epsilon:  # Explore
            return random.choice([i for i, x in enumerate(state) if x == 0])
        else:  # Exploit
            with torch.no_grad():
                q_values = self.model(torch.FloatTensor(state))
            q_values[state != 0] = -float('inf')  # Mask invalid moves
            return q_values.argmax().item()

    def update(self, state, action, reward, next_state, done):
        self.optimizer.zero_grad()
        q_values = self.model(torch.FloatTensor(state))
        target_q_values = q_values.clone().detach()
        next_q_value = self.model(torch.FloatTensor(next_state)).max().item()
        
        if done:
            target_q_values[action] = reward
        else:
            target_q_values[action] = reward + self.gamma * next_q_value

        loss = self.loss_fn(q_values, target_q_values)
        
        loss.backward()
        self.optimizer.step()
        return loss

# Training loop with self-play
def train_self_play(num_episodes=10):
    env = TicTacToe()
    model = MLP()
    agent1 = QLearningAgent(model)
    agent2 = QLearningAgent(model)  # Agent 2 can be the same model in self-play

    for episode in range(num_episodes):
        state = env.reset()
        done = False
        while not done:
            action1 = agent1.choose_action(state)
            next_state, reward, done = env.step(action1)
            agent1.update(state, action1, reward, next_state, done)
            state = next_state
            
            if done: break

            action2 = agent2.choose_action(state)
            next_state, reward, done = env.step(action2)
            loss = agent2.update(state, action2, -reward, next_state, done)  # Reverse reward for player 2
            print(f"Loss after {episode+1} episodes = {loss}")
            state = next_state
    
    return model, agent1, agent2

model, agent, _ =  train_self_play()


Loss after 1 episodes = 9.316454452346079e-06
Loss after 1 episodes = 0.0015216395258903503
Loss after 1 episodes = 7.319499854929745e-05
Loss after 1 episodes = 0.0030293120071291924
Loss after 2 episodes = 4.3247780467936536e-07
Loss after 2 episodes = 0.00011930424807360396
Loss after 2 episodes = 0.0010923356749117374
Loss after 3 episodes = 9.120473987422884e-05
Loss after 3 episodes = 0.0004205180157441646
Loss after 3 episodes = 0.004507248755544424
Loss after 3 episodes = 0.0015313895419239998
Loss after 4 episodes = 2.6358857212471776e-06
Loss after 4 episodes = 1.3017472156207077e-05
Loss after 4 episodes = 0.0019664056599140167
Loss after 4 episodes = 0.006467882543802261
Loss after 5 episodes = 0.0004816065775230527
Loss after 5 episodes = 0.0006575168808922172
Loss after 5 episodes = 0.001357520930469036
Loss after 5 episodes = 0.003865249454975128
Loss after 6 episodes = 0.00010207647574134171
Loss after 6 episodes = 3.451931843301281e-05
Loss after 6 episodes = 0.0001433

In [31]:
model

MLP(
  (layers): Sequential(
    (0): Linear(in_features=9, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=9, bias=True)
  )
)

In [33]:
env = TicTacToe()
def display_board(board):
    """Displays the Tic-Tac-Toe board in a user-friendly way."""
    symbols = {1: "X", -1: "O", 0: " "}
    print("\nBoard:")
    for row in board:
        print(" | ".join(symbols[cell] for cell in row))
        print("-" * 9)

def get_human_move(board):
    """Get a valid move from the human player."""
    while True:
        try:
            move = int(input("Enter your move (0-8): "))
            if 0 <= move <= 8 and board.flatten()[move] == 0:
                return move
            else:
                print("Invalid move! Try again.")
        except ValueError:
            print("Please enter a number between 0 and 8.")

# Game loop
state = env.reset()
done = False
print("Welcome to Tic-Tac-Toe! You are 'X', and the model is 'O'.")
display_board(env.board)

while not done:
    # Human turn
    human_move = get_human_move(env.board)
    _, reward, done = env.step(human_move)  # Human move
    display_board(env.board)
    if done:
        if reward == 1:
            print("Congratulations! You win!")
        elif reward == 0.5:
            print("It's a draw!")
        break

    # Model's turn
    state = env.board.flatten()
    model_move = agent.choose_action(state)
    _, reward, done = env.step(model_move)  # Model move
    print("Model's move:")
    display_board(env.board)
    if done:
        if reward == 1:
            print("Model wins! Better luck next time.")
        elif reward == 0.5:
            print("It's a draw!")
        break

Welcome to Tic-Tac-Toe! You are 'X', and the model is 'O'.

Board:
  |   |  
---------
  |   |  
---------
  |   |  
---------

Board:
  |   |  
---------
  | X |  
---------
  |   |  
---------
Model's move:

Board:
  |   |  
---------
  | X |  
---------
O |   |  
---------

Board:
X |   |  
---------
  | X |  
---------
O |   |  
---------
Model's move:

Board:
X |   |  
---------
  | X | O
---------
O |   |  
---------

Board:
X |   |  
---------
  | X | O
---------
O |   | X
---------
Congratulations! You win!
