In [1]:
import random
from collections import deque
import numpy as np
import copy
import tensorflow as tf
from tensorflow.keras import layers, models

In [2]:
class checkBoard:
    board = [[0, 0, 0],[0, 0, 0],[0, 0, 0]]
    available_places = [1, 2, 3, 4, 5, 6, 7, 8, 9]

    def check_winner(self, player):
    # Check rows, columns and diagonals
        for i in range(3):
            if all([cell == player for cell in self.board[i]]) or \
            all([self.board[j][i] == player for j in range(3)]):
                return True
        if all([self.board[i][i] == player for i in range(3)]) or \
        all([self.board[i][2 - i] == player for i in range(3)]):
            return True
        return False

    def toNumpyArr(self):
        return np.array(self.board).reshape(1, -1)

    def game_end_state(self):
        winner_one = self.check_winner(1)
        winner_two = self.check_winner(2)
        no_available_spaces = (len(self.available_places) == 0)
        game_state = {}
        game_state["winner"] = 1 if winner_one else 2 if winner_two else 0
        game_state["game_over"] = no_available_spaces or winner_one or winner_two
        game_state["tie"] = no_available_spaces and not winner_one and not winner_two
        game_state["reward_player_two"] = -1 if winner_one else 1 if winner_two else 0
        return game_state

    def reset_board(self):
        self.board = [[0, 0, 0],[0, 0, 0],[0, 0, 0]]
        self.available_places = [1, 2, 3, 4, 5, 6, 7, 8, 9]

    def board_state(self): # Added self to access instance variables
        return self.board, self.available_places

    def print_board(self):
        for row in self.board:
            print(" | ".join("".join(map(str, row)).replace("0"," ").replace("2","X").replace("1","O")))
            print("-" * 5)

    def move(self, move, current_player = 2):
        if not 1 <= int(move) <= 9:
            print("Invalid input. Please enter a number between 1 and 9.")
            return False

        move = int(move) - 1
        row, col = divmod(move, 3)

        if self.board[row][col] > 0:
            print("That spot is already taken. Try again.")
            return False
        else:
            self.board[row][col] = current_player
            self.available_places.remove(move + 1)
            return True

    def move_random(self, current_player ):
        if not self.available_places:
            return None
        move_place = random.choice(self.available_places)
        self.move(move_place, current_player)
        return move_place

def ext_print_board(self):
    for row in self.board:
        print(" | ".join("".join(map(str, row)).replace("0"," ").replace("2","X").replace("1","O")))
        print("-" * 5)

In [3]:
NUM_ACTIONS = 9
STATE_SIZE = 9
EMPTY = 0
PLAYER = 1
OPPONENT = -1

# Hyperparameters
GAMMA = 0.99
EPSILON = 0.1
ALPHA = 0.001
BATCH_SIZE = 3200
MEMORY_SIZE = 10000

In [5]:
def create_dqn_model(input_shape=(9,), num_actions=9):
    model = models.Sequential([
        layers.Input(shape=input_shape),
        layers.Dense(64, activation='relu'),
        layers.Dense(64, activation='relu'),
        layers.Dense(num_actions, activation='linear')  # Q-values for each action
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  loss='mse')
    return model

In [4]:
def choose_action(state, model, epsilon=EPSILON):
    valid_actions = [i for i in range(NUM_ACTIONS) if state[i] == EMPTY]
    if np.random.rand() < epsilon:
        return random.choice(valid_actions)

    q_values = model.predict(state, verbose=0)[0]
    masked_q = np.full(NUM_ACTIONS, -np.inf)
    for a in valid_actions:
        masked_q[a] = q_values[a]
    return np.argmax(masked_q)

In [6]:
def remember(memory, state, action, reward, next_state, done):
    memory.append((np.array(state).reshape(1, -1), action, reward, np.array(next_state).reshape(1, -1), done))

def create_data(MEMORY_SIZE):
    memory = deque(maxlen=MEMORY_SIZE)
    for i in range(MEMORY_SIZE):
        checkBoardGame = checkBoard()
        checkBoardGame.reset_board()
        game_state = checkBoardGame.game_end_state()
        turn = True

        while not(game_state["game_over"]):
            last_checkBoardState = copy.deepcopy(checkBoardGame.board_state()[0])
            turn = not(turn)
            move_made = None
            if(turn):
                move_made = checkBoardGame.move_random(2)
            else:
                checkBoardGame.move_random(1)
            game_state = checkBoardGame.game_end_state()
            if(move_made != None):
                remember(memory, last_checkBoardState, move_made, game_state["reward_player_two"], copy.deepcopy(checkBoardGame.board_state()[0]), game_state["game_over"])

    # checkBoardGame.print_board()
    # print(game_state["winner"])
    return memory

In [7]:
def train_dqn(model, memory, target_model):
    if len(memory) < BATCH_SIZE:
        return

    batch = random.sample(memory, BATCH_SIZE)
    states, actions, rewards, next_states, dones = zip(*batch)

    states = np.array(states).reshape(BATCH_SIZE, -1)
    next_states = np.array(next_states).reshape(BATCH_SIZE, -1)

    targets = model.predict(states, verbose=0)
    next_q = target_model.predict(next_states, verbose=0)

    for i in range(BATCH_SIZE):
        if dones[i]:
            targets[i][actions[i] -1 ] = rewards[i] #Subtract 1 from actions[i] since it's using 1-based indexing
        else:
            valid_next_actions = [a for a in range(NUM_ACTIONS) if next_states[i][a] == EMPTY]
            #Subtract 1 from the values in valid_next_actions
            valid_next_actions = [ a - 1 for a in valid_next_actions]
            #Check if valid_next_actions is empty
            if not valid_next_actions:
                max_future_q = 0 # or any default value if there are no valid actions
            else:
                max_future_q = max(next_q[i][a] for a in valid_next_actions)
            targets[i][actions[i] - 1] = rewards[i] + GAMMA * max_future_q #Subtract 1 from actions[i]

    model.fit(states, targets, epochs=1, verbose=0)

In [8]:
dqn_model = create_dqn_model()
memory = create_data(BATCH_SIZE)

In [9]:
train_dqn(dqn_model, memory, dqn_model)

In [16]:
checkBoardB = checkBoard();
checkBoardB.reset_board()
turn = False

game_state = checkBoardB.game_end_state()
while not(game_state["game_over"]):
    turn = not(turn)
    if(turn):
        dqn_model.predict(checkBoardB.toNumpyArr())
        print(max_index)
        validMove = checkBoardB.move(max_index, 2)
        if(not validMove):
            raise Exception("Invalid move")
            break
    else:
        checkBoardB.move_random(1)
    game_state = checkBoardB.game_end_state()
    checkBoardB.print_board()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
7
  |   |  
-----
  |   |  
-----
X |   |  
-----
  |   |  
-----
  |   | O
-----
X |   |  
-----
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
5
  |   |  
-----
  | X | O
-----
X |   |  
-----
  |   |  
-----
  | X | O
-----
X |   | O
-----
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
5
That spot is already taken. Try again.


Exception: Invalid move

In [18]:
checkBoardB = checkBoard();


  |   |  
-----
  |   |  
-----
  |   |  
-----


In [20]:
checkBoardB.move(1)

True

In [21]:
checkBoardB.print_board()

X |   |  
-----
  |   |  
-----
  |   |  
-----
