In [57]:
import random
from collections import deque
import numpy as np
import copy
import tensorflow as tf
from tqdm import tqdm

In [79]:
class checkBoard:
    board = np.zeros(9)
    available_places = [0, 1, 2, 3, 4, 5, 6, 7, 8]

    def reset_board(self):
        self.board = np.zeros(9)
        self.available_places = [0, 1, 2, 3, 4, 5, 6, 7, 8]

    def board_state(self):
        return np.array(self.board).reshape(1, -1)

    def check_winner(self, board):
        wins = [
            [0, 1, 2], [3, 4, 5], [6, 7, 8],  # rows
            [0, 3, 6], [1, 4, 7], [2, 5, 8],  # columns
            [0, 4, 8], [2, 4, 6]              # diagonals
        ]
        for a, b, c in wins:
            if self.board[a] == self.board[b] == self.board[c] and self.board[a] in (1, 2):
                return board[a]
        return None

    def game_end_state(self):
        no_available_spaces = (len(self.available_places) == 0)
        game_state = {}

        game_state["winner"] = self.check_winner(self.board)
        game_state["game_over"] = no_available_spaces or game_state["winner"]
        return game_state

    def print_board(self):
        print("\n")
        for i in range(0, 9, 3):
            row = self.board[i:i+3]
            print(' | '.join({1:"O", 2:"X"}[cell] if cell in (1, 2) else ' ' for cell in row))
            if i < 6:
                print('---------')


    def move(self, move, current_player = "X"):
        if move not in self.available_places:
            print("Invalid move. Try again.")
            return False
        else:
            self.board[move] = current_player
            self.available_places.remove(move)
            return True

    def move_random(self, current_player = "O"):
        if not self.available_places:
            return None
        move_place = random.choice(self.available_places)
        self.move(move_place, current_player)
        return move_place

In [88]:
checkBoardX = checkBoard()

wins = {1:0,2:0,None:0}
for i in tqdm(range(100)):
    checkBoardX.reset_board()
    turn = random.choice([True, False])
    game_state = checkBoardX.game_end_state()
    while not(game_state["game_over"]):
        turn = not(turn)
        if(turn):
            # print(checkBoardX.available_places)
            # move = choose_action(dqn_model, checkBoardX, 0)
            # print(move)
            # checkBoardX.move(move, 2)
            checkBoardX.move_random(2)
        else:
            checkBoardX.move_random(1)
        game_state = checkBoardX.game_end_state()
    wins[game_state["winner"]]+=1
print(wins)

100%|██████████| 100/100 [00:00<00:00, 12058.14it/s]

{1: 48, 2: 41, None: 11}





In [50]:
NUM_ACTIONS = 9
STATE_SIZE = 9
EMPTY = 0
PLAYER = 2
OPPONENT = 1

# Hyperparameters
GAMMA = 0.99
EPSILON = 0.1
ALPHA = 0.001
BATCH_SIZE = 32
MEMORY_SIZE = 10000

In [89]:
def create_dqn_model(input_shape=(9,), num_actions=9):
    model = models.Sequential([
        layers.Input(shape=input_shape),
        layers.Dense(64, activation='relu'),
        layers.Dense(64, activation='relu'),
        layers.Dense(num_actions, activation='linear')  # Q-values for each action
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  loss='mse')
    return model

In [52]:
def remember(memory, state, action, reward, next_state, done):
    memory.append((np.array(state).reshape(1, -1), action, reward, np.array(next_state).reshape(1, -1), done))

def create_data(MEMORY_SIZE):
    memory = deque(maxlen=MEMORY_SIZE)
    for i in range(MEMORY_SIZE):
        checkBoardGame = checkBoard()
        checkBoardGame.reset_board()
        game_state = checkBoardGame.game_end_state()
        turn = True

        while not(game_state["game_over"]):
            last_checkBoardState = copy.deepcopy(checkBoardGame.board_state()[0])
            turn = not(turn)
            move_made = None
            if(turn):
                move_made = checkBoardGame.move_random(2)
            else:
                checkBoardGame.move_random(1)
            game_state = checkBoardGame.game_end_state()
            if(move_made != None):
                remember(memory, last_checkBoardState, move_made, game_state["reward_player_two"], copy.deepcopy(checkBoardGame.board_state()[0]), game_state["game_over"])

    # checkBoardGame.print_board()
    # print(game_state["winner"])
    return memory

In [53]:
def train_dqn(model, memory, target_model):
    if len(memory) < BATCH_SIZE:
        return

    batch = random.sample(memory, BATCH_SIZE)
    states, actions, rewards, next_states, dones = zip(*batch)

    states = np.array(states).reshape(BATCH_SIZE, -1)
    next_states = np.array(next_states).reshape(BATCH_SIZE, -1)

    targets = model.predict(states, verbose=0)
    next_q = target_model.predict(next_states, verbose=0)

    for i in range(BATCH_SIZE):
        if dones[i]:
            targets[i][actions[i]] = rewards[i] #Subtract 1 from actions[i] since it's using 1-based indexing
        else:
            valid_next_actions = [a for a in range(NUM_ACTIONS) if next_states[i][a] == EMPTY]
            #Subtract 1 from the values in valid_next_actions
            valid_next_actions = [a for a in valid_next_actions]
            #Check if valid_next_actions is empty
            if not valid_next_actions:
                max_future_q = 0 # or any default value if there are no valid actions
            else:
                max_future_q = max(next_q[i][a] for a in valid_next_actions)
            targets[i][actions[i]] = rewards[i] + GAMMA * max_future_q #Subtract 1 from actions[i]

    model.fit(states, targets, epochs=1, verbose=0)

In [54]:
def choose_action(model, board, epsilon=EPSILON):
    if np.random.rand() < epsilon:
        return random.choice(board.available_places)

    q_values = model.predict(board.board_state(), verbose=0)[0]
    masked_q = np.full(NUM_ACTIONS, -np.inf)
    for a in board.available_places:
        masked_q[a] = q_values[a]
    return np.argmax(masked_q)

In [55]:
dqn_model = create_dqn_model()
target_model = create_dqn_model()
memory = create_data(MEMORY_SIZE)

for i in tqdm(range(100)):
    train_dqn(dqn_model, memory, target_model)
    target_model.set_weights(dqn_model.get_weights())

100%|██████████| 100/100 [00:37<00:00,  2.69it/s]


In [56]:
checkBoardX = checkBoard()

wins = [0,0,0]
pbar = tqdm(range(1000))

for i in pbar:
    checkBoardX.reset_board()
    turn = random.choice([True, False])
    game_state = checkBoardX.game_end_state()
    while not(game_state["game_over"]):
        turn = not(turn)
        if(turn):
            # print(checkBoardX.available_places)
            move = choose_action(dqn_model, checkBoardX, 0)
            # print(move)
            checkBoardX.move(move, 2)
            # checkBoardX.move_random(2)
        else:
            checkBoardX.move_random(1)
        game_state = checkBoardX.game_end_state()
    wins[game_state["winner"]] += 1
    pbar.set_postfix({'wins': wins})
print("\nPlayer 1 :" , wins[1])
print("Player 2 :" , wins[2])
print("Tie :" , wins[0])

100%|██████████| 1000/1000 [09:02<00:00,  1.84it/s, wins=[285, 381, 334]]


Player 1 : 381
Player 2 : 334
Tie : 285



