In [1]:
# Jupyter Notebook for Training Q-learning Player in Quixo Game

import random
import numpy as np
from game import Game, Move, Player

class RandomPlayer(Player):
    def __init__(self) -> None:
        super().__init__()

    def make_move(self, game: 'Game') -> tuple[tuple[int, int], Move]:
        from_pos = (random.randint(0, 4), random.randint(0, 4))
        move = random.choice([Move.TOP, Move.BOTTOM, Move.LEFT, Move.RIGHT])
        return from_pos, move

class MyPlayer(Player):
    def __init__(self, learning_rate=0.1, discount_factor=0.9, exploration_prob=0.1) -> None:
        super().__init__()
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_prob = exploration_prob
        self.q_values = {}

    def make_move(self, game: 'Game') -> tuple[tuple[int, int], Move]:
        state = self.get_state_representation(game)
        
        if np.random.rand() < self.exploration_prob:
            from_pos = (random.randint(0, 4), random.randint(0, 4))
            move = random.choice([Move.TOP, Move.BOTTOM, Move.LEFT, Move.RIGHT])
        else:
            valid_moves = self.get_valid_moves(game)
            action_values = self.q_values.get(state, {action: 0 for action in valid_moves})
            best_action = max(action_values, key=action_values.get)
            from_pos, move = self.decode_action(best_action)

        return from_pos, move

    def get_state_representation(self, game: 'Game') -> str:
        return str(game.get_board().flatten().tolist() + [game.get_current_player()])

    def encode_action(self, from_pos: tuple[int, int], move: Move) -> str:
        return f"{from_pos[0]},{from_pos[1]},{move.value}"

    def decode_action(self, action: str) -> tuple[tuple[int, int], Move]:
        parts = action.split(',')
        return (int(parts[0]), int(parts[1])), Move(int(parts[2]))

    def get_valid_moves(self, game: 'Game') -> list[str]:
        valid_moves = []
        for x in range(5):
            for y in range(5):
                for move in [Move.TOP, Move.BOTTOM, Move.LEFT, Move.RIGHT]:
                    if game._board[x, y] == -1 or (x == 0 and move == Move.TOP) or (x == 4 and move == Move.BOTTOM) or \
                       (y == 0 and move == Move.LEFT) or (y == 4 and move == Move.RIGHT):
                        valid_moves.append(self.encode_action((x, y), move))
        return valid_moves

    def update_q_values(self, state: str, action: str, reward: int, next_state: str) -> None:
        current_q_value = self.q_values.get(state, {}).get(action, 0)
        max_next_q_value = max(self.q_values.get(next_state, {}).values(), default=0)
        new_q_value = (1 - self.learning_rate) * current_q_value + self.learning_rate * (reward + self.discount_factor * max_next_q_value)
        
        if state not in self.q_values:
            self.q_values[state] = {}
        self.q_values[state][action] = new_q_value

# Training the Q-learning player
def train_q_learning_player(player, num_episodes=1000):
    for _ in range(num_episodes):
        g = Game()
        player.q_values = {}  # Reset Q-values for each episode
        winner = g.play(player, RandomPlayer())
        if winner == 1:
            reward = 1  # Positive reward for winning
        else:
            reward = -1  # Negative reward for losing
        player.update_q_values("", "", reward, "")  # Update Q-values based on the outcome of the episode



In [2]:
#training:
my_player = MyPlayer()
train_q_learning_player(my_player, num_episodes=1000)


In [3]:
# Assuming my_player has already been created and trained in Cell 2

num_games = 100
wins = 0

for _ in range(num_games):
    g = Game()
    winner = g.play(my_player, RandomPlayer())
    if winner == 0:
        wins += 1

winning_rate = wins / num_games
print(f"Winning Rate: {winning_rate * 100:.2f}%")


Winning Rate: 51.00%
