In [7]:
import numpy as np
import random

# Define the TicTacToe game
class TicTacToe:
    def __init__(self):
        self.board = np.zeros((3, 3))  # 0 for empty, 1 for player 1, -1 for player 2
        self.current_player = 1  # Player 1 starts

    def reset(self):
        self.board = np.zeros((3, 3))
        self.current_player = 1

    def make_move(self, row, col):
        if self.board[row][col] == 0:
            self.board[row][col] = self.current_player
            return True
        return False

    def check_winner(self):
        for player in [1, -1]:
            if any(np.all(self.board[i, :] == player) for i in range(3)) or \
               any(np.all(self.board[:, i] == player) for i in range(3)) or \
               np.all(np.diag(self.board) == player) or \
               np.all(np.diag(np.fliplr(self.board)) == player):
                return player
        if np.all(self.board != 0):
            return 0  # Draw
        return None  # Game continues

    def available_moves(self):
        return [(i, j) for i in range(3) for j in range(3) if self.board[i][j] == 0]

    def switch_player(self):
        self.current_player *= -1  # Switch between 1 and -1

    def print_board(self):
        print(self.board)

# Define the Q-learning agent
class QLearningAgent:
    def __init__(self, learning_rate=0.1, discount_factor=0.9, exploration_rate=1.0, exploration_decay=0.99):
        self.q_table = {}
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate
        self.exploration_decay = exploration_decay

    def get_q_value(self, state, action):
        return self.q_table.get((str(state), action), 0.0)

    def choose_action(self, state, available_moves):
        if random.uniform(0, 1) < self.exploration_rate:
            return random.choice(available_moves)  # Explore
        else:
            q_values = [self.get_q_value(state, move) for move in available_moves]
            max_q_value = max(q_values)
            max_actions = [move for move, q in zip(available_moves, q_values) if q == max_q_value]
            return random.choice(max_actions)  # Exploit

    def learn(self, state, action, reward, next_state, available_moves):
        old_value = self.get_q_value(state, action)
        future_rewards = max([self.get_q_value(next_state, a) for a in available_moves])
        new_value = old_value + self.learning_rate * (reward + self.discount_factor * future_rewards - old_value)
        self.q_table[(str(state), action)] = new_value

    def decay_exploration(self):
        self.exploration_rate *= self.exploration_decay

# Training function
def train_agent(episodes=10000):
    agent = QLearningAgent()
    game = TicTacToe()
    
    for episode in range(episodes):
        game.reset()
        state = game.board.copy()
        done = False
        
        while not done:
            available_moves = game.available_moves()
            action = agent.choose_action(state, available_moves)
            game.make_move(*action)
            winner = game.check_winner()

            if winner == 1:  # Player 1 wins
                reward = 1
                done = True
            elif winner == -1:  # Player 2 wins
                reward = -1
                done = True
            elif winner == 0:  # Draw
                reward = 0.5
                done = True
            else:
                reward = 0

            next_state = game.board.copy()
            agent.learn(state, action, reward, next_state, available_moves)  # Pass available_moves here
            state = next_state
            game.switch_player()

        agent.decay_exploration()
    return agent

# Testing function
def test_agent(agent, test_episodes=1000):
    game = TicTacToe()
    wins, draws, losses = 0, 0, 0

    for episode in range(test_episodes):
        game.reset()
        state = game.board.copy()
        done = False
        
        while not done:
            available_moves = game.available_moves()
            action = agent.choose_action(state, available_moves)
            game.make_move(*action)
            winner = game.check_winner()

            if winner == 1:  # Agent wins
                wins += 1
                done = True
            elif winner == -1:  # Random player wins
                losses += 1
                done = True
            elif winner == 0:  # Draw
                draws += 1
                done = True
            else:
                reward = 0

            next_state = game.board.copy()
            state = next_state
            game.switch_player()

    print(f"Wins: {wins}, Losses: {losses}, Draws: {draws}")

# Running the training and testing
trained_agent = train_agent(episodes=10000)
test_agent(trained_agent)


Wins: 1000, Losses: 0, Draws: 0
