# In [None]:
# To run the code:
# pip install numpy

import numpy as np
import random
from collections import defaultdict

# ----- 1️⃣ Game Environment -----
def init_board():
    """Return a fresh, empty 3x3 board as a flat list of nine " " cells."""
    empty_row = " " * 9
    return list(empty_row)

def print_board(board):
    """Print the board as three rows separated by divider lines.

    Cells are read from indices 0-8 in row-major order; a blank line is
    emitted after the last row.
    """
    for row_start in (0, 3, 6):
        if row_start:
            # Divider goes between rows, never before the first one.
            print("--+---+--")
        left = board[row_start]
        middle = board[row_start + 1]
        right = board[row_start + 2]
        print(f"{left} | {middle} | {right}")
    print()

def available_actions(board):
    """Return the indices of all empty (" ") cells, in ascending order."""
    open_cells = []
    for position, mark in enumerate(board):
        if mark == " ":
            open_cells.append(position)
    return open_cells

def check_winner(board):
    """Report the outcome of the given board.

    Returns the mark ("X" or "O") that occupies a full row, column or
    diagonal; "Draw" when no line is complete and no cell is empty;
    otherwise None (game still in progress).
    """
    lines = (
        (0, 1, 2), (3, 4, 5), (6, 7, 8),   # rows
        (0, 3, 6), (1, 4, 7), (2, 5, 8),   # columns
        (0, 4, 8), (2, 4, 6),              # diagonals
    )
    for first, second, third in lines:
        mark = board[first]
        if mark == " ":
            continue
        if mark == board[second] and board[second] == board[third]:
            return mark
    return None if " " in board else "Draw"

# ----- 2️⃣ Q-Learning Setup -----
def _fresh_action_values():
    """Return the initial action values for an unseen state: nine zeros."""
    return np.zeros(9)


# Q-table: looking up an unseen key creates a zero vector with one entry
# per board cell.
Q = defaultdict(_fresh_action_values)

# Hyperparameters
alpha = 0.3            # learning rate
gamma = 0.9            # discount factor
epsilon = 1.0          # exploration rate (starts fully random)
epsilon_decay = 0.001  # amount subtracted from epsilon per training episode

def get_state(board):
    """Return the board's cells concatenated into a single string key."""
    no_separator = ""
    return no_separator.join(board)

def choose_action(state, board):
    """Pick a move for the agent using an epsilon-greedy policy.

    With probability `epsilon` a random empty cell is returned; otherwise
    the empty cell with the highest Q-value for `state` is returned (the
    lowest index wins ties).
    """
    candidates = available_actions(board)

    # Explore: ignore the Q-table entirely.
    if random.random() < epsilon:
        return random.choice(candidates)

    # Exploit: rank only the legal moves by their learned value.
    action_values = Q[state]
    return max(candidates, key=lambda cell: action_values[cell])

# ----- 3️⃣ Play One Episode -----
def play_episode(train=True):
    """Play one game: the Q-learning agent ("X") against a random "O".

    "X" always moves first and picks moves with `choose_action`; "O" picks
    uniformly among the empty cells.

    When `train` is true the Q-table is updated from the agent's point of
    view only. Each agent move (state, action) is held back until its
    consequence is known:

    * if the game continues, the move is updated once the opponent has
      replied, bootstrapping from the best legal action in the position
      the agent faces next;
    * if the game ends (by the agent's own move or by the opponent's
      reply), the move is updated with the terminal reward: +1 for an
      "X" win, -1 for an "O" win, 0 for a draw.

    This way a loss is charged to the agent's last move rather than to the
    opponent's state/action pair, which the agent never looks up.

    Args:
        train: update `Q` and decay the global `epsilon` when true.

    Returns:
        The outcome reported by `check_winner`: "X", "O" or "Draw".
    """
    global epsilon
    board = init_board()
    current_player = "X"
    # The agent's latest (state, action) still waiting for its update.
    pending = None
    winner = None

    while winner is None:
        if current_player == "X":
            state = get_state(board)
            if train and pending is not None:
                # The opponent has answered and the game goes on:
                # intermediate reward is 0, so the target is the discounted
                # value of the best legal move available now.
                prev_state, prev_action = pending
                state_values = Q[state]
                best_future = max(
                    state_values[a] for a in available_actions(board)
                )
                old_value = Q[prev_state][prev_action]
                Q[prev_state][prev_action] = old_value + alpha * (
                    gamma * best_future - old_value
                )
            action = choose_action(state, board)
            pending = (state, action)
        else:
            action = random.choice(available_actions(board))

        board[action] = current_player
        winner = check_winner(board)
        current_player = "O" if current_player == "X" else "X"

    if train:
        if pending is not None:
            # Terminal update: no future value, only the final reward.
            reward = {"X": 1, "O": -1}.get(winner, 0)
            prev_state, prev_action = pending
            old_value = Q[prev_state][prev_action]
            Q[prev_state][prev_action] = old_value + alpha * (
                reward - old_value
            )
        epsilon = max(0.1, epsilon - epsilon_decay)

    return winner

# ----- 4️⃣ Training -----
# Run 50,000 training episodes, reporting progress every 10,000.
for episode in range(50000):
    play_episode(train=True)
    completed = episode + 1
    if completed % 10000:
        # Not a reporting milestone yet.
        continue
    print(f"Episode {episode+1} completed, epsilon={epsilon:.3f}")

print("\nTraining complete!")

# ----- 5️⃣ Play Against Trained Agent -----
def agent_move(board):
    """Return the agent's greedy move for the current board.

    Only empty cells are considered, so the result is always a legal move:
    the empty cell with the highest learned Q-value for this position (the
    lowest index wins ties, which includes positions never seen in
    training, where all values are zero).

    Args:
        board: flat list of nine cells, with " " marking an empty cell.

    Returns:
        The chosen cell index as a plain int.

    Raises:
        ValueError: if the board has no empty cell.
    """
    state = get_state(board)
    valid = available_actions(board)
    if not valid:
        raise ValueError("agent_move called on a board with no empty cells")
    action_values = Q[state]
    # Rank legal moves only; occupied cells keep their initial 0 and would
    # otherwise outrank legal moves whose learned values are negative.
    return max(valid, key=lambda cell: action_values[cell])

# Interactive game: the trained agent plays "X", the human plays "O".
board = init_board()
print("\nLet's play Tic-Tac-Toe!")
print_board(board)

while True:
    # Agent move. The agent opens each round because training always had
    # "X" move first, so the Q-table only covers positions reached that way.
    agent_act = agent_move(board)
    board[agent_act] = "X"

    print_board(board)
    winner = check_winner(board)
    if winner:
        if winner == "X":
            print("Agent wins! 🤖")
        else:
            print("It's a draw.")
        break

    # Human move: keep asking until we get an in-range, empty cell.
    while True:
        raw_move = input("Enter your move (0-8): ")
        try:
            move = int(raw_move)
        except ValueError:
            print("Invalid move, try again.")
            continue
        # Explicit range check: negative indices would otherwise wrap
        # around and larger ones would raise IndexError.
        if 0 <= move < len(board) and board[move] == " ":
            break
        print("Invalid move, try again.")
    board[move] = "O"

    winner = check_winner(board)
    if winner:
        print_board(board)
        # check_winner also reports "Draw"; only an "O" line is a human win.
        if winner == "O":
            print("You win! 😄")
        else:
            print("It's a draw.")
        break
