In [None]:
import numpy as np
import random
import tensorflow as tf
from tensorflow.keras import layers

# Constants for Tic-Tac-Toe
EMPTY, PLAYER_X, PLAYER_O = 0, 1, -1
BOARD_SIZE = 9

# Turn a board into a single-row batch for the model
def board_to_input(board):
    """Return ``board`` as a NumPy array reshaped to ``(1, BOARD_SIZE)``.

    The leading axis of length 1 makes the board a batch containing a
    single sample, which is the form passed to ``model.predict``.
    """
    cells = np.array(board)
    return cells.reshape((1, BOARD_SIZE))

# Report the state of the game: a winner, still in progress, or a draw
def check_winner(board):
    """Classify ``board`` as won, in progress, or drawn.

    Returns:
        The mark found on the first completed line (rows, then columns,
        then diagonals) when three equal non-empty marks line up;
        ``0`` when no line is complete and at least one square is still
        ``EMPTY``; ``None`` when the board is full without a completed line.
    """
    lines = (
        (0, 1, 2), (3, 4, 5), (6, 7, 8),  # Rows
        (0, 3, 6), (1, 4, 7), (2, 5, 8),  # Columns
        (0, 4, 8), (2, 4, 6),             # Diagonals
    )
    for first, second, third in lines:
        mark = board[first]
        if mark == board[second] and board[second] == board[third] and board[third] != EMPTY:
            return mark

    if EMPTY in board:
        return 0
    return None

# List the squares that can still be played
def available_moves(board):
    """Return the indices of every square in ``board`` that equals ``EMPTY``, in ascending order."""
    return [
        index
        for index, cell in enumerate(board)
        if cell == EMPTY
    ]

# Place a player's mark on the board
def make_move(board, move, player):
    """Write ``player`` into ``board`` at index ``move``.

    The board is modified in place and nothing is returned. No check is
    made that the target square was empty.
    """
    board[move] = player

# Epsilon-greedy strategy for exploration, restricted to legal squares
def choose_move(model, board, epsilon=0.1):
    """Pick a move for ``board`` using an epsilon-greedy policy.

    Args:
        model: Object whose ``predict(input, verbose=0)`` returns one row of
            per-square scores for the board.
        board: Sequence of cell values; squares equal to ``EMPTY`` are legal.
        epsilon: Probability of playing a uniformly random legal move
            instead of the model's preferred one.

    Returns:
        The chosen square index as a plain ``int``.
    """
    if random.random() < epsilon:
        # Exploration: random move among the empty squares.
        return random.choice(available_moves(board))

    # Exploitation: take the best-scoring square, but only among legal ones.
    # An argmax over all outputs could select an occupied square, which the
    # caller would then have to discard and replace with a random move.
    scores = model.predict(board_to_input(board), verbose=0)[0]
    legal_moves = available_moves(board)
    if not legal_moves:
        # Full board: there is nothing to restrict to, so keep the previous
        # behaviour of returning the unmasked argmax.
        return int(np.argmax(scores))
    # max() keeps the first of several equal scores, matching argmax's tie-break.
    return max(legal_moves, key=lambda square: scores[square])

# Play one game: the model (epsilon-greedy) as X against a random O
def simulate_game(model, epsilon=0.1):
    """Play a single game and record every turn.

    Player X moves first and picks moves through ``choose_move``; player O
    always plays a random available square. If the selected square is
    already occupied, a random available square is played instead.

    Returns:
        A ``(game_history, winner)`` tuple. ``game_history`` holds one
        ``(board_before_move, move, player)`` entry per turn, and ``winner``
        is the value returned by ``check_winner`` once the game has ended
        (the winning mark, or ``None`` for a draw).
    """
    board = [EMPTY for _ in range(BOARD_SIZE)]
    game_history = []
    current_player = PLAYER_X

    while True:
        if current_player != PLAYER_X:
            # Random move for Player O
            move = random.choice(available_moves(board))
        else:
            move = choose_move(model, board, epsilon)

        # Invalid move, choose randomly
        if board[move] != EMPTY:
            move = random.choice(available_moves(board))

        # Record the position as it was *before* the move is applied.
        snapshot = board.copy()
        game_history.append((snapshot, move, current_player))
        make_move(board, move, current_player)

        winner = check_winner(board)
        # A non-zero mark means a win and None means a draw; 0 means keep playing.
        if winner is None or winner:
            return game_history, winner

        if current_player == PLAYER_O:
            current_player = PLAYER_X
        else:
            current_player = PLAYER_O

# Create the neural network model with a deeper architecture
def create_model():
    """Build and compile the move-prediction network.

    The network takes a flat board of ``BOARD_SIZE`` values, passes it
    through three ReLU dense layers (256, 128 and 64 units) and ends in a
    softmax over the ``BOARD_SIZE`` squares. It is compiled with the Adam
    optimizer, sparse categorical cross-entropy loss (integer square indices
    as labels) and an accuracy metric.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    model = tf.keras.Sequential([
        # tf.keras.Input(shape=...) replaces InputLayer(input_shape=...),
        # whose `input_shape` argument is deprecated in Keras 3.
        tf.keras.Input(shape=(BOARD_SIZE,)),
        layers.Dense(256, activation='relu'),  # Increased neurons
        layers.Dense(128, activation='relu'),
        layers.Dense(64, activation='relu'),
        layers.Dense(BOARD_SIZE, activation='softmax'),  # Output probabilities for each position
    ])
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

# Play a batch of games and keep all of them
def collect_games(model, num_games=100, epsilon=0.1):
    """Simulate ``num_games`` games and return them as a list.

    Each element is a ``(game_history, winner)`` tuple as produced by
    ``simulate_game``. Every game is kept regardless of its outcome.
    """
    def play_once():
        history, outcome = simulate_game(model, epsilon)
        return (history, outcome)

    return [play_once() for _ in range(num_games)]

# Prepare training data from games with rewards for winning and losing
def prepare_training_data(games):
    """Convert recorded games into training arrays for player X.

    Every turn played by ``PLAYER_X`` yields one sample: the board as it was
    before the move, the move X played from that board, and the reward of
    the game the turn belongs to.

    Args:
        games: Iterable of ``(game_history, winner)`` pairs, where
            ``game_history`` is a list of ``(board, move, player)`` entries.

    Returns:
        A ``(boards, next_moves, rewards)`` tuple of NumPy arrays with one
        entry per X turn. The reward is 1 when X won the game, -1 when O won
        and 0 otherwise.
    """
    boards = []
    next_moves = []
    rewards = []

    for game, winner in games:
        # The outcome is the same for every turn of a game.
        if winner == PLAYER_X:
            reward = 1
        elif winner == PLAYER_O:
            reward = -1
        else:
            reward = 0

        # Each history entry stores the board *before* its move, so the label
        # for a board is the move of the same entry. Taking the move of the
        # following entry would label X's position with the opponent's reply.
        # The final turn is included too: it carries X's winning move.
        for board, move, player in game:
            if player != PLAYER_X:
                continue
            boards.append(board.copy())
            next_moves.append(move)
            rewards.append(reward)

    return np.array(boards), np.array(next_moves), np.array(rewards)

# Train the model using both winning and losing games
def train_model_on_games(model, games):
    """Fit ``model`` on the samples extracted from ``games``.

    The per-sample rewards returned by ``prepare_training_data`` are passed
    to ``model.fit`` as sample weights, and training runs for 10 epochs.
    Training is skipped when the games produce no samples.
    """
    boards, next_moves, rewards = prepare_training_data(games)

    # Nothing to learn from (e.g. no games were played): fitting on empty
    # arrays would fail, so skip this round instead.
    if len(boards) == 0:
        print("No training samples collected; skipping training.")
        return

    # Train the model, using rewards as sample weights.
    # NOTE(review): rewards of -1 become negative sample weights, which turn
    # the weighted cross-entropy into a quantity that is unbounded below; the
    # reported loss can become negative and keep decreasing. Confirm this is
    # intended, or map rewards to non-negative weights.
    model.fit(boards, next_moves, sample_weight=rewards, epochs=10, verbose=1)

# Predict the next move for a given board state
def predict_next_move(model, board):
    """Return the model's preferred legal move for ``board``.

    Args:
        model: Object whose ``predict(input, verbose=0)`` returns one row of
            per-square scores for the board.
        board: Sequence of cell values; squares equal to ``EMPTY`` are legal.

    Returns:
        The index of the best-scoring empty square as a plain ``int``. When
        the board has no empty square, the index of the best-scoring square
        overall is returned.
    """
    input_data = board_to_input(board)
    scores = model.predict(input_data, verbose=0)[0]

    # Only empty squares are playable; an argmax over all outputs could
    # point at a square that is already taken.
    legal_moves = available_moves(board)
    if not legal_moves:
        # Full board: keep the previous behaviour of an unmasked argmax.
        return int(np.argmax(scores))
    # max() keeps the first of several equal scores, matching argmax's tie-break.
    return max(legal_moves, key=lambda square: scores[square])

# Alternate between playing games and training on them
def iterative_learning(model, iterations=5, games_per_iteration=100, test_board=None, epsilon=0.1):
    """Run several rounds of self-play followed by training.

    Args:
        model: The model to play with and to train.
        iterations: Number of play-then-train rounds.
        games_per_iteration: Games simulated in each round.
        test_board: Optional board; when given, the model's predicted move
            for it is printed after each round.
        epsilon: Exploration rate forwarded to ``collect_games``.
    """
    for iteration in range(iterations):
        print(f"\nIteration {iteration + 1}/{iterations}")

        # Play a fresh batch of games (wins and losses alike), then train on it.
        batch = collect_games(model, num_games=games_per_iteration, epsilon=epsilon)
        train_model_on_games(model, batch)

        # Without a test board there is nothing to report for this round.
        if test_board is None:
            continue

        predicted_move = predict_next_move(model, test_board)
        print(f"Test board state: {test_board}")
        print(f"Predicted next move: {predicted_move}\n")

# Example test board (can be modified), written as three rows of three squares
test_board = [
    PLAYER_X, PLAYER_O, EMPTY,
    PLAYER_X, EMPTY, PLAYER_O,
    EMPTY, EMPTY, EMPTY,
]

# Main script
if __name__ == '__main__':
    model = create_model()

    # Run the play-then-train loop with epsilon-greedy exploration and report
    # the predicted move for the example board after every round.
    iterative_learning(
        model,
        iterations=5,
        games_per_iteration=100,
        test_board=test_board,
        epsilon=0.1,
    )



Iteration 1/5
Epoch 1/10
12/12 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.0734 - loss: 0.4084
Epoch 2/10
12/12 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.1375 - loss: 0.3725
Epoch 3/10
12/12 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.1808 - loss: 0.4403
Epoch 4/10
12/12 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.1656 - loss: 0.3136
Epoch 5/10
12/12 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.1737 - loss: 0.2795
Epoch 6/10
12/12 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.1249 - loss: 0.1096
Epoch 7/10
12/12 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - accuracy: 0.1412 - loss: -0.0312
Epoch 8/10
12/12 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.1484 - loss: -0.4341
Epoch 9/10
[1m12/12[0m [32m━