#### Results: two-agent TicTacToe environment

Link: https://pettingzoo.farama.org/environments/classic/tictactoe/

##### Import libraries

In [12]:
import numpy as np
from pettingzoo.classic import tictactoe_v3
from ctransformers import AutoModelForCausalLM
import os
import numpy as np
import re
import requests
import time

##### Selected models

In [13]:
# LLM API details (Modify if needed)
# LLM_API_URL is the chat-completions endpoint that query_llm() POSTs to.
LLM_API_URL = "http://localhost:1234/v1/chat/completions"  # Change to your LM Studio API URL
# MODEL_NAME is sent as the "model" field of every request; keep exactly one
# of the assignments below uncommented to choose which model plays.
#MODEL_NAME ="Mistral-Nemo-Instruct-2407-GGUF"
MODEL_NAME = "Phi-4-mini-instruct-GGUF"
#MODEL_NAME ="Qwen2-0.5B-Instruct-GGUF"

##### Creating the instruction

In [14]:
# System prompt sent as the "system" message of every request (see query_llm).
# NOTE(review): the last sentence asks the model to think step-by-step yet
# output only the number. query_llm caps replies at 8 tokens and stops at the
# first newline, so any reasoning the model writes out is truncated before the
# move appears — confirm this wording is intended.
instruction = """
You are an expert Tic-Tac-Toe player. You must always follow the best strategy to win the game or block your opponent from winning.

Use this reasoning process:

1. Check if YOU can win on this turn. If yes, make that move.
2. If not, check if your OPPONENT could win on their next move. If yes, block it.
3. Otherwise, choose the best available strategic position (like center or corners).

Output ONLY the number of the chosen move (0–8) based on your decision.

Think step-by-step before giving the final number. But output only the number, without explanation.
"""

##### Creating the prompt

In [15]:
def build_prompt(board_array, valid_moves, agent_name):
    """Render the board and the legal moves as a text prompt for the LLM.

    PettingZoo's ``tictactoe_v3`` observation is agent-relative: plane 0 holds
    the marks of the agent that is observing and plane 1 holds the opponent's
    marks. The symbols are therefore chosen from ``agent_name`` rather than
    hard-coding plane 0 as "X"; otherwise player_2 would be shown its own
    pieces as "X" while being told it plays "O".

    Args:
        board_array: 3x3x2 array-like taken from the observation of
            ``agent_name`` (plane 0 = own marks, plane 1 = opponent marks).
        valid_moves: Iterable of legal action indices (list or numpy array).
        agent_name: "player_1" plays X; any other name plays O.

    Returns:
        The prompt text. Empty cells are shown as their action index (0-8),
        numbered row by row in the order the cells are iterated.
    """
    current_player = "X" if agent_name == "player_1" else "O"
    opponent = "O" if current_player == "X" else "X"

    rows = []
    cell_num = 0
    for row in board_array:
        row_str = ""
        for cell in row:
            if cell[0] == 1:
                # Plane 0 is always the observing agent's own mark.
                row_str += current_player
            elif cell[1] == 1:
                row_str += opponent
            else:
                row_str += str(cell_num)
            cell_num += 1
        rows.append(row_str)
    board_str = "\n".join(rows)

    # Accept plain lists as well as numpy arrays.
    legal_moves = np.asarray(valid_moves).tolist()

    # The prompt is assembled line by line (not from an indented triple-quoted
    # string) so that every board row starts in the same column; with the
    # indented literal only the first row was indented and the grid columns
    # no longer lined up for the model.
    prompt_lines = [
        f"You are playing as {current_player}. Your opponent is {opponent}.",
        "",
        "This is the current Tic-Tac-Toe board. The numbers indicate empty positions:",
        "",
        board_str,
        "",
        f"Valid move positions: {legal_moves}",
        "",
        "Think step-by-step:",
        "- Can you win this turn? Play that move.",
        "- Can your opponent win next turn? Block it.",
        "- Otherwise, choose the best long-term position.",
        "",
        "Now make your move by writing only the position number (0–8).",
    ]
    return "\n".join(prompt_lines)

##### Calling the LLM

In [16]:

def query_llm(instruction, prompt, timeout=120):
    """Send one chat-completion request to the LLM server and return its reply.

    Args:
        instruction: Text sent as the "system" message.
        prompt: Text sent as the "user" message.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.post`` can block forever if the server stalls,
            which would hang the whole notebook.

    Returns:
        The stripped reply text, or "" if the request failed or the response
        did not have the expected shape (the caller then falls back to a
        random move).
    """
    payload = {
        "model": MODEL_NAME,
        "messages": [
            {"role": "system", "content": instruction},
            {"role": "user", "content": prompt}
        ],
        # Deterministic, very short reply: only the move digit is wanted.
        "temperature": 0.0,
        "max_tokens": 8,
        "stop": ["\n"]
    }

    try:
        response = requests.post(LLM_API_URL, json=payload, timeout=timeout)
        response.raise_for_status()
        content = response.json()["choices"][0]["message"]["content"]
    except requests.RequestException as e:
        # Connection problems, timeouts, HTTP error statuses, invalid JSON.
        print(f" LLM call failed: {e}")
        return ""
    except (KeyError, IndexError, TypeError, ValueError) as e:
        # The server answered, but not with the expected chat-completion layout.
        print(f" LLM call failed: unexpected response format ({e!r})")
        return ""

    # A missing/null content field is treated like an empty reply.
    return content.strip() if isinstance(content, str) else ""


##### Defining the move of the agent

In [17]:

def parse_llm_output(output, valid_moves):
    """Extract a legal move from the LLM reply, or fall back to a random one.

    Every standalone digit in the reply is considered, in order, and the first
    one that is a legal move is returned. (Looking only at the first digit
    discarded replies such as "7 or 8" even when 8 was legal.)

    Args:
        output: Raw reply text from the LLM (may be empty).
        valid_moves: Non-empty sequence/array of legal action indices.

    Returns:
        An ``int`` that is an element of ``valid_moves``.
    """
    text = (output or "").strip()
    candidates = [int(digit) for digit in re.findall(r"\b(\d)\b", text)]

    for move in candidates:
        if move in valid_moves:
            return move

    if candidates:
        print(f"LLM chose invalid move: {candidates[0]}, not in valid_moves {valid_moves}")
    else:
        print(f"LLM output not understood: {output}")

    # Fallback to random valid move
    fallback = int(np.random.choice(valid_moves))
    print(f"Fallback to random valid move: {fallback}")
    return fallback


##### Defining the agent class with action

In [18]:

class LLM_Agent:
    """Tic-Tac-Toe player that asks the configured LLM for each move."""

    def __init__(self, name="LLM"):
        # The name is only used to tag this agent's log lines.
        self.name = name

    def get_action(self, observation, agent_name):
        """Pick an action for ``agent_name`` given its current observation.

        The observation's "observation" entry is reshaped to 3x3x2 and its
        "action_mask" entry is turned into the list of legal action indices;
        both are put into a prompt, the LLM is queried, and its reply is
        parsed into a legal move (parse_llm_output falls back to a random
        legal move when the reply is unusable).
        """
        grid = observation["observation"].reshape(3, 3, 2)
        legal_moves = np.flatnonzero(observation["action_mask"])

        llm_reply = query_llm(instruction, build_prompt(grid, legal_moves, agent_name))
        chosen = parse_llm_output(llm_reply, legal_moves)

        print(f"[{self.name}] Chosen action: {chosen}\n")
        return chosen



##### Running the game function -- using rewards function from environment -- this is not working as expected.

In [None]:
# Prints the final board in a readable format, one text line per board row.
def print_final_board(board):
    """Print a 3x3x2 board: "X" where plane 0 is set, "O" where plane 1 is set.

    Cells with neither plane set are printed as a single space.
    """
    for row in board:
        symbols = []
        for cell in row:
            if cell[0] == 1:
                symbols.append("X")
            elif cell[1] == 1:
                symbols.append("O")
            else:
                symbols.append(" ")  # No marker for empty cells
        print("".join(symbols))

def run_game():
    """Play one LLM-vs-LLM game and report the result from the environment's rewards.

    Each agent's reward is read from ``env.last()`` on that agent's terminal
    turn. Reading ``env.rewards`` after the loop always produced 0 because the
    environment drops an agent's entries once the agent has taken its final
    ``None`` step, so the ``.get(agent, 0)`` default was all that remained.

    The board that is printed is normalised to player_1's point of view
    (plane 0 = X): observations are agent-relative, so an observation taken
    for player_2 has its two planes swapped before printing.
    """
    # Set up the environment
    env = tictactoe_v3.env(render_mode="human")
    env.reset(seed=42)

    agent_1 = LLM_Agent(name="Mistral_X")
    agent_2 = LLM_Agent(name="Mistral_O")
    agents = {"player_1": agent_1, "player_2": agent_2}
    all_agents = env.possible_agents[:]  # Always includes both

    move_history = []
    rewards = {agent: 0 for agent in all_agents}
    last_obs = None
    last_obs_agent = None
    start_time = time.time()

    try:
        # Randomize first move if X starts
        if env.agent_selection == "player_1":
            obs, _, _, _, _ = env.last()
            valid_moves = np.flatnonzero(obs["action_mask"])
            first_action = int(np.random.choice(valid_moves))
            env.step(first_action)
            move_history.append(("player_1", first_action))

        # Main game loop
        for agent_name in env.agent_iter():
            obs, reward, terminated, truncated, info = env.last()
            if obs is not None:
                last_obs, last_obs_agent = obs, agent_name

            if terminated or truncated:
                # Terminal turn for this agent: this is the only point where
                # its final reward is still available.
                rewards[agent_name] = reward
                action = None
            else:
                action = agents[agent_name].get_action(obs, agent_name)
                move_history.append((agent_name, action))

            env.step(action)

        elapsed_time = time.time() - start_time
    finally:
        # Always release the render window, even if a turn raised.
        env.close()

    # Get final board, expressed from player_1's perspective (plane 0 = X)
    final_obs = last_obs["observation"].reshape(3, 3, 2)
    if last_obs_agent != "player_1":
        final_obs = final_obs[:, :, ::-1]

    print("🏁 Game Over!")
    print("\nFinal Board:")
    print_final_board(final_obs)

    print(f"\nGame duration: {elapsed_time:.2f} seconds")
    print("Final Rewards:", rewards)

    # Determine the winner from the rewards captured at termination
    if rewards["player_1"] == 1:
        print("🏆 Winner: Mistral_X (player_1)")
    elif rewards["player_2"] == 1:
        print("🏆 Winner: Mistral_O (player_2)")
    else:
        print("🤝 It's a draw!")

    print("\nMove History:")
    for i, (agent, move) in enumerate(move_history):
        print(f"Turn {i+1}: {agent} → {move}")





##### Updated version of running the game, manually assigning the reward to the winning agent

In [19]:
# Prints the final board, one text line per board row.
def print_final_board(board):
    """Print a 3x3x2 board as text.

    A cell shows "X" when its plane-0 value is 1, otherwise "O" when its
    plane-1 value is 1, otherwise a single space.
    """
    for row in board:
        line = "".join(
            "X" if cell[0] == 1 else ("O" if cell[1] == 1 else " ")
            for cell in row
        )
        print(line)

# Detects the winner directly from the board planes.
def detect_winner(board):
    """Return the label of the player owning a complete line, or None.

    Args:
        board: 3x3x2 array-like where ``board[r, c, 0] == 1`` marks a
            player_1 (X) cell and ``board[r, c, 1] == 1`` marks a player_2 (O)
            cell; a cell with both planes 0 is empty.

    Returns:
        "player_1", "player_2", or None when neither has three in a line.

    Each plane is checked on its own. Collapsing the planes with
    ``argmax(axis=2)`` is not safe: an empty cell ``[0, 0]`` also yields 0,
    so empty cells were counted as player_1 marks and an empty or partly
    filled line could be reported as a player_1 win.
    """
    board = np.asarray(board)
    for plane, label in [(0, "player_1"), (1, "player_2")]:
        marks = board[:, :, plane] == 1
        # 3 rows, 3 columns, main diagonal, anti-diagonal.
        lines = [*marks, *marks.T, np.diag(marks), np.diag(np.fliplr(marks))]
        if any(line.all() for line in lines):
            return label
    return None

def run_game():
    """Play one LLM-vs-LLM game and report the winner found on the final board.

    player_1 (X) opens with a random legal move; every later move comes from
    the LLM agents. After the game the final board is inspected with
    ``detect_winner`` and rewards of +1 / -1 are assigned manually (0 / 0 for
    a draw).

    Observations are agent-relative (plane 0 = the observing agent's marks),
    while ``detect_winner`` and ``print_final_board`` treat plane 0 as
    player_1 / X. The last observation is therefore converted to player_1's
    point of view before it is evaluated; without this a game whose last
    observation belongs to player_2 is reported with the players swapped.
    """
    env = tictactoe_v3.env(render_mode="human")
    try:
        env.reset(seed=42)

        # Display names are fixed labels; they do not follow MODEL_NAME.
        agent_1 = LLM_Agent(name="Mistral_X")
        agent_2 = LLM_Agent(name="Mistral_O")
        agents = {"player_1": agent_1, "player_2": agent_2}
        all_agents = env.possible_agents[:]
        move_history = []

        # Randomize opening move
        if env.agent_selection == "player_1":
            obs, _, _, _, _ = env.last()
            valid_moves = np.flatnonzero(obs["action_mask"])
            first_action = int(np.random.choice(valid_moves))
            env.step(first_action)
            move_history.append(("player_1", first_action))

        start_time = time.time()
        last_obs = None
        last_obs_agent = None

        # Main game loop
        for agent_name in env.agent_iter():
            obs, reward, terminated, truncated, info = env.last()
            if obs is not None:
                # Remember whose view this is so it can be normalised later.
                last_obs, last_obs_agent = obs, agent_name

            done = terminated or truncated
            if not done:
                action = agents[agent_name].get_action(obs, agent_name)
                move_history.append((agent_name, action))
            else:
                action = None

            env.step(action)

        end_time = time.time()
        elapsed_time = end_time - start_time

        # Manually check winner on a board expressed from player_1's view
        final_board = last_obs["observation"].reshape(3, 3, 2)
        if last_obs_agent != "player_1":
            final_board = final_board[:, :, ::-1]  # swap own/opponent planes
        winner = detect_winner(final_board)

        rewards = {agent: 0 for agent in all_agents}
        if winner:
            rewards[winner] = 1
            loser = [a for a in all_agents if a != winner][0]
            rewards[loser] = -1

        # Final Output
        print("🏁 Game Over!")
        print("\nFinal Board:")
        print_final_board(final_board)

        print(f"\n⏱️ Game duration: {elapsed_time:.2f} seconds")
        print("🎯 Final Rewards:", rewards)

        if winner:
            print(f"🏆 Winner: {agents[winner].name} ({winner})")
        else:
            print("🤝 It's a draw!")

        print("\n Move History:")
        for i, (agent, move) in enumerate(move_history):
            print(f"Turn {i+1}: {agent} → {move}")

        # NOTE: nothing here waits for a key press; the window is closed
        # right after this message by the ``finally`` clause below.
        print("\nPress any key in the game window to close...")
    finally:
        env.close()  # Clean up environment window, even if the game raised

##### Playing the game Mistral Nemo 2407 Instruct

In [11]:
# Play one full game; the model used is whatever MODEL_NAME is set to in the
# "Selected models" cell at the time this cell runs.
if __name__ == "__main__":
    run_game()

[Mistral_O] Chosen action: 2

[Mistral_X] Chosen action: 3

[Mistral_O] Chosen action: 4

[Mistral_X] Chosen action: 6

🏁 Game Over!

Final Board:
X O
XO 
X  

⏱️ Game duration: 15.60 seconds
🎯 Final Rewards: {'player_1': 1, 'player_2': -1}
🏆 Winner: Mistral_X (player_1)

 Move History:
Turn 1: player_1 → 0
Turn 2: player_2 → 2
Turn 3: player_1 → 3
Turn 4: player_2 → 4
Turn 5: player_1 → 6

Press any key in the game window to close...


##### Playing the game Phi-4 mini instruct

In [None]:
# Play one full game; the model used is whatever MODEL_NAME is set to in the
# "Selected models" cell at the time this cell runs.
if __name__ == "__main__":
    run_game()

[Mistral_O] Chosen action: 5

[Mistral_X] Chosen action: 6

[Mistral_O] Chosen action: 2

[Mistral_X] Chosen action: 3

LLM output not understood: Based on a strategic Tic-Tac
allback to random valid move: 7
[Mistral_O] Chosen action: 7

LLM chose invalid move: 7, not in valid_moves [0 1 8]
allback to random valid move: 8
[Mistral_X] Chosen action: 8

LLM chose invalid move: 7, not in valid_moves [0 1]
allback to random valid move: 1
[Mistral_O] Chosen action: 1

LLM chose invalid move: 7, not in valid_moves [0]
allback to random valid move: 0
[Mistral_X] Chosen action: 0

🏁 Game Over!

Final Board:
XOO
XXO
XOX

⏱️ Game duration: 24.71 seconds
🎯 Final Rewards: {'player_1': 1, 'player_2': -1}
🏆 Winner: Mistral_X (player_1)

 Move History:
Turn 1: player_1 → 4
Turn 2: player_2 → 5
Turn 3: player_1 → 6
Turn 4: player_2 → 2
Turn 5: player_1 → 3
Turn 6: player_2 → 7
Turn 7: player_1 → 8
Turn 8: player_2 → 1
Turn 9: player_1 → 0

Press any key in the game window to close...


: 