#### Results two agents environment TicTacToe 

Link: https://pettingzoo.farama.org/environments/classic/tictactoe/

##### Import libraries

In [1]:
import numpy as np
from pettingzoo.classic import tictactoe_v3
from ctransformers import AutoModelForCausalLM
import os
import numpy as np
import re
import requests
import time
import json

##### Selected models

In [3]:
# LLM API details (Modify if needed)
LLM_API_URL = "http://localhost:1234/v1/chat/completions"  # Change to your LM Studio API URL
MODEL_NAME ="Mistral-Nemo-Instruct-2407-GGUF"
MODEL_NAME = "Phi-4-mini-instruct-GGUF"


##### Creating instruction

In [2]:
instruction = """
You are an expert Tic-Tac-Toe player. You must always follow the best strategy to win the game or block your opponent from winning.

Use this reasoning process:

1. Check if YOU can win on this turn. If yes, make that move.
2. If not, check if your OPPONENT could win on their next move. If yes, block it.
3. Otherwise, choose the best available strategic position (like center or corners).

Output ONLY the number of the chosen move (0–8) based on your decision.

Think step-by-step before giving the final number. But output only the number, without explanation.
"""

##### Function to create user prompt

In [3]:
def build_prompt(board_array, valid_moves, agent_name):
    board_str = ""
    cell_num = 0
    for row in board_array:
        row_str = ""
        for cell in row:
            if cell[0] == 1:
                row_str += "X"
            elif cell[1] == 1:
                row_str += "O"
            else:
                row_str += str(cell_num)
            cell_num += 1
        board_str += row_str + "\n"

    current_player = "X" if agent_name == "player_1" else "O"
    opponent = "O" if current_player == "X" else "X"

    prompt = f"""
    You are playing as {current_player}. Your opponent is {opponent}.

    This is the current Tic-Tac-Toe board. The numbers indicate empty positions:

    {board_str}

    Valid move positions: {valid_moves.tolist()}

    Think step-by-step:
    - Can you win this turn? Play that move.
    - Can your opponent win next turn? Block it.
    - Otherwise, choose the best long-term position.

    Now make your move by writing only the position number (0–8).
    """
    return prompt.strip()

##### Function query the LLM

In [4]:

def query_llm(model, api_url, agent_name, instruction, prompt):
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": instruction},
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.0,
        "max_tokens": 8,
        "stop": ["\n"]
    }

    try:
        # Added debug prints to check the payload and API URL
        #print(f"\n[{agent_name}] Sending request to: {api_url}")
        #print(f"[{agent_name}] Payload:\n{json.dumps(payload, indent=2)}")

        response = requests.post(api_url, json=payload)
        response.raise_for_status()
        raw = response.json()

        # Added debug prints to check the raw response
        #print(f"[{agent_name}] Raw response:\n{json.dumps(raw, indent=2)}")

        return raw["choices"][0]["message"]["content"].strip()

    except Exception as e:
        print(f"[{agent_name}] LLM call failed: {e}")
        return ""



##### Function move agent

In [5]:
def parse_llm_output(output, valid_moves, agent_name):
    match = re.search(r"\b(\d)\b", output.strip())
    if match:
        move = int(match.group(1))
        if move in valid_moves:
            return move
        else:
            print(f"[{agent_name}] LLM chose invalid move: {move}, not in {valid_moves}")
    else:
        print(f"[{agent_name}] LLM output not understood: '{output}'")

    fallback = int(np.random.choice(valid_moves))
    print(f"[{agent_name}] Fallback random move: {fallback}")
    return fallback


##### Defining agent class with get_action method 

In [6]:
class LLM_Agent:
    def __init__(self, name, model, api_url="http://localhost:1234/v1/chat/completions"):
        self.name = name
        self.model = model
        self.api_url = api_url
    
    def get_action(self, observation, agent_name):
        board = observation["observation"].reshape(3, 3, 2)
        valid_moves = np.flatnonzero(observation["action_mask"])

        prompt = build_prompt(board, valid_moves, agent_name)
        response = query_llm(self.model, self.api_url, self.name, instruction, prompt)

        action = parse_llm_output(response, valid_moves, self.name)
        print(f"[{self.name}] Chosen action: {action}\n")

        return action


#### Functions winner_board and final_board

In [7]:
# Use this to detect the winner 
def print_final_board(board, player_symbols):
    board_ = board.reshape(3, 3, 2)
    symbol_map = { (1,0): player_symbols["player_1"],  # X
                   (0,1): player_symbols["player_2"],  # O
                   (0,0): "." }                        # empty
    
    for row in board_:
        print("".join(symbol_map[tuple(cell)] for cell in row))
    print("\n")


# Use this function to detect the winner because the rewards from the environment are always 0
def detect_winner(board):
    board_ = board.argmax(axis=2)  # 0 = player_1 = X, 1 = player_2 = O
    for player, label in [(0, "player_1"), (1, "player_2")]:
        for i in range(3):
            if np.all(board_[i, :] == player) or np.all(board_[:, i] == player):
                return label
        if np.all(np.diag(board_) == player) or np.all(np.diag(np.fliplr(board_)) == player):
            return label
    return None

##### Function Run game with two different SLMs - agent_1: Mistral NeMo, agent_2: Phi-4 mini

In [None]:
agent_1 = LLM_Agent(
    name="Mistral_X",
    model="mistral-nemo-instruct-2407",
    api_url="http://localhost:1234/v1/chat/completions"
)

agent_2 = LLM_Agent(
    name="Phi-4_O",
    model="phi-4-mini-instruct",
    api_url="http://localhost:1234/v1/chat/completions"
)

##### Function Run game with two different SLMs - agent_1: Phi-4 mini, agent_2: Mistral NeMo

In [8]:
agent_1 = LLM_Agent(
        
    name="Phi-4_O",
    model="phi-4-mini-instruct",
    api_url="http://localhost:1234/v1/chat/completions"
)

agent_2 = LLM_Agent(
    name="Mistral_X",
    model="mistral-nemo-instruct-2407",
    api_url="http://localhost:1234/v1/chat/completions"
)

In [9]:
def run_game(render=False):
    render_mode = "human" if render else None
    env = tictactoe_v3.env(render_mode=render_mode)
    env.reset(seed=None)

    
    agents = {"player_1": agent_1, "player_2": agent_2}
    player_symbols = {"player_1": "X", "player_2": "O"}
    all_agents = env.possible_agents[:]
    move_history = []

    start_time = time.time()

    for agent_name in env.agent_iter():
        obs, reward, terminated, truncated, info = env.last()
        done = terminated or truncated

        action = agents[agent_name].get_action(obs, agent_name) if not done else None
        if action is not None:
            move_history.append((agent_name, action))

        env.step(action)

    end_time = time.time()
    elapsed_time = end_time - start_time

    final_obs = env.observe(all_agents[0])
    final_board = final_obs["observation"].reshape(3, 3, 2)
    winner = detect_winner(final_board)

    rewards = {agent: 0 for agent in all_agents}
    if winner:
        rewards[winner] = 1
        rewards[[a for a in all_agents if a != winner][0]] = -1

    print("🏁 Game Over!")
    print_final_board(final_board, player_symbols)

    print(f"⏱️ Duration: {elapsed_time:.2f} sec\nRewards: {rewards}")
    print(f"🏆 Winner: {agents[winner].name}" if winner else "🤝 Draw")

    for i, (agent, move) in enumerate(move_history):
        print(f"Turn {i+1}: {agent} → {move}")

    env.close()
    return rewards


##### Run game once to visualize the game

In [34]:
if __name__ == "__main__":
    run_game(render=True)

[Mistral_X] Chosen action: 2

[Phi-4_O] Chosen action: 5

[Mistral_X] Chosen action: 3

[Phi-4_O] Chosen action: 6

[Mistral_X] LLM chose invalid move: 5, not in [0 1 4 7 8]
[Mistral_X] Fallback random move: 1
[Mistral_X] Chosen action: 1

[Phi-4_O] Chosen action: 4

[Mistral_X] LLM chose invalid move: 3, not in [0 7 8]
[Mistral_X] Fallback random move: 7
[Mistral_X] Chosen action: 7

[Phi-4_O] LLM chose invalid move: 7, not in [0 8]
[Phi-4_O] Fallback random move: 8
[Phi-4_O] Chosen action: 8

[Mistral_X] LLM chose invalid move: 2, not in [0]
[Mistral_X] Fallback random move: 0
[Mistral_X] Chosen action: 0

🏁 Game Over!
XXX
XOO
OXO


⏱️ Duration: 29.51 sec
Rewards: {'player_1': 1, 'player_2': -1}
🏆 Winner: Mistral_X
Turn 1: player_1 → 2
Turn 2: player_2 → 5
Turn 3: player_1 → 3
Turn 4: player_2 → 6
Turn 5: player_1 → 1
Turn 6: player_2 → 4
Turn 7: player_1 → 7
Turn 8: player_2 → 8
Turn 9: player_1 → 0


##### Function Run game multiple times

In [10]:
def run_multiple_episodes(n_episodes=10):
    total_rewards = {"player_1": 0, "player_2": 0}
    wins = {"player_1": 0, "player_2": 0}
    draws = 0

    for episode in range(1, n_episodes + 1):
        print(f"\n=== 🎮 Episode {episode} ===")
        render = (episode == n_episodes)  # Show only the final game
        rewards = run_game(render=render)

        for agent, reward in rewards.items():
            total_rewards[agent] += reward
            if reward == 1:
                wins[agent] += 1

        if all(r == 0 for r in rewards.values()):
            draws += 1

    # Win rate report
    print("\n==============================")
    print(f"📊 Results after {n_episodes} episodes:")
    for agent in total_rewards:
        print(f"🏆 {agent} → Wins: {wins[agent]} | Total Score: {total_rewards[agent]}")
    print(f"🤝 Draws: {draws}")
    print("==============================\n")

##### Run Game multiple times agent_1 = Mistral NeMo and agent_2 = Phi-4 Mini

In [36]:
if __name__ == "__main__":
    run_multiple_episodes(n_episodes=10)


=== 🎮 Episode 1 ===
[Mistral_X] Chosen action: 2

[Phi-4_O] Chosen action: 5

[Mistral_X] Chosen action: 3

[Phi-4_O] Chosen action: 6

[Mistral_X] LLM chose invalid move: 5, not in [0 1 4 7 8]
[Mistral_X] Fallback random move: 1
[Mistral_X] Chosen action: 1

[Phi-4_O] Chosen action: 4

[Mistral_X] LLM chose invalid move: 3, not in [0 7 8]
[Mistral_X] Fallback random move: 0
[Mistral_X] Chosen action: 0

🏁 Game Over!
XXX
XOO
O..


⏱️ Duration: 23.03 sec
Rewards: {'player_1': 1, 'player_2': -1}
🏆 Winner: Mistral_X
Turn 1: player_1 → 2
Turn 2: player_2 → 5
Turn 3: player_1 → 3
Turn 4: player_2 → 6
Turn 5: player_1 → 1
Turn 6: player_2 → 4
Turn 7: player_1 → 0

=== 🎮 Episode 2 ===
[Mistral_X] Chosen action: 2

[Phi-4_O] Chosen action: 5

[Mistral_X] Chosen action: 3

[Phi-4_O] Chosen action: 6

[Mistral_X] LLM chose invalid move: 5, not in [0 1 4 7 8]
[Mistral_X] Fallback random move: 4
[Mistral_X] Chosen action: 4

[Phi-4_O] Chosen action: 1

[Mistral_X] LLM chose invalid move: 3, not i

In [28]:
if __name__ == "__main__":
    run_multiple_episodes(n_episodes=50)


=== 🎮 Episode 1 ===
[Mistral_X] Chosen action: 2

[Phi-4_O] Chosen action: 4

[Mistral_X] Chosen action: 5

[Phi-4_O] Chosen action: 6

[Mistral_X] Chosen action: 3

[Phi-4_O] LLM output not understood: 'Based on a strategic analysis of Tic'
[Phi-4_O] Fallback random move: 0
[Phi-4_O] Chosen action: 0

[Mistral_X] LLM chose invalid move: 3, not in [1 7 8]
[Mistral_X] Fallback random move: 8
[Mistral_X] Chosen action: 8

🏁 Game Over!
O.X
XOX
O.X


⏱️ Duration: 46.21 sec
Rewards: {'player_1': 1, 'player_2': -1}
🏆 Winner: Mistral_X
Turn 1: player_1 → 2
Turn 2: player_2 → 4
Turn 3: player_1 → 5
Turn 4: player_2 → 6
Turn 5: player_1 → 3
Turn 6: player_2 → 0
Turn 7: player_1 → 8

=== 🎮 Episode 2 ===
[Mistral_X] Chosen action: 2

[Phi-4_O] Chosen action: 4

[Mistral_X] Chosen action: 5

[Phi-4_O] Chosen action: 6

[Mistral_X] Chosen action: 3

[Phi-4_O] LLM output not understood: 'Based on a strategic analysis of Tic'
[Phi-4_O] Fallback random move: 1
[Phi-4_O] Chosen action: 1

[Mistral_X] 

##### Run Game multiple times agent_1 = Phi-4 Mini and agent_2 = Mistral NeMo

In [11]:
if __name__ == "__main__":
    run_multiple_episodes(n_episodes=50)


=== 🎮 Episode 1 ===
[Phi-4_O] Chosen action: 5

[Mistral_X] Chosen action: 3

[Phi-4_O] Chosen action: 4

[Mistral_X] Chosen action: 6

[Phi-4_O] Chosen action: 2

[Mistral_X] Chosen action: 1

[Phi-4_O] LLM output not understood: 'Based on a Tic-Tac-To'
[Phi-4_O] Fallback random move: 7
[Phi-4_O] Chosen action: 7

[Mistral_X] LLM chose invalid move: 2, not in [0 8]
[Mistral_X] Fallback random move: 8
[Mistral_X] Chosen action: 8

[Phi-4_O] LLM chose invalid move: 7, not in [0]
[Phi-4_O] Fallback random move: 0
[Phi-4_O] Chosen action: 0

🏁 Game Over!
XOX
OXX
OXO


⏱️ Duration: 54.17 sec
Rewards: {'player_1': 0, 'player_2': 0}
🤝 Draw
Turn 1: player_1 → 5
Turn 2: player_2 → 3
Turn 3: player_1 → 4
Turn 4: player_2 → 6
Turn 5: player_1 → 2
Turn 6: player_2 → 1
Turn 7: player_1 → 7
Turn 8: player_2 → 8
Turn 9: player_1 → 0

=== 🎮 Episode 2 ===
[Phi-4_O] Chosen action: 5

[Mistral_X] Chosen action: 3

[Phi-4_O] Chosen action: 4

[Mistral_X] Chosen action: 6

[Phi-4_O] Chosen action: 2

[Mi