In [1]:
import sys
import os

# Add the project root directory to the Python path
project_root = os.path.abspath("..")  # Adjust based on your folder structure
sys.path.append(project_root)

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import importlib
from collections import deque
import random
import numpy as np
import matplotlib.pyplot as plt
from agents.mcts.mcts_agent_negamax import MCTSAgent_negamax
from agents.mcts.mcts_node import MCTSNode
from tactix.utils import *
from tactix.tactixEnvironment_without_opp import TactixEnvironment
from tactix.tactixGame import TactixGame
import matplotlib.pyplot as plt
import os
from tqdm import tqdm 

importlib.reload(sys.modules['tactix.tactixGame'])
importlib.reload(sys.modules['tactix.tactixEnvironment_without_opp'])
importlib.reload(sys.modules['tactix.utils'])
importlib.reload(sys.modules['agents.mcts.mcts_agent_negamax'])

<module 'agents.mcts.mcts_agent_negamax' from '/Users/alibal/Desktop/tactix-game/agents/mcts/mcts_agent_negamax.py'>

In [3]:
random.seed(42)

In [4]:
import torch
import torch.nn as nn

class SimpleNN(nn.Module):
    def __init__(self, input_dim, hidden_dim1, hidden_dim2, output_dim, num_heads):
        super(SimpleNN, self).__init__()
        self.multihead_attention = nn.MultiheadAttention(embed_dim=input_dim, num_heads=num_heads, batch_first=True)
        self.hidden1 = nn.Linear(input_dim, hidden_dim1)
        self.hidden2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.output = nn.Linear(hidden_dim2, output_dim)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        # Input shape: (batch_size, seq_len, input_dim)
        attn_output, _ = self.multihead_attention(x, x, x)  # Self-attention
        x = self.relu(attn_output)      # Activation
        x = self.relu(self.hidden1(x))  # Hidden layer 1
        x = self.relu(self.hidden2(x))  # Hidden layer 2
        x = self.output(x)              # Output layer
        return x

# Example usage
input_dim = 64
hidden_dim1 = 128
hidden_dim2 = 128
output_dim = 20
num_heads = 4       # or 1

model = SimpleNN(input_dim, hidden_dim1, hidden_dim2, output_dim, num_heads)

# Example input: batch_size=32, input_dim=64
input_tensor = torch.randn(32, input_dim)
output = model(input_tensor)
print(output.shape)  # Should be (32, output_dim)


torch.Size([32, 20])


In [31]:
import torch
import torch.nn as nn

class DQN(nn.Module):
    """
    A DQN with a single attention layer after the input state.
    """
    def __init__(self, state_size, action_size, layer_sizes):
        super(DQN, self).__init__()
        self.state_size = state_size
        self.action_size = action_size
        
        self.input_projection = nn.Linear(self.state_size, self.state_size * 3)
        # Attention layer
        self.multihead_attention = nn.MultiheadAttention(embed_dim=state_size * 3, num_heads=3, batch_first=True)
        
        
        # Hidden layers
        layers = []
        input_dim = state_size * 3
        for hidden_size in layer_sizes:
            layers.append(nn.Linear(input_dim, hidden_size))
            layers.append(nn.ReLU())
            input_dim = hidden_size
        
        # Final layer: from the last hidden layer to the action output
        layers.append(nn.Linear(input_dim, action_size))
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """
        Forward pass through the DQN with an attention layer.
        x: Input state tensor, shape [batch_size, state_size].
        """
        # Pass through the input projection
        x = self.input_projection(x)
        # Add a sequence dimension: (batch_size, state_size) -> (batch_size, 1, state_size)
        #x = x.unsqueeze(1)
        
        # Input shape: (batch_size, state_size)
        attn_output, _ = self.multihead_attention(x, x, x)  # Self-attention

        # Remove the sequence dimension: (batch_size, 1, state_size) -> (batch_size, state_size)
        #x = attn_output.squeeze(1)
        
        # Pass through the rest of the network
        return self.network(x)  # Outputs Q-values for each action
    
model = DQN(state_size=25, action_size=125, layer_sizes=[128, 128])
input_tensor = torch.randn(64, 25)
output = model(input_tensor)
print(output.shape)  # Should be (64, 125)
# print(x.shape)
# x = x.squeeze(1)
# print(x.shape)

torch.Size([64, 125])


In [29]:
import torch
import torch.nn as nn
import torch.nn.functional as F

HAS_ATTENTION = True

class DQNWithDropoutAndResidual(nn.Module):
    def __init__(self, state_size, action_size, num_hidden_layers, hidden_dim, dropout_prob=0.5):
        super(DQNWithDropoutAndResidual, self).__init__()
        self.state_size = state_size
        self.action_size = action_size
        self.num_hidden_layers = num_hidden_layers
        self.embedding_dim = hidden_dim

        # Projection layer 
        self.projection =  nn.Linear(state_size, self.embedding_dim)

        # Attention layer
        if HAS_ATTENTION:
            # Attention - Single or Multihead
            num_heads = 4           # has to divide hidden_dim
            self.attention = nn.MultiheadAttention(embed_dim=self.embedding_dim, num_heads=num_heads, batch_first=True)
        
        # Hidden layers
        self.hidden_layers = [nn.Linear(self.embedding_dim, self.embedding_dim) for i in range(num_hidden_layers)]

        # ReLu and Dropout
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_prob)
        
        # Output layer
        self.output_layer = nn.Linear(self.embedding_dim, action_size)
    
    def forward(self, x):
        # Projection
        x = self.projection(x)

        # Attention
        if HAS_ATTENTION:
            x, _ = self.attention(x,x,x)
            x = self.relu(x)
        
        # Hidden layer 1 - no residual needed
        h1 = self.relu(self.hidden_layers[0](x))
        h1 = self.dropout(h1)

        # Rest of the hidden layers
        h = h1
        for i in range(1, self.num_hidden_layers):
            h = self.relu(self.hidden_layers[i](h + x))
            h = self.dropout(h)
        
        # Output layer
        out = self.output_layer(h)
        return out

# Example instantiation
model = DQNWithDropoutAndResidual(state_size=25, action_size=125, num_hidden_layers=3, hidden_dim=128, dropout_prob=0.5)
input_tensor = torch.rand(64,25)
output = model(input_tensor)
output.shape

torch.Size([64, 125])

In [36]:
import torch
import torch.nn as nn
import torch.nn.functional as F

HAS_ATTENTION = True

class DQNWithDropoutAndResidual(nn.Module):
    def __init__(self, state_size, action_size, layer_sizes):
        super(DQNWithDropoutAndResidual, self).__init__()
        self.state_size = state_size
        self.action_size = action_size
        self.num_hidden_layers = len(layer_sizes)
        self.embedding_dim = layer_sizes[0]

        # Projection layer 
        self.projection =  nn.Linear(state_size, self.embedding_dim)

        # Attention layer
        if HAS_ATTENTION:
            # Attention - Single or Multihead
            num_heads = 4           # has to divide hidden_dim
            self.attention = nn.MultiheadAttention(embed_dim=self.embedding_dim, num_heads=num_heads, batch_first=True)
        
        # Hidden layers
        self.hidden_layers = nn.ModuleList([nn.Linear(self.embedding_dim, self.embedding_dim) for i in range(self.num_hidden_layers)])

        # Dropout probability
        dropout_prob = 0.5

        # ReLu and Dropout
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_prob)
        
        # Output layer
        self.output_layer = nn.Linear(self.embedding_dim, action_size)
    
    def forward(self, x):
        # Projection
        x = self.projection(x)

        # Attention
        if HAS_ATTENTION:
            x, _ = self.attention(x,x,x)
            x = self.relu(x)
        
        # Hidden layer 1 - no residual needed
        h1 = self.relu(self.hidden_layers[0](x))
        h1 = self.dropout(h1)

        # Rest of the hidden layers
        h = h1
        for i in range(1, self.num_hidden_layers):
            h = self.relu(self.hidden_layers[i](h + x))
            h = self.dropout(h)
        
        # Output layer
        out = self.output_layer(h)
        return out

# Example instantiation
model = DQNWithDropoutAndResidual(state_size=25, action_size=125, layer_sizes=[128, 128, 128])
input_tensor = torch.rand(64,25)
output = model(input_tensor)
output.shape
print(model)


DQNWithDropoutAndResidual(
  (projection): Linear(in_features=25, out_features=128, bias=True)
  (hidden_layers): ModuleList(
    (0-2): 3 x Linear(in_features=128, out_features=128, bias=True)
  )
  (relu): ReLU()
  (dropout): Dropout(p=0.5, inplace=False)
  (output_layer): Linear(in_features=128, out_features=125, bias=True)
)


In [80]:
class ReplayMemory:
    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)

    def push(self, transition):
        """Store a transition (s, a, r, s')"""
        self.memory.append(transition)

    def sample(self, batch_size):
        """Sample a batch of transitions"""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

In [81]:
class DQNAgent:
    def __init__(
        self, 
        state_size, 
        action_size, 
        layer_sizes,
        lr=1e-3, 
        gamma=0.9, 
        epsilon_start=1.0, 
        epsilon_end=0.01, 
        epsilon_decay=0.999876,
        memory_capacity=10000,
        device='cpu',
        pretrained_model_path = None
    ):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.epsilon = epsilon_start
        self.epsilon_min = epsilon_end
        self.epsilon_decay = epsilon_decay
        self.device = device
        
        # Q-Networks (main + target)
        self.q_network = DQN(state_size, action_size, layer_sizes).to(self.device)
        self.target_network = DQN(state_size, action_size, layer_sizes).to(self.device)

        if pretrained_model_path:
            self.q_network.load_state_dict(torch.load(pretrained_model_path, map_location=self.device))
            print(f"Loaded pretrained model from {pretrained_model_path}")

        self.target_network.load_state_dict(self.q_network.state_dict())
        self.target_network.eval()

        # Optimizer
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=lr)

        # Replay Memory
        self.memory = ReplayMemory(capacity=memory_capacity)

    def select_action(self, state, valid_moves_mask):
        """
        Epsilon-greedy action selection with invalid move masking.
        state: shape (1, state_size)
        valid_moves_mask: shape (1, action_size) -> 1/0 for valid moves
        """
        if random.random() < self.epsilon:
            valid_indices = torch.where(valid_moves_mask[0] == 1)[0]
            action = random.choice(valid_indices.tolist())
            return action
        else:
            with torch.no_grad():
                q_values = self.q_network(state.to(self.device))  # (1, action_size)
                # Mask invalid actions by setting them to -inf
                q_values[valid_moves_mask == 0] = -float('inf')
                return q_values.argmax(dim=1).item()

    def update_target_network(self):
        """Update the target network to match the Q-network"""
        self.target_network.load_state_dict(self.q_network.state_dict())
 
    def train_step(self, batch_size):
        """Train the Q-network using one batch from experience replay."""
        if len(self.memory) < batch_size:
            return  # Not enough samples to train
        
        # Sample a batch of transitions
        transitions = self.memory.sample(batch_size)
        # transitions is a list of tuples: (state, action, reward, next_state, done)
        batch = list(zip(*transitions))

        states = torch.stack(batch[0]).to(self.device)          # shape: [batch_size, 1, state_size]
        actions = torch.stack(batch[1]).to(self.device)         # shape: [batch_size]
        rewards = torch.tensor(batch[2], dtype=torch.float32).to(self.device)  # [batch_size]
        next_states = torch.stack(batch[3]).to(self.device)     # shape: [batch_size, 1, state_size]
        #next_states_valid_moves_mask = torch.stack(batch[4]).to(self.device)  # shape: [batch_size, 1, action_size]
        dones = torch.tensor(batch[4], dtype=torch.bool).to(self.device)       # [batch_size]
        
        # Flatten states: we have [batch_size, 1, state_size] => [batch_size, state_size]
        states = states.view(states.size(0), -1)
        next_states = next_states.view(next_states.size(0), -1)

        # Flatten next_states_valid_moves_mask: [batch_size, 1, action_size] => [batch_size, action_size]
        #next_states_valid_moves_mask = next_states_valid_moves_mask.view(next_states_valid_moves_mask.size(0), -1)

        # Current Q-values
        q_values = self.q_network(states)
        # Gather Q-values for the taken actions
        # q_values shape is [batch_size, action_size], actions is [batch_size]
        q_values = q_values.gather(1, actions.unsqueeze(1)).squeeze(1)

        # Target Q-values
        with torch.no_grad():  
            #temp_next_q_values = self.target_network(next_states)
            #temp_next_q_values[next_states_valid_moves_mask == 0] = -float('inf')
            #max_next_q_values = temp_next_q_values.max(1)[0]
            max_next_q_values = self.target_network(next_states).max(1)[0]
            target_q_values = rewards + (1 - dones.float()) * self.gamma * max_next_q_values
        
        # Loss and optimization
        loss = nn.SmoothL1Loss()(q_values, target_q_values)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Decay epsilon
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

In [82]:
LAYERS = [[50, 125]]
GAMMAS = [0.95]
STATE_SIZE = 25
ACTION_SIZE = 125



class TrainAndPlot:
    def __init__(self,
                env_size = 5,
                n_episodes=100000, 
                max_t=1000, 
                batch_sizes=[64],
                layers = None, 
                gammas = None, 
                epsilon_min=0.05, 
                epsilon_max=1.0, 
                epsilon_decay=0.99995, 
                memory_capacity=50000, 
                device='cpu', 
                target_update_freq=1000, 
                lr=1e-4,
                log_interval=1000,
                mcts_iteration=1000,
                mcts_lr=1/np.sqrt(2)):
        
        self.env = TactixEnvironment(board_size=env_size)
        self.n_episodes = n_episodes
        self.max_t = max_t
        self.batch_sizes = batch_sizes
        self.layers = layers if layers else LAYERS
        self.gammas = gammas if gammas else GAMMAS
        self.epsilon_min = epsilon_min
        self.epsilon_max = epsilon_max
        self.epsilon_decay = epsilon_decay
        self.memory_capacity = memory_capacity
        self.device = device
        self.target_update_freq = target_update_freq
        self.lr = lr
        self.log_interval = log_interval
        self.mcts_iteration = mcts_iteration
        self.mcts_lr = mcts_lr

    def run_training(self, layer_structure, gamma, models_dir, batch_size, pretrained_model_path=None):
    

        
        # Create agent
        agent = DQNAgent(
            state_size=self.env.game.height ** 2,
            action_size=self.env.game.height ** 3,
            layer_sizes=layer_structure,
            lr=self.lr,
            gamma=gamma,
            epsilon_start=self.epsilon_max,
            epsilon_end=self.epsilon_min,
            epsilon_decay=self.epsilon_decay,
            memory_capacity=self.memory_capacity,
            pretrained_model_path=pretrained_model_path
        )
        
        # Logging
        rewards_log = []
        cumulative_rewards_log = []
        win_log = []
        epsilon_log = []
        total_reward = 0.0  # For cumulative tracking
        
        # Initialize variables for tracking the ultimate best model
        ultimate_best_win_rate = float('-inf')
        ultimate_best_cumulative_reward = float('-inf')
        ultimate_best_model_path = None  # Track the path of the ultimate best model
        
        
        progress_bar = tqdm(range(self.n_episodes), desc="Initializing Training...", unit="episode", dynamic_ncols=True)
        for episode in progress_bar:
            state, valid_moves_mask = self.env.reset()

            mcts_agent = MCTSAgent_negamax(player=-1, iterations=self.mcts_iteration, exploration_weight=self.mcts_lr)

            state = state.view(-1).unsqueeze(0)         # shape: [1, state_size]
            valid_moves_mask = valid_moves_mask.unsqueeze(0)  # shape: [1, action_size]

            episode_reward = 0
            done = False

            while not done:

                
                if self.env.game.current_player == -1:
                    # MCTS agent makes a move
                    curr_node = MCTSNode(self.env.game)
                    mcts_best_node = mcts_agent.best_child(curr_node)
                    
                    self.env.game = mcts_best_node.state
                    game_ended = self.env.game.getGameEnded()
                
                    if game_ended and game_ended.is_ended:
                        done = True
                        #print('MCTS lost')

                if not done:
                    self.env.state = self.env.game.getPieces()
                    state = self.env._get_observation()
                    valid_moves_mask = self.env._generate_valid_moves_mask()
                    
                    if not (state.dim() == 2 and state.size(0) == 1 and state.size(1) == self.state_size):
                        state = state.view(-1).unsqueeze(0)  # Shape: [1, state_size]
                    if not (valid_moves_mask.dim() == 2 and valid_moves_mask.size(0) == 1 and valid_moves_mask.size(1) == self.action_size):
                        valid_moves_mask = valid_moves_mask.unsqueeze(0)  # Shape: [1, action_size]
                    
                    action = agent.select_action(state, valid_moves_mask)
                    next_state, reward, done = self.env.step(action) # next_state after agent made a move -> s'
                    next_state = next_state.view(-1).unsqueeze(0)  # shape: [1, state_size]
                    #print(f"DQN got a reward:{reward}")

                    # Push to replay
                    agent.memory.push((state.cpu(), 
                                    torch.tensor(action).cpu(), 
                                    reward, 
                                    next_state.cpu(),
                                    #next_state_valid_moves_mask.cpu(), 
                                    done))

                    # Train
                    agent.train_step(batch_size)

                    
                    state = next_state
                    
                
                    episode_reward += reward

            
            # Update target
            if episode % self.target_update_freq == 0:
                agent.update_target_network()

            # Logging
            total_reward += episode_reward
            rewards_log.append(episode_reward)
            cumulative_rewards_log.append(total_reward)
            win_log.append(1 if episode_reward > 0 else 0)
            epsilon_log.append(agent.epsilon)


            if len(win_log) >= 200:
                avg_win_rate = 200.0 * np.mean(win_log[-200:])
                current_cumulative_reward = cumulative_rewards_log[-1] if cumulative_rewards_log else 0

                # Save the model only if:
                # 1. The new win rate is greater than the best win rate seen so far, or
                # 2. The new win rate equals the best win rate, but the cumulative reward is higher
                if (avg_win_rate > ultimate_best_win_rate) or (
                        avg_win_rate == ultimate_best_win_rate and current_cumulative_reward > ultimate_best_cumulative_reward):
                    ultimate_best_win_rate = avg_win_rate
                    ultimate_best_cumulative_reward = current_cumulative_reward

                    # Update the model state and name
                    ultimate_best_model_state = agent.q_network.state_dict()
                    ultimate_best_model_name = (
                        f"network_hl_{'_'.join(map(str, layer_structure))}_gamma_{gamma:.2f}_"
                        f"bs_{batch_size}_tufq_{self.target_update_freq}_mcts_iter_{mcts_iteration}_mcts_lr_{mcts_lr}_"
                        f"wr_{int(ultimate_best_win_rate)}_tr_{int(ultimate_best_cumulative_reward)}.pth"
                    )

                    
            # Print progress occasionally
            # if (episode+1) % self.log_interval == 0:  # Log interval
            #     avg_reward = np.mean(rewards_log[-100:]) if len(rewards_log) > 100 else np.mean(rewards_log)
            #     win_rate = 100.0 * np.mean(win_log[-100:]) if len(win_log) > 100 else 100.0 * np.mean(win_log)
            #     print(f"[{episode+1}/{self.n_episodes}] Layers={layer_structure}, Gamma={gamma}, "
            #         f"AvgReward(Last100)={avg_reward:.2f}, WinRate(Last100)={win_rate:.2f}%, Eps={agent.epsilon:.3f}")
            if episode % 10000 == 0 and len(win_log) >= 200:
                avg_reward = np.mean(rewards_log[-200:])
                win_rate = 100.0 * np.mean(win_log[-200:])
                progress_bar.set_description(
                    f"AvgReward={avg_reward:.2f}, WinRate={win_rate:.2f}%, "
                    f"Eps={agent.epsilon:.3f}"
                )
                
        if ultimate_best_model_state and ultimate_best_model_name:
            ultimate_best_model_path = os.path.join(models_dir, ultimate_best_model_name)
            torch.save(ultimate_best_model_state, ultimate_best_model_path)
            print(f"Ultimate best model saved: {ultimate_best_model_path}")

        return rewards_log, cumulative_rewards_log, win_log, epsilon_log, ultimate_best_win_rate, ultimate_best_cumulative_reward





    def run_experiments(self, pretrained_model_path=None):
        
        # Centralized directory setup
        base_dir = "/Users/alibal/Desktop/tactix_training"
        save_dir = os.path.join(base_dir,f"training_results_{self.env.game.height}x{self.env.game.height}_randomopponent_s'_after_agent_withattention_mcts2")
        models_dir = os.path.join(save_dir, "models")
        plots_dir = os.path.join(save_dir, "plots")

        # Ensure directories exist
        os.makedirs(models_dir, exist_ok=True)
        os.makedirs(plots_dir, exist_ok=True)
        
        results = {}  # (layer_tuple, gamma) -> (rewards_log, cumulative_rewards_log, win_log, epsilon_log)
        for batch_size in self.batch_sizes:
            for layer in self.layers:
                for gamma in self.gammas:
                    print(f"=== Training with LayerStructure={layer}, Gamma={gamma}, Batch Size={batch_size}, Epsilon(max, min)={self.epsilon_max, self.epsilon_min}, mem_cap={self.memory_capacity}, Target Update={self.target_update_freq} ===")
                    r_log, c_log, w_log, e_log, ultimate_best_win_rate, ultimate_best_cumulative_reward = self.run_training(layer, gamma, models_dir, batch_size, pretrained_model_path=pretrained_model_path)
                    results[(tuple(layer), gamma)] = (r_log, c_log, w_log, e_log)
                    
                    # Plot results for this combination
                    fig, axs = plt.subplots(3, 1, figsize=(16, 16))
                    
                    # Prepare parameter text
                    parameter_text = (
                        f"n_episodes={self.n_episodes}, max_t={self.max_t}, batch_size={batch_size},\n"
                        f"board_size = {self.env.game.height}x{self.env.game.height}, layers={layer}, gamma={gamma:.2f},\n"
                        f"epsilon_min={self.epsilon_min}, epsilon_max={self.epsilon_max}, epsilon_decay={self.epsilon_decay},\n"
                        f"memory_capacity={self.memory_capacity}, device={self.device}, target_update_freq={self.target_update_freq},\n"
                        f"lr={self.lr}, mcts_iteration={self.mcts_iteration}, mcts_lr={self.mcts_lr}"
                    )
                    
                    # 1) Rewards
                    axs[0].plot(r_log, label="Rewards")
                    rolling_avg_r = [np.mean(r_log[max(0, i-1000):i+1]) for i in range(len(r_log))]
                    axs[0].plot(rolling_avg_r, label="Average Rewards (Last 1000)")
                    axs[0].set_xlabel("Episode")
                    axs[0].set_ylabel("Reward")
                    axs[0].set_title(f"Rewards - Layers={layer}, Gamma={gamma}", fontsize=14)
                    
                    axs[0].legend()
                    axs[0].grid()
                    
                    # 2) Cumulative Rewards
                    axs[1].plot(c_log, label="Cumulative Rewards")
                    axs[1].set_xlabel("Episode")
                    axs[1].set_ylabel("Total Reward")
                    axs[1].set_title(f"Cumulative Rewards - Layers={layer}, Gamma={gamma}")
                    axs[1].legend()
                    axs[1].grid()
                    
                    # 3) Win Rate
                    rolling_win = [100.0*np.mean(w_log[max(0, i-1000):i+1]) for i in range(len(w_log))]
                    axs[2].plot(rolling_win, label="Win Rate (Last 1000 Episodes)")
                    axs[2].set_xlabel("Episode")
                    axs[2].set_ylabel("Win Rate (%)")
                    axs[2].set_title(f"Win Rate - Layers={layer}, Gamma={gamma}")
                    axs[2].legend()
                    axs[2].grid()

                

                    # Add parameter text at the top
                    fig.text(
                        0.5, 1.02,  # Position above the subplots
                        parameter_text,
                        ha='center',
                        va='bottom',
                        fontsize=9
                    )

                    # Convert the parameter_text into a single-line string for the file name
                    parameters_for_filename = (
                        f"numep_{self.n_episodes}_bs_{batch_size}_"
                        f"hl_{'_'.join(map(str, layer))}_"
                        f"gamma_{gamma:.2f}_"
                        f"mem_cap_{self.memory_capacity}_"
                        f"tufq_{self.target_update_freq}_lr_{self.lr}_"
                        f"wr_{int(ultimate_best_win_rate)}_tr_{int(ultimate_best_cumulative_reward)}"
                    )

                    # Replace any characters that are invalid in file names (e.g., colons, slashes, spaces)
                    parameters_for_filename = parameters_for_filename.replace(":", "_").replace(" ", "_").replace("/", "_").replace(".", "_")

                    # Adjust layout to leave space at the top for the parameter text
                    plt.tight_layout(rect=[0, 0, 1, 0.85])  # Leave 5% space at the top
                    plt.subplots_adjust(top=0.8)  # Adjust top space explicitly

                    # Save the plot
                    plot_name = f"{parameters_for_filename}.png"
                    plot_path = os.path.join(plots_dir, plot_name)
                    plt.savefig(plot_path, bbox_inches="tight")  # Save all elements, ensuring no clipping
                    plt.show()
                    
        return results

In [83]:
experiment_mcts2 = TrainAndPlot(n_episodes=100000, max_t=1000, batch_sizes=[128], layers= [[128,128,128]],
                                                                    gammas=[0.7], epsilon_min=0.01, epsilon_max=1.0, epsilon_decay=0.9995, 
                                                                    memory_capacity=100000, device='cpu', target_update_freq=100, lr=0.0001, log_interval=100, mcts_iteration=50, mcts_lr=0.4)
results_mcts = experiment_mcts2.run_experiments();

=== Training with LayerStructure=[128, 128, 128], Gamma=0.7, Batch Size=128, Epsilon(max, min)=(1.0, 0.01), mem_cap=100000, Target Update=100 ===


Initializing Training...:   0%|          | 199/100000 [00:17<2:23:31, 11.59episode/s]


NameError: name 'mcts_iteration' is not defined