In [None]:
import copy
import functools
import heapq
import logging
import os
import random
import time
from collections import Counter
from typing import Protocol, List, Tuple, Callable, Dict, Type, Optional

import numpy as np
import pandas as pd
from tabulate import tabulate

# Alias for a flat list of objects, each object being a tuple whose fields
# are read positionally by the code below (e.g. index 1 is used as a sort key).
SetType = List[Tuple]

def flatten_sets(nested_sets):
    """Return the distinct objects found across all groups of ``nested_sets``.

    The result is a ``set``, so repeated objects collapse into one entry and
    every object must be hashable.
    """
    unique_objects = set()
    for group in nested_sets:
        unique_objects.update(group)
    return unique_objects

def safe_deepcopy(sets: SetType) -> SetType:
    """Return a deep copy of ``sets`` that shares no mutable state with it."""
    duplicate = copy.deepcopy(sets)
    return duplicate


def swap_elements(sets: SetType, i: int, j: int, a_idx: int, b_idx: int) -> None:
    """Exchange ``sets[i][a_idx]`` with ``sets[j][b_idx]`` in place."""
    incoming = sets[j][b_idx]
    outgoing = sets[i][a_idx]
    sets[i][a_idx] = incoming
    sets[j][b_idx] = outgoing



class Optimizer(Protocol):
    """Structural interface describing the optimizer classes in this file."""

    def __init__(self, sets: List[SetType], objective_function: Callable, params: Dict):
        """Take the sets to work on, the objective callable and a parameter dict."""
        ...

    def compute(self) -> None:
        """Protocol stub with no behaviour of its own."""
        ...

    def optimize(self) -> None:
        """Protocol stub with no behaviour of its own."""
        ...

def validate_sets(original_objects: List[Tuple], final_sets: List[List[Tuple]]) -> bool:
    """Check that ``final_sets`` holds exactly the objects of ``original_objects``.

    Objects are compared as multisets, so duplicates must appear the same
    number of times on both sides. Each object is converted with ``tuple()``
    before counting, which lets list-like objects be used as keys.

    Args:
        original_objects: Flat list of the objects handed to the pipeline.
        final_sets: The partition produced by the pipeline (a list of sets).

    Returns:
        True when both sides contain the same objects with the same
        multiplicities, False otherwise.
    """
    original_counts = Counter(tuple(obj) for obj in original_objects)
    final_counts = Counter(tuple(obj) for group in final_sets for obj in group)
    return original_counts == final_counts

class GreedyOptimizer:
    """Seed optimizer that spreads a flat list of objects over ``m`` sets.

    Objects are processed in descending order of their field at index 1 and
    each one is appended to the set whose accumulated index-1 total is the
    smallest so far (ties go to the lowest set index).
    """

    def __init__(
        self,
        objects: List[Tuple],
        m: int,
        objective_function: Callable,
        params: Optional[Dict] = None,
    ):
        """Store the inputs and create ``m`` empty sets.

        Args:
            objects: Flat list of objects; index 1 of each is the balancing key.
            m: Number of sets to create.
            objective_function: Callable scoring a list of sets.
            params: Optional settings; none are read by this optimizer.
        """
        self.objects = sorted(objects, key=lambda x: -x[1])
        self.sets = [[] for _ in range(m)]
        self.objective_function = objective_function
        # ``None`` replaces the former shared ``{}`` default so that separate
        # instances can never observe each other's parameter dict.
        self.params = {} if params is None else params
        self.score = float('inf')

    def compute(self) -> None:
        """Evaluate the objective on the current sets and store it in ``score``."""
        self.score = self.objective_function(self.sets)

    def optimize(self) -> None:
        """Distribute every object, then refresh ``score``.

        Raises:
            ValueError: If there are objects to place but no sets to hold them.
        """
        if self.objects and not self.sets:
            raise ValueError("Cannot distribute objects into zero sets.")

        # Min-heap of (accumulated total, set index): the lightest set is on top.
        heap = [(0, index) for index in range(len(self.sets))]
        heapq.heapify(heap)

        for obj in self.objects:
            size, min_index = heapq.heappop(heap)
            self.sets[min_index].append(obj)
            heapq.heappush(heap, (size + obj[1], min_index))

        self.compute()

class SwapOptimizer:
    """Random pairwise-swap hill climber.

    Each iteration picks two different sets, swaps one random object between
    them and keeps the swap only when the objective strictly decreases.
    The sets passed in are modified in place.
    """

    def __init__(
        self,
        sets: List[List[Tuple]],
        objective_function: Callable,
        params: Optional[Dict] = None,
    ):
        """Store the inputs.

        Args:
            sets: Partition to refine (mutated in place).
            objective_function: Callable scoring a list of sets; lower is better.
            params: Optional settings; ``max_iterations`` (default 1000) is read.
        """
        params = {} if params is None else params
        self.sets = sets
        self.objective_function = objective_function
        self.max_iterations = params.get("max_iterations", 1000)
        self.score = float('inf')

    def compute(self) -> None:
        """Evaluate the objective on the current sets and store it in ``score``."""
        self.score = self.objective_function(self.sets)

    def optimize(self) -> None:
        """Run up to ``max_iterations`` swap attempts, keeping improvements only."""
        self.compute()  # baseline every candidate swap is compared against

        # A swap needs one object on each side, so any non-empty set is
        # eligible (the former ``len(s) > 1`` test wrongly froze singleton
        # sets). Swaps never change set sizes, hence this list is computed once.
        candidates = [index for index, group in enumerate(self.sets) if group]
        if len(candidates) < 2:
            return

        for _ in range(self.max_iterations):
            i, j = random.sample(candidates, 2)
            set_i, set_j = self.sets[i], self.sets[j]
            a_idx = random.randrange(len(set_i))
            b_idx = random.randrange(len(set_j))
            obj_i, obj_j = set_i[a_idx], set_j[b_idx]

            # Swapping equal objects cannot change the score.
            if obj_i == obj_j:
                continue

            set_i[a_idx], set_j[b_idx] = obj_j, obj_i
            new_score = self.objective_function(self.sets)

            if new_score < self.score:
                self.score = new_score
            else:
                # Not an improvement: restore the previous arrangement.
                set_i[a_idx], set_j[b_idx] = obj_i, obj_j


class SimulatedAnnealingOptimizer:
    """Simulated-annealing refinement based on random pairwise swaps.

    Worsening swaps are accepted with probability ``exp(delta / temperature)``
    while the temperature decays geometrically; the best arrangement seen is
    what ends up in ``sets`` / ``score``.
    """

    def __init__(
        self,
        sets: List[List[Tuple]],
        objective_function: Callable,
        params: Optional[Dict] = None,
    ):
        """Store the inputs.

        Args:
            sets: Partition to refine; its inner lists are mutated while searching.
            objective_function: Callable scoring a list of sets; lower is better.
            params: Optional settings. Reads ``max_iterations`` (default 1000),
                ``initial_temp`` (default 10.0) and ``cooling_rate``
                (default 0.995).
        """
        params = {} if params is None else params
        self.sets = sets
        self.objective_function = objective_function
        self.max_iterations = params.get("max_iterations", 1000)
        self.temperature = params.get("initial_temp", 10.0)
        self.cooling_rate = params.get("cooling_rate", 0.995)
        self.score = float("inf")

    def compute(self) -> None:
        """Evaluate the objective on the current sets and store it in ``score``.

        Provided so this class offers the same ``compute``/``optimize`` pair
        as the other optimizers.
        """
        self.score = self.objective_function(self.sets)

    def optimize(self) -> None:
        """Anneal, then replace ``sets`` with a copy of the best arrangement found."""
        # Evaluate the starting point.
        current_score = self.objective_function(self.sets)
        best_sets = [list(s) for s in self.sets]
        best_score = current_score

        # Any non-empty set can take part in a swap (the former ``len(s) > 1``
        # filter excluded singleton sets). Set sizes never change, so the
        # list is built once instead of on every iteration.
        candidates = [index for index, group in enumerate(self.sets) if group]

        if len(candidates) >= 2:
            for _ in range(self.max_iterations):
                if self.temperature < 1e-6:
                    break

                i, j = random.sample(candidates, 2)
                set_i, set_j = self.sets[i], self.sets[j]
                a_idx = random.randrange(len(set_i))
                b_idx = random.randrange(len(set_j))

                obj_i, obj_j = set_i[a_idx], set_j[b_idx]
                if obj_i == obj_j:
                    continue

                # Tentative swap.
                set_i[a_idx], set_j[b_idx] = obj_j, obj_i
                new_score = self.objective_function(self.sets)
                delta = current_score - new_score

                # Simulated-annealing acceptance criterion: always take an
                # improvement, otherwise accept with probability exp(delta / T).
                if new_score < current_score or np.exp(delta / self.temperature) > random.random():
                    current_score = new_score
                    if new_score < best_score:
                        best_score = new_score
                        best_sets = [list(s) for s in self.sets]
                else:
                    # Rejected: undo the swap.
                    set_i[a_idx], set_j[b_idx] = obj_i, obj_j

                self.temperature *= self.cooling_rate

        self.sets = best_sets
        self.score = best_score

class KernighanLinOptimizer:
    """Steepest-descent pairwise swap search.

    Every round evaluates all single swaps between every pair of sets,
    applies the one with the lowest resulting score if it strictly improves
    on the current score, and stops when no swap improves. The sets passed in
    are modified in place.
    """

    def __init__(self, sets: List[List[Tuple]], objective_function: Callable, params: Dict):
        """Store the inputs.

        Args:
            sets: Partition to refine (mutated in place).
            objective_function: Callable scoring a list of sets; lower is better.
            params: Settings dict; kept on the instance but not read here.
        """
        self.sets = sets
        self.objective_function = objective_function
        self.params = params
        self.score = float("inf")

    def compute(self):
        """Evaluate the objective on the current sets and store it in ``score``."""
        self.score = self.objective_function(self.sets)

    def optimize(self):
        """Apply the best improving swap repeatedly until none is left."""
        # The baseline must be the real score. Starting from the ``inf``
        # placeholder made the very first candidate look infinitely good and
        # then turned the score into NaN (inf - inf).
        self.compute()

        improved = True
        while improved:
            improved = False
            best_score = self.score
            best_move = None
            for i in range(len(self.sets)):
                set_i = self.sets[i]
                for j in range(i + 1, len(self.sets)):
                    set_j = self.sets[j]
                    for a_idx in range(len(set_i)):
                        for b_idx in range(len(set_j)):
                            # Try the swap in place and undo it right away;
                            # this avoids deep-copying all sets per candidate.
                            set_i[a_idx], set_j[b_idx] = set_j[b_idx], set_i[a_idx]
                            new_score = self.objective_function(self.sets)
                            set_i[a_idx], set_j[b_idx] = set_j[b_idx], set_i[a_idx]
                            if new_score < best_score:
                                best_score = new_score
                                best_move = (i, j, a_idx, b_idx)
            if best_move is not None:
                i, j, a_idx, b_idx = best_move
                set_i, set_j = self.sets[i], self.sets[j]
                set_i[a_idx], set_j[b_idx] = set_j[b_idx], set_i[a_idx]
                # Store the evaluated score itself rather than subtracting gains,
                # so rounding errors cannot accumulate across rounds.
                self.score = best_score
                improved = True


class MigrationOptimizer:
    """Steepest-descent search that moves single objects between sets.

    Every round evaluates moving each object to each other set, applies the
    move with the lowest resulting score if it strictly improves on the
    current score, and stops when no move improves. Unlike a swap, a move
    changes set sizes. The sets passed in are modified in place.
    """

    def __init__(self, sets: List[List[Tuple]], objective_function: Callable, params: Dict):
        """Store the inputs.

        Args:
            sets: Partition to refine (mutated in place).
            objective_function: Callable scoring a list of sets; lower is better.
            params: Settings dict; kept on the instance but not read here.
        """
        self.sets = sets
        self.objective_function = objective_function
        self.params = params
        self.score = float("inf")

    def compute(self):
        """Evaluate the objective on the current sets and store it in ``score``."""
        self.score = self.objective_function(self.sets)

    def optimize(self):
        """Apply the best improving move repeatedly until none is left."""
        # Use the real score as baseline; the ``inf`` placeholder made the
        # first candidate always win and then turned the score into NaN.
        self.compute()

        improved = True
        while improved:
            improved = False
            best_score = self.score
            best_move = None
            for i, source in enumerate(self.sets):
                for j, target in enumerate(self.sets):
                    if i == j or not source:
                        continue
                    for idx in range(len(source)):
                        # Move in place, score, then put the object back at
                        # its original position (no per-candidate deep copy).
                        obj = source.pop(idx)
                        target.append(obj)
                        new_score = self.objective_function(self.sets)
                        target.pop()
                        source.insert(idx, obj)
                        if new_score < best_score:
                            best_score = new_score
                            best_move = (i, j, idx)
            if best_move is not None:
                i, j, idx = best_move
                self.sets[j].append(self.sets[i].pop(idx))
                # Keep the evaluated score instead of accumulating gains.
                self.score = best_score
                improved = True


class TabuSearchOptimizer:
    """Best-improvement swap search with a tabu list of recent moves.

    Each iteration scans all swaps between every pair of sets, skips moves
    currently in the tabu list and applies the best strictly improving one.
    Moves are identified by their ``((i, a_idx), (j, b_idx))`` positions.
    Note that only improving moves are ever accepted.
    """

    def __init__(self, sets: List[List[Tuple]], objective_function: Callable, params: Dict):
        """Store the inputs.

        Args:
            sets: Partition to refine; its inner lists are mutated while searching.
            objective_function: Callable scoring a list of sets; lower is better.
            params: Settings dict. Reads ``tabu_size`` (default 50) and
                ``max_iterations`` (default 100).
        """
        self.sets = sets
        self.objective_function = objective_function
        self.tabu_size = params.get("tabu_size", 50)
        self.iterations = params.get("max_iterations", 100)
        self.score = float("inf")

    def compute(self):
        """Evaluate the objective on the current sets and store it in ``score``."""
        self.score = self.objective_function(self.sets)

    def optimize(self):
        """Search for up to ``max_iterations`` rounds and keep the best solution."""
        # Start from the real score: with the ``inf`` placeholder the first
        # move produced a NaN score and the search returned the input as-is.
        self.compute()

        tabu_list = []
        best_score = self.score
        best_solution = copy.deepcopy(self.sets)

        for _ in range(self.iterations):
            candidate_score = self.score
            best_move = None
            for i in range(len(self.sets)):
                set_i = self.sets[i]
                for j in range(i + 1, len(self.sets)):
                    set_j = self.sets[j]
                    for a_idx in range(len(set_i)):
                        for b_idx in range(len(set_j)):
                            move = ((i, a_idx), (j, b_idx))
                            if move in tabu_list:
                                continue
                            # Evaluate the swap in place and undo it at once.
                            set_i[a_idx], set_j[b_idx] = set_j[b_idx], set_i[a_idx]
                            new_score = self.objective_function(self.sets)
                            set_i[a_idx], set_j[b_idx] = set_j[b_idx], set_i[a_idx]
                            if new_score < candidate_score:
                                candidate_score = new_score
                                best_move = move

            if best_move is None:
                # Nothing changed (sets, score and tabu list are untouched),
                # so every further iteration would repeat this exact scan.
                break

            (i, a_idx), (j, b_idx) = best_move
            set_i, set_j = self.sets[i], self.sets[j]
            set_i[a_idx], set_j[b_idx] = set_j[b_idx], set_i[a_idx]
            self.score = candidate_score
            tabu_list.append(best_move)
            if len(tabu_list) > self.tabu_size:
                tabu_list.pop(0)
            if self.score < best_score:
                best_score = self.score
                best_solution = copy.deepcopy(self.sets)

        self.sets = best_solution
        # Keep the reported score in sync with the solution being returned.
        self.score = best_score


class GeneticOptimizer:
    """Genetic search over partitions using set-wise crossover and swap mutation.

    The population starts as copies of the input partition. Each generation
    keeps the better half as parents and breeds a full new population from
    them. The best individual seen at any point (the input included) is what
    ends up in ``sets`` / ``score``.
    """

    def __init__(
        self,
        sets: List[List[Tuple]],
        objective_function: Callable,
        params: Optional[Dict] = None,
    ):
        """Store the inputs and optionally seed the random generators.

        Args:
            sets: Starting partition; it is copied, never mutated.
            objective_function: Callable scoring a list of sets; lower is better.
            params: Optional settings. Reads ``population_size`` (default 10),
                ``generations`` (default 20) and ``seed`` (default unset).

        Note:
            A provided seed is applied to the global ``random`` and
            ``numpy.random`` generators.
        """
        params = {} if params is None else params
        self.sets = sets
        self.objective_function = objective_function
        self.population_size = params.get("population_size", 10)
        self.generations = params.get("generations", 20)
        self.random_seed = params.get("seed")
        if self.random_seed is not None:
            random.seed(self.random_seed)
            np.random.seed(self.random_seed)
        self.score = float("inf")

    def compute(self):
        """Evaluate the objective on the current sets and store it in ``score``."""
        self.score = self.objective_function(self.sets)

    def optimize(self):
        """Evolve the population and keep the best partition encountered."""
        self.compute()
        best_sets = copy.deepcopy(self.sets)
        best_score = self.score

        # Breeding draws two distinct parents, so at least two individuals
        # (and two survivors) are always required.
        size = max(2, self.population_size)
        survivors = max(2, size // 2)

        population = [copy.deepcopy(self.sets) for _ in range(size)]
        scores = [self.score] * size  # all individuals start identical

        for _ in range(self.generations):
            ranked = sorted(zip(scores, population), key=lambda pair: pair[0])
            parents = [individual for _, individual in ranked[:survivors]]

            population = []
            while len(population) < size:
                p1, p2 = random.sample(parents, 2)
                child = self.crossover(p1, p2)
                if random.random() < 0.3:
                    self.mutate(child)
                population.append(child)
            scores = [self.objective_function(p) for p in population]

            # Remember the best individual ever seen so that a later, weaker
            # generation cannot replace it.
            generation_best = min(range(size), key=scores.__getitem__)
            if scores[generation_best] < best_score:
                best_score = scores[generation_best]
                best_sets = population[generation_best]

        self.sets = best_sets
        self.score = best_score

    def crossover(self, p1: List[List[Tuple]], p2: List[List[Tuple]]) -> List[List[Tuple]]:
        """Combine two parents into a new, valid partition.

        Each set of the child is copied from the same position of either
        parent, chosen at random. Because the parents may place an object in
        different sets, this can duplicate some objects and drop others, so
        the child is then repaired: surplus occurrences are replaced by the
        missing objects. The parents are never modified and the child shares
        no list with them. Objects must be hashable.

        Returns:
            A partition holding exactly the objects of ``p1``.
        """
        child = [list(random.choice((c1, c2))) for c1, c2 in zip(p1, p2)]
        if not child:
            return [list(group) for group in p1]

        # ``remaining`` starts as the full multiset and is consumed while
        # walking the child; slots that find nothing left are duplicates.
        remaining = Counter(obj for group in p1 for obj in group)
        surplus_slots = []
        for set_index, group in enumerate(child):
            for position, obj in enumerate(group):
                if remaining[obj] > 0:
                    remaining[obj] -= 1
                else:
                    surplus_slots.append((set_index, position))

        missing = list(remaining.elements())
        random.shuffle(missing)
        for (set_index, position), obj in zip(surplus_slots, missing):
            child[set_index][position] = obj

        # Only reachable when the parents disagree on set sizes: drop unfilled
        # duplicate slots (back to front keeps positions valid) and hand
        # objects that are still missing to the smallest sets.
        for set_index, position in reversed(surplus_slots[len(missing):]):
            del child[set_index][position]
        for obj in missing[len(surplus_slots):]:
            min(child, key=len).append(obj)

        return child

    def mutate(self, sets: List[List[Tuple]]):
        """Swap one random object between two random non-empty sets, in place."""
        if len(sets) < 2:
            return
        i, j = random.sample(range(len(sets)), 2)
        if sets[i] and sets[j]:
            a_idx = random.randrange(len(sets[i]))
            b_idx = random.randrange(len(sets[j]))
            sets[i][a_idx], sets[j][b_idx] = sets[j][b_idx], sets[i][a_idx]


class ContinuousRelaxationOptimizer:
    """Randomised reassignment step driven by Dirichlet-sampled memberships.

    For every object a membership vector over the sets is drawn from a flat
    Dirichlet distribution and the object goes to the set with the largest
    component. A sampled assignment replaces the current one only when it
    scores strictly better, so this step can never make the solution worse.
    """

    def __init__(self, sets: List[List[Tuple]], objective_function: Callable, params: Dict):
        """Store the inputs.

        Args:
            sets: Partition to refine; replaced in place when a better
                assignment is found.
            objective_function: Callable scoring a list of sets; lower is
                better. It must accept partitions containing empty sets,
                since a sampled assignment may leave a set empty.
            params: Settings dict. Reads ``num_samples`` (default 1), the
                number of random assignments to try.
        """
        self.sets = sets
        self.objective_function = objective_function
        self.params = params
        self.score = float("inf")

    def compute(self):
        """Evaluate the objective on the current sets and store it in ``score``."""
        self.score = self.objective_function(self.sets)

    def optimize(self):
        """Sample assignments and keep one only if it beats the current score."""
        # Score the incoming solution first. Without this baseline the random
        # assignment used to overwrite whatever earlier steps had achieved.
        self.compute()

        all_objects = [obj for subset in self.sets for obj in subset]
        if not all_objects:
            return

        num_clusters = len(self.sets)
        num_samples = max(1, int((self.params or {}).get("num_samples", 1)))

        for _ in range(num_samples):
            prob_matrix = np.random.dirichlet(np.ones(num_clusters), size=len(all_objects))
            candidate = [[] for _ in range(num_clusters)]
            for obj, chosen in zip(all_objects, prob_matrix.argmax(axis=1)):
                candidate[chosen].append(obj)

            candidate_score = self.objective_function(candidate)
            if candidate_score < self.score:
                # Slice assignment keeps the caller's outer list object.
                self.sets[:] = candidate
                self.score = candidate_score

class OptimizationPipeline:
    """Run a sequence of optimizers, each one refining the previous result.

    The first step is built from the flat object list and the number of sets;
    every later step receives the sets produced by the step before it.
    """

    def __init__(self, objects: List[Tuple], m: int, objective_function: Callable, verbose: bool = False):
        """Store the inputs.

        Args:
            objects: Flat list of objects to distribute.
            m: Number of sets the first step must create.
            objective_function: Callable scoring a list of sets.
            verbose: When True, progress is printed for every step.
        """
        self.objects = objects
        self.m = m
        self.objective_function = objective_function
        self.steps = []
        self.sets = None
        self.verbose = verbose

    def add_step(self, optimizer_class: Type["Optimizer"], params: Optional[Dict] = None):
        """Append an optimizer class and its parameters to the pipeline.

        Args:
            optimizer_class: Class to instantiate when the pipeline runs. If
                it is the first step it is called as
                ``(objects, m, objective_function, params)``, otherwise as
                ``(sets, objective_function, params)``.
            params: Settings for that optimizer; a fresh empty dict when omitted
                (a shared ``{}`` default would be visible to every step).
        """
        self.steps.append((optimizer_class, {} if params is None else params))

    def run(self):
        """Execute all steps in order and return the outcome.

        Returns:
            Dict with the final ``sets``, their ``score``, the total ``time``
            in seconds and per-step ``steps`` entries (optimizer name, score,
            time).

        Raises:
            ValueError: If no step was added, or if the final sets do not
                contain exactly the original objects.
        """
        pipeline_start = time.time()
        step_results = []

        if not self.steps:
            raise ValueError("Empty steps are not allowed.")

        if self.verbose:
            print(f"\n🔍 Iniciando pipeline com {len(self.objects)} objetos no conjunto original.")

        for i, (optimizer_class, params) in enumerate(self.steps):
            step_start = time.time()

            if i == 0:
                if self.verbose:
                    print(f"\n🚀 Etapa {i+1}: {optimizer_class.__name__} (inicial)")
                optimizer = optimizer_class(self.objects, int(self.m), self.objective_function, params)
            else:
                if self.verbose:
                    print(f"\n🔄 Etapa {i+1}: {optimizer_class.__name__} (entrada com {len(flatten_sets(self.sets))} objetos)")
                optimizer = optimizer_class(self.sets, self.objective_function, params)

            optimizer.optimize()

            self.sets = optimizer.sets
            step_time = time.time() - step_start
            # Score the sets directly instead of trusting ``optimizer.score``:
            # an optimizer that never refreshes its score would otherwise
            # report its ``inf`` placeholder.
            step_score = self.objective_function(self.sets)

            if self.verbose:
                flattened_sets = flatten_sets(self.sets)
                print(f"✅ Etapa {i+1} concluída - {len(flattened_sets)} objetos nos conjuntos otimizados.")
                print(f"📈 Score da métrica ({self.objective_function.__name__}): {step_score:.4f}")
                print(f"⏱️ Tempo: {step_time:.2f} segundos")

            step_results.append({
                "optimizer": optimizer_class.__name__,
                "score": step_score,
                "time": step_time
            })

        elapsed_time = time.time() - pipeline_start

        if self.verbose:
            final_count = len(flatten_sets(self.sets))
            print(f"\n🧾 Verificação final: {final_count} objetos no total em {elapsed_time} segundos.")

        # Guard against optimizers that lost or duplicated objects.
        if not validate_sets(self.objects, self.sets):
            raise ValueError("❌ Os conjuntos finais não correspondem aos objetos originais!")

        return {
            "sets": self.sets,
            "score": self.objective_function(self.sets),
            "time": elapsed_time,
            "steps": step_results
        }


class ClusterOptimizer:
    """Load database/cluster data from Excel, rebalance it and report results.

    The databases of the selected original clusters are redistributed by an
    optimization pipeline; the outcome is mapped back onto the original
    cluster IDs so that movements can be displayed and exported.
    """

    def __init__(
        self,
        fname: str,
        original_clusters: List,
        pipeline_initializer: Callable[[List[Tuple], int], "OptimizationPipeline"],
        verbose: bool = False
    ):
        """Load the data and keep only the rows of the selected clusters.

        Args:
            fname: Path of the Excel file to read.
            original_clusters: ``cluster_id`` values to rebalance.
            pipeline_initializer: Callable receiving the flat object list and
                the number of clusters, returning an object with a ``run()``
                method.
            verbose: When True, ``save_results`` prints each written path.
        """
        self.fname = fname
        self.original_clusters = original_clusters
        self.pipeline_initializer = pipeline_initializer
        self.verbose = verbose

        self.df = self.load_data()
        self.df_original = self.filter_original_clusters()
        self.results = {}

    def load_data(self):
        """Read the Excel file and return it with one row per ``db_name``.

        When a name occurs more than once a warning is logged and only its
        first row is kept.
        """
        columns = ['db_name', 'cluster_id', 'size_mb', 'access_daily_count']
        dtypes = {
            'db_name': str,
            'cluster_id': int,
            'size_mb': float,
            'access_daily_count': int
        }

        df = pd.read_excel(self.fname, usecols=columns, dtype=dtypes)

        # keep=False flags every row that takes part in a duplication.
        duplicated = df[df.duplicated(subset="db_name", keep=False)]

        if not duplicated.empty:
            logging.warning(f"{len(duplicated)} duplicated entries found for 'db_name'. Dropping duplicates and keeping first.")

        return df.drop_duplicates(subset="db_name", keep="first")

    def filter_original_clusters(self):
        """Return the rows whose ``cluster_id`` is one of the original clusters."""
        columns = ['db_name', 'cluster_id', 'size_mb', 'access_daily_count']
        cluster_mask = self.df['cluster_id'].isin(self.original_clusters)
        return self.df[cluster_mask][columns]

    def prepare_sets(self):
        """Return the selected rows as ``(db_name, size_mb, access_daily_count)`` tuples."""
        return [
            (row.db_name, row.size_mb, row.access_daily_count)
            for row in self.df_original.itertuples(index=False)
        ]

    def optimize_clusters(self, num_clusters: int):
        """Build a pipeline for ``num_clusters`` clusters, run it and return its result."""
        sets = self.prepare_sets()
        pipeline = self.pipeline_initializer(sets, num_clusters)
        return pipeline.run()

    def map_clusters(self, optimized_df):
        """Attach the optimized assignment and relabel it with cluster IDs.

        New clusters are matched one-to-one with original clusters, largest
        shared database count first, so each original ID is reused at most
        once and no original ID is silently overwritten when two original
        clusters share the same best match. New clusters left unmatched get
        fresh IDs counting up from ``max(original_clusters) + 1``.

        Args:
            optimized_df: DataFrame with at least ``db_name`` and ``new_cluster``.

        Returns:
            ``df_original`` merged with ``new_cluster`` plus a
            ``new_cluster_mapped`` column.
        """
        comparison_df = self.df_original.merge(
            optimized_df[["db_name", "new_cluster"]],
            on="db_name",
            how="left"
        )

        # Rows: original cluster, columns: new cluster, values: shared DB count.
        cluster_mapping = (
            comparison_df.groupby(["cluster_id", "new_cluster"])['db_name']
            .count().unstack(fill_value=0)
        )

        overlaps = [
            (count, orig, new)
            for orig in self.original_clusters
            if orig in cluster_mapping.index
            for new, count in cluster_mapping.loc[orig].items()
        ]
        # Stable sort on the count only: ties keep the original-cluster order.
        overlaps.sort(key=lambda entry: -entry[0])

        cluster_renaming = {}
        matched_originals = set()
        for _, orig, new in overlaps:
            if orig in matched_originals or new in cluster_renaming:
                continue
            cluster_renaming[new] = orig
            matched_originals.add(orig)

        new_cluster_id = max(self.original_clusters) + 1
        for cluster in cluster_mapping.columns:
            if cluster not in cluster_renaming:
                cluster_renaming[cluster] = new_cluster_id
                new_cluster_id += 1

        comparison_df["new_cluster_mapped"] = comparison_df["new_cluster"].map(cluster_renaming)
        return comparison_df

    def generate_results(self, num_clusters: Optional[int] = None):
        """Optimize for every cluster count from the original one up to ``num_clusters``.

        Results are stored in ``self.results`` keyed by cluster count.

        Args:
            num_clusters: Largest cluster count to evaluate; defaults to the
                number of original clusters.

        Raises:
            ValueError: If ``num_clusters`` is below the number of original clusters.
        """
        min_clusters = len(self.original_clusters)
        if num_clusters is None:
            num_clusters = min_clusters
        elif num_clusters < min_clusters:
            raise ValueError(
                f"Provided 'num_clusters' = {num_clusters} is less than the number "
                f"of original clusters ({min_clusters}). It must be greater than or equal."
            )

        for k in range(min_clusters, num_clusters + 1):
            result = self.optimize_clusters(k)

            optimized_df = pd.DataFrame([
                {"db_name": db_name, "size_mb": size, "access_daily_count": access, "new_cluster": i}
                for i, cluster in enumerate(result["sets"])
                for db_name, size, access in cluster
            ]).drop_duplicates(subset="db_name", keep="first")

            comparison_df = self.map_clusters(optimized_df)
            comparison_df["status"] = comparison_df.apply(
                lambda row: "unchanged" if row["cluster_id"] == row["new_cluster_mapped"] else "changed",
                axis=1
            )

            comparison_df["movement"] = comparison_df.apply(
                lambda row: f"{row['cluster_id']} ➝ {row['new_cluster_mapped']}", axis=1
            )

            # Covers ALL databases, including the ones that stayed in place.
            movement_stats = comparison_df.groupby("movement").agg(total_dbs=("db_name", "count")).reset_index()

            new_cluster_metrics = comparison_df.groupby("new_cluster_mapped").agg(
                total_size=("size_mb", "sum"),
                total_accesses=("access_daily_count", "sum"),
                db_count=("db_name", "count")
            ).reset_index()

            self.results[k] = {
                "comparison_df": comparison_df,
                "movement_stats": movement_stats,
                "new_cluster_metrics": new_cluster_metrics,
            }

    def display_results(self):
        """Print overall totals and, per cluster count, movement and cluster tables."""
        total_size_mb = self.df_original["size_mb"].sum()
        total_access_count = self.df_original["access_daily_count"].sum()

        print(f"\nDatabases count: {len(self.df_original)}")
        print(f"📦 Total size across all clusters: {total_size_mb:,.2f} MB")
        print(f"🔐 Total daily accesses across all clusters: {total_access_count:,}")

        for k, data in self.results.items():
            print(f"\n🔍 Results for {k} clusters:")
            comparison_df = data["comparison_df"]

            print("\n📊 Movement statistics:")
            movement_stats = data["movement_stats"].copy()
            # Recover both cluster numbers from the label to sort numerically;
            # the helper columns are dropped again before printing.
            movement_stats[['from_cluster', 'to_cluster']] = movement_stats['movement'].str.extract(r'(\d+)\s*➝\s*(\d+)').astype(int)
            movement_stats.sort_values(by=['from_cluster', 'to_cluster'], inplace=True)
            movement_stats.drop(columns=['from_cluster', 'to_cluster'], inplace=True)
            print(tabulate(movement_stats, headers="keys", tablefmt="pretty", showindex=False))

            total_dbs = comparison_df["db_name"].nunique()
            moved_dbs = comparison_df.query("status == 'changed'")["db_name"].nunique()

            print(f"\n📦 Total DBs: {total_dbs:,}")
            print(f"🚚 Moved DBs: {moved_dbs:,}")

            print("\n📊 Metrics for new clusters:")
            cluster_metrics = data["new_cluster_metrics"].copy()
            cluster_metrics["size_pct"] = (cluster_metrics["total_size"] / total_size_mb * 100).round(2)
            cluster_metrics["access_pct"] = (cluster_metrics["total_accesses"] / total_access_count * 100).round(2)
            cluster_metrics.sort_values(by="new_cluster_mapped", inplace=True)
            print(tabulate(cluster_metrics, headers="keys", tablefmt="pretty", showindex=False))

    def save_results(self, output_dir: str = ".", base_filename: str = "cluster_result"):
        """Write one Excel file per evaluated cluster count.

        Each file is named ``{base_filename}_{k}_clusters.xlsx`` and lists
        ``db_name``, ``from_cluster`` and ``to_cluster`` sorted by name.

        Args:
            output_dir: Directory to write into; created when missing.
            base_filename: Prefix of the Excel file names.
        """
        os.makedirs(output_dir, exist_ok=True)

        for k, data in self.results.items():
            df = data["comparison_df"].copy()
            df = df.rename(columns={"cluster_id": "from_cluster", "new_cluster_mapped": "to_cluster"})
            df = df[["db_name", "from_cluster", "to_cluster"]].sort_values(by="db_name")

            output_path = os.path.join(output_dir, f"{base_filename}_{k}_clusters.xlsx")
            df.to_excel(output_path, index=False)

            if self.verbose:
                print(f"[✔] Resultado salvo: {output_path}")

def weighted_balanced_mad_metric(
    sets: List[List[Tuple]],
    weights: Optional[List[float]] = None
) -> float:
    """
    Multivariate balance metric based on a weighted, normalised MAD.

    For every feature (each object field except index 0, which is assumed to
    be an ID) the per-set totals are computed; the median absolute deviation
    of those totals is divided by their median. An empty set contributes a
    total of zero for every feature.

    Args:
        sets: List of sets, each holding objects as indexable tuples/lists.
        weights: One weight per feature. If None, a plain mean is used.

    Returns:
        The (weighted) mean of the normalised MADs; 0.0 when there are no
        objects at all.

    Raises:
        ValueError: If ``weights`` has the wrong length, contains a negative
            value or sums to zero.
    """
    if not sets or not any(sets):
        return 0.0

    # Read the feature count from the first object found anywhere, so an
    # empty leading set cannot cause an IndexError.
    sample = next(obj for group in sets for obj in group)
    indices = list(range(1, len(sample)))  # skip index 0 (assumed to be the ID)

    # Empty sets get an explicit zero vector; summing an empty list would
    # yield a scalar and make the stacked array ragged.
    empty_total = np.zeros(len(indices))
    set_vectors = np.array(
        [
            np.sum([[obj[i] for i in indices] for obj in group], axis=0) if group else empty_total
            for group in sets
        ],
        dtype=float,
    )

    medians = np.median(set_vectors, axis=0)
    abs_devs = np.abs(set_vectors - medians)
    mad = np.median(abs_devs, axis=0)
    # Divide only where the median is non-zero; the remaining entries stay 0
    # and no divide-by-zero warning is emitted.
    norm_mad = np.divide(mad, medians, out=np.zeros_like(mad, dtype=float), where=medians != 0)

    if weights is not None:
        weights = np.array(weights, dtype=float)

        if len(weights) != len(indices):
            raise ValueError(f"Número de pesos ({len(weights)}) não bate com o número de features ({len(indices)})")

        if np.any(weights < 0):
            raise ValueError("Todos os pesos devem ser não-negativos")

        if weights.sum() == 0:
            raise ValueError("A soma dos pesos deve ser maior que zero")

        weights = weights / weights.sum()
        return float(np.sum(norm_mad * weights))

    return float(np.mean(norm_mad))

In [None]:
# Maps a pipeline name to the factory function registered under that name.
PIPELINE_REGISTRY = {}

def register_pipeline(name):
    """Return a decorator that stores its target in ``PIPELINE_REGISTRY[name]``.

    The decorated function is returned unchanged.
    """
    def decorator(factory):
        PIPELINE_REGISTRY.update({name: factory})
        return factory

    return decorator

def with_default_clusters(fn):
    """Decorate a pipeline factory so that ``num_clusters`` becomes optional.

    When ``num_clusters`` is omitted (``None``) it falls back to the length
    of ``sets.original_clusters`` if that attribute exists, otherwise to
    ``len(sets)``.

    Args:
        fn: Factory called as ``fn(sets, num_clusters)``.

    Returns:
        A wrapper with the signature ``(sets, num_clusters=None)`` that keeps
        the name and docstring of ``fn``.
    """
    # Without ``wraps`` every decorated factory would be named "wrapper",
    # which makes registry contents and tracebacks hard to read.
    @functools.wraps(fn)
    def wrapper(sets, num_clusters=None):
        if num_clusters is None:
            num_clusters = len(getattr(sets, "original_clusters", sets))
        return fn(sets, num_clusters)
    return wrapper

@register_pipeline('default')
@with_default_clusters
def make_pipeline_default(sets, num_clusters):
    """Assemble the 'default' pipeline.

    Steps, in order: GreedyOptimizer, SwapOptimizer (1000 iterations),
    SimulatedAnnealingOptimizer with its own defaults, SwapOptimizer
    (1000 iterations). The objective uses feature weights 0.4 / 0.6.
    """
    def objective_function(sets):
        feature_weights = [0.4, 0.6]
        return weighted_balanced_mad_metric(sets, weights=feature_weights)

    pipeline = OptimizationPipeline(
        sets,
        num_clusters,
        objective_function,
        verbose=True,
    )
    pipeline.add_step(GreedyOptimizer)
    pipeline.add_step(SwapOptimizer, dict(max_iterations=1000))
    pipeline.add_step(SimulatedAnnealingOptimizer)
    pipeline.add_step(SwapOptimizer, dict(max_iterations=1000))
    return pipeline

@register_pipeline('basic')
@with_default_clusters
def make_pipeline_basic(sets, num_clusters):
    """Assemble the 'basic' pipeline: GreedyOptimizer, then SwapOptimizer (500 iterations).

    The objective uses feature weights 0.5 / 0.5.
    """
    def objective(sets):
        feature_weights = [0.5, 0.5]
        return weighted_balanced_mad_metric(sets, feature_weights)

    pipeline = OptimizationPipeline(
        sets,
        num_clusters,
        objective,
        verbose=True,
    )
    pipeline.add_step(GreedyOptimizer)
    pipeline.add_step(SwapOptimizer, dict(max_iterations=500))
    return pipeline

@register_pipeline('metaheuristic')
@with_default_clusters
def make_pipeline_metaheuristic(sets, num_clusters):
    """Assemble the 'metaheuristic' pipeline: GreedyOptimizer, then simulated annealing.

    The objective uses feature weights 0.4 / 0.6.
    """
    def objective(sets):
        feature_weights = [0.4, 0.6]
        return weighted_balanced_mad_metric(sets, feature_weights)

    annealing_params = dict(
        max_iterations=1500,
        initial_temp=20.0,
        cooling_rate=0.99,
    )

    pipeline = OptimizationPipeline(
        sets,
        num_clusters,
        objective,
        verbose=True,
    )
    pipeline.add_step(GreedyOptimizer)
    pipeline.add_step(SimulatedAnnealingOptimizer, annealing_params)
    return pipeline

@register_pipeline('refinement')
@with_default_clusters
def make_pipeline_refinement(sets, num_clusters):
    """Assemble the 'refinement' pipeline.

    Steps, in order: GreedyOptimizer, KernighanLinOptimizer,
    MigrationOptimizer. The objective uses feature weights 0.5 / 0.5 and the
    pipeline is created non-verbose.
    """
    def objective(sets):
        feature_weights = [0.5, 0.5]
        return weighted_balanced_mad_metric(sets, feature_weights)

    pipeline = OptimizationPipeline(
        sets,
        num_clusters,
        objective,
        verbose=False,
    )
    pipeline.add_step(GreedyOptimizer)
    for exhaustive_step in (KernighanLinOptimizer, MigrationOptimizer):
        pipeline.add_step(exhaustive_step, {})
    return pipeline

@register_pipeline('exhaustive')
@with_default_clusters
def make_pipeline_exhaustive(sets, num_clusters):
    """Assemble the 'exhaustive' pipeline.

    Steps, in order: GreedyOptimizer, GeneticOptimizer,
    SimulatedAnnealingOptimizer, TabuSearchOptimizer, MigrationOptimizer.
    The objective uses feature weights 0.3 / 0.7.
    """
    def objective(sets):
        feature_weights = [0.3, 0.7]
        return weighted_balanced_mad_metric(sets, feature_weights)

    genetic_params = dict(population_size=25, generations=50, seed=1337)
    annealing_params = dict(max_iterations=2000, initial_temp=30.0, cooling_rate=0.995)
    tabu_params = dict(tabu_size=100, max_iterations=400)

    pipeline = OptimizationPipeline(
        sets,
        num_clusters,
        objective,
        verbose=True,
    )
    pipeline.add_step(GreedyOptimizer)
    pipeline.add_step(GeneticOptimizer, genetic_params)
    pipeline.add_step(SimulatedAnnealingOptimizer, annealing_params)
    pipeline.add_step(TabuSearchOptimizer, tabu_params)
    pipeline.add_step(MigrationOptimizer, {})
    return pipeline

@register_pipeline('genetic_relaxation')
@with_default_clusters
def make_pipeline_genetic_relaxation(sets, num_clusters):
    """Assemble the 'genetic_relaxation' pipeline.

    Steps, in order: GreedyOptimizer, GeneticOptimizer,
    ContinuousRelaxationOptimizer. The objective uses feature weights
    0.6 / 0.4.
    """
    def objective(sets): return weighted_balanced_mad_metric(sets, [0.6, 0.4])

    pipeline = OptimizationPipeline(sets, num_clusters, objective, verbose=True)
    # The pipeline builds its first step as (objects, m, objective, params) to
    # create the initial partition. GeneticOptimizer takes an existing
    # partition as (sets, objective, params), so it cannot be the first step
    # (that call raises TypeError); a greedy seed has to come before it.
    pipeline.add_step(GreedyOptimizer)
    pipeline.add_step(GeneticOptimizer, {
        "population_size": 20,
        "generations": 30,
        "seed": 42
    })
    pipeline.add_step(ContinuousRelaxationOptimizer, {})
    return pipeline

@register_pipeline('fast')
@with_default_clusters
def make_pipeline_fast(sets, num_clusters):
    """Assemble the 'fast' pipeline: GreedyOptimizer, then SwapOptimizer (300 iterations).

    The objective uses feature weights 0.5 / 0.5 and the pipeline is created
    non-verbose.
    """
    def objective_function(sets):
        feature_weights = [0.5, 0.5]
        return weighted_balanced_mad_metric(sets, weights=feature_weights)

    pipeline = OptimizationPipeline(
        sets,
        num_clusters,
        objective_function,
        verbose=False,
    )
    pipeline.add_step(GreedyOptimizer)
    pipeline.add_step(SwapOptimizer, dict(max_iterations=300))
    return pipeline

@register_pipeline('minimal')
@with_default_clusters
def make_pipeline_minimal(sets, num_clusters):
    """Assemble the 'minimal' pipeline, which consists of GreedyOptimizer only.

    The objective uses feature weights 0.5 / 0.5 and the pipeline is created
    non-verbose.
    """
    def objective_function(sets):
        feature_weights = [0.5, 0.5]
        return weighted_balanced_mad_metric(sets, weights=feature_weights)

    pipeline = OptimizationPipeline(
        sets,
        num_clusters,
        objective_function,
        verbose=False,
    )
    pipeline.add_step(GreedyOptimizer)
    return pipeline

def get_pipeline(name: str = 'default', verbose: bool = False):
    """Return a pipeline initializer for the factory registered as ``name``.

    The returned callable takes ``(sets, num_clusters=None)``, builds the
    pipeline through the registered factory and then sets its ``verbose``
    attribute to the value given here.

    Raises:
        ValueError: If ``name`` is not in ``PIPELINE_REGISTRY``.
    """
    if name in PIPELINE_REGISTRY:
        def wrapped_pipeline(sets, num_clusters=None):
            # The registry is read at call time, not when get_pipeline runs.
            built = PIPELINE_REGISTRY[name](sets, num_clusters)
            built.verbose = verbose  # overrides whatever the factory chose
            return built

        return wrapped_pipeline

    available = ', '.join(sorted(PIPELINE_REGISTRY.keys()))
    message = (
        f"Pipeline '{name}' is not registered.\n"
        f"Available pipelines are: {available}"
    )
    raise ValueError(message)


In [1]:
from pathlib import Path
from typing import List
import traceback
import pandas as pd

# Initial configuration
FNAME = "clusters_20250404.xlsx"  # Excel workbook read by ClusterOptimizer
ORIGINAL_CLUSTERS = [5438, 5439]  # cluster_id values to rebalance
OUTPUT_DIR = Path("pipeline_test_results")  # created below if missing
CLUSTER_COUNTS_TO_TEST = [2]
AVAILABLE_PIPELINES = list(PIPELINE_REGISTRY.keys())  # every registered pipeline name

OUTPUT_DIR.mkdir(exist_ok=True)

def run_optimizer_test(pipeline_name: str, cluster_count: int):
    """Run one pipeline for one cluster count, then display and save the results.

    Any exception is caught and its traceback printed, so one failing
    combination does not stop the remaining test runs.

    Args:
        pipeline_name: Name of a registered pipeline.
        cluster_count: Largest number of clusters to evaluate.
    """
    print(f"\n[TESTANDO] Pipeline: '{pipeline_name}' | Clusters: {cluster_count}")
    try:
        selected_pipeline = get_pipeline(pipeline_name, verbose=False)

        optimizer = ClusterOptimizer(
            fname=FNAME,
            original_clusters=ORIGINAL_CLUSTERS,
            pipeline_initializer=selected_pipeline,
            verbose=False
        )

        optimizer.generate_results(num_clusters=cluster_count)
        optimizer.display_results()

        # save_results takes a directory and a file-name prefix and writes
        # "<prefix>_<k>_clusters.xlsx". Passing a full ".xlsx" path as the
        # directory used to create a folder with that name instead of a file.
        optimizer.save_results(output_dir=str(OUTPUT_DIR), base_filename=pipeline_name)
        result_path = OUTPUT_DIR / f"{pipeline_name}_{cluster_count}_clusters.xlsx"

        print(f"[✔] Resultado salvo em: {result_path}")

    except Exception:
        # Top-level boundary of the test run: report the failure and continue.
        print(f"[✘] Erro com pipeline '{pipeline_name}' e {cluster_count} clusters.")
        print("Detalhes do erro:")
        traceback.print_exc()


def main(pipelines: List[str], cluster_counts: List[int]):
    """Call ``run_optimizer_test`` for every pipeline / cluster-count pair.

    Pairs are visited pipeline by pipeline, with all cluster counts for one
    pipeline before moving to the next.
    """
    combinations = [
        (pipeline_name, cluster_count)
        for pipeline_name in pipelines
        for cluster_count in cluster_counts
    ]
    for pipeline_name, cluster_count in combinations:
        run_optimizer_test(pipeline_name, cluster_count)


# Entry point: exercise every registered pipeline with each configured cluster count.
if __name__ == "__main__":
    main(AVAILABLE_PIPELINES, CLUSTER_COUNTS_TO_TEST)


NameError: name 'PIPELINE_REGISTRY' is not defined