In [None]:
import pandas as pd
from pathlib import Path
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve, precision_recall_curve
from sklearn.utils.class_weight import compute_class_weight
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from pathlib import Path

root = Path("unified")

# Dictionary holding every loaded dataset, keyed by the experiment folder name
all_datasets = {}

# rglob yields paths in a filesystem-dependent order; sorting makes the load
# order (and therefore the row order of df_all) reproducible across runs.
for csv_file in sorted(root.rglob("all_metrics_combined*.csv")):
    experiment_name = csv_file.parent.name  # name of the experiment folder
    if experiment_name in all_datasets:
        # A second matching CSV with the same folder name still replaces the
        # first one (unchanged behaviour), but no longer does so silently.
        print(f"Attenzione: {csv_file} sovrascrive il dataset già caricato per '{experiment_name}'")
    df = pd.read_csv(csv_file)
    df["source"] = experiment_name.upper()  # tag every row with its experiment
    all_datasets[experiment_name] = df
    print(f"Caricato cutomers {experiment_name} experiment ({len(df)} righe)")

# pd.concat fails with an opaque "No objects to concatenate" on empty input,
# so stop here with a message that points at the real problem.
if not all_datasets:
    raise FileNotFoundError(
        f"No 'all_metrics_combined*.csv' file found under {root.resolve()}"
    )

# Single DataFrame with every experiment stacked together
df_all = pd.concat(all_datasets.values(), ignore_index=True)

In [None]:
def plot_simple_nan_histogram(df, title="NaN Count Histogram"):
    """Draw a bar chart of the NaN count of every column that has missing values.

    When no column contains NaN a notice is printed and nothing is plotted;
    otherwise the chart is shown and a two-line summary is printed.
    """
    missing_per_column = df.isnull().sum()
    columns_with_nan = missing_per_column[missing_per_column > 0]
    n_affected = len(columns_with_nan)

    if n_affected == 0:
        print("Nessun valore NaN trovato nel dataset")
        return

    positions = range(n_affected)

    plt.figure(figsize=(12, 6))
    plt.bar(positions, columns_with_nan.values, color='red', alpha=0.7)

    plt.title(title, fontsize=16, fontweight='bold')
    plt.xlabel('Features', fontsize=12)
    plt.ylabel('NaN Count', fontsize=12)
    plt.xticks(positions, columns_with_nan.index, rotation=45, ha='right')

    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    # Textual summary to accompany the chart
    print(f"Totale features con NaN: {n_affected}")
    print(f"Totale NaN nel dataset: {columns_with_nan.sum()}")

def plot_nan_distribution_histogram(df, title="NaN Distribution per Row"):
    """Show a histogram of the number of NaN values per row, then print statistics."""
    # One NaN count for each row of the frame
    row_nan_counts = df.isnull().sum(axis=1)

    plt.figure(figsize=(10, 6))
    plt.hist(row_nan_counts, bins=30, color='purple', alpha=0.7, edgecolor='black')

    plt.title(title, fontsize=16, fontweight='bold')
    plt.xlabel('Numero di NaN per riga', fontsize=12)
    plt.ylabel('Frequenza', fontsize=12)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    # Summary statistics of the per-row counts
    rows_without_nan = (row_nan_counts == 0).sum()
    print("Statistiche NaN per riga:")
    print(f"Media: {row_nan_counts.mean():.2f}")
    print(f"Mediana: {row_nan_counts.median():.2f}")
    print(f"Massimo: {row_nan_counts.max()}")
    print(f"Righe senza NaN: {rows_without_nan}")

def _shorten_feature_label(name, max_len=12):
    """Return ``name`` as text, cut to ``max_len`` characters plus '...' when longer."""
    text = str(name)  # column labels are not guaranteed to be strings
    return text[:max_len] + "..." if len(text) > max_len else text


def plot_nan_histogram_all_experiments(all_datasets):
    """Plot one NaN-count bar chart per experiment on a two-column grid.

    Parameters
    ----------
    all_datasets : dict
        Mapping of experiment name -> DataFrame.

    Each panel shows the NaN count of every column that has missing values,
    with the panel total in a corner box; an experiment without NaN gets a
    "No NaN" note instead. Unused grid cells are hidden. After the figure is
    shown, a per-experiment text summary is printed.
    """
    experiments = list(all_datasets.keys())
    n_experiments = len(experiments)

    # Grid size: fixed 2x2 up to four experiments, otherwise two columns
    if n_experiments <= 4:
        rows, cols = 2, 2
        figsize = (16, 10)
    else:
        rows = int(np.ceil(n_experiments / 2))
        cols = 2
        figsize = (16, 5 * rows)

    # squeeze=False always yields a 2-D array of Axes, so flatten() gives a flat
    # sequence of Axes for any experiment count. The previous single-experiment
    # special case wrapped the 2x2 array in a list, making axes[0] an ndarray
    # instead of an Axes and crashing on ax.bar.
    fig, axes = plt.subplots(rows, cols, figsize=figsize, squeeze=False)
    axes = axes.flatten()

    for ax, exp_name in zip(axes, experiments):
        panel_title = f'NaN Count - {exp_name.upper()}'
        nan_counts = all_datasets[exp_name].isnull().sum()
        has_nan = nan_counts[nan_counts > 0]

        if len(has_nan) == 0:
            # Nothing missing in this experiment: show a note instead of bars
            ax.text(0.5, 0.5, f'No NaN\nin {exp_name.upper()}',
                    ha='center', va='center', transform=ax.transAxes,
                    fontsize=16, bbox=dict(boxstyle="round,pad=0.3", facecolor="lightgreen", alpha=0.7))
            ax.set_title(panel_title, fontsize=14, fontweight='bold')
            continue

        positions = range(len(has_nan))
        bars = ax.bar(positions, has_nan.values, color='red', alpha=0.7)

        ax.set_title(panel_title, fontsize=14, fontweight='bold')
        ax.set_xlabel('Features', fontsize=10)
        ax.set_ylabel('NaN Count', fontsize=10)

        # Shortened feature names keep the tick labels readable
        ax.set_xticks(positions)
        ax.set_xticklabels([_shorten_feature_label(name) for name in has_nan.index],
                           rotation=45, ha='right', fontsize=8)

        ax.grid(True, alpha=0.3)

        # Value labels above the bars, only when there are few of them
        if len(has_nan) <= 8:
            label_offset = has_nan.max() * 0.01
            for bar, value in zip(bars, has_nan.values):
                ax.text(bar.get_x() + bar.get_width() / 2,
                        bar.get_height() + label_offset,
                        f'{int(value)}', ha='center', va='bottom', fontsize=8)

        # Panel total in the top-left corner
        ax.text(0.02, 0.98, f'Total NaN: {has_nan.sum()}', transform=ax.transAxes,
                fontsize=10, verticalalignment='top',
                bbox=dict(boxstyle="round,pad=0.3", facecolor="yellow", alpha=0.7))

    # Hide the grid cells that have no experiment
    for unused_ax in axes[n_experiments:]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

    # Overall text summary
    print(f"\n{'='*60}")
    print("SUMMARY NaN PER ESPERIMENTO")
    print(f"{'='*60}")
    for exp_name, df in all_datasets.items():
        nan_counts = df.isnull().sum()
        has_nan = nan_counts[nan_counts > 0]
        total_nan = has_nan.sum() if len(has_nan) > 0 else 0
        features_with_nan = len(has_nan)

        print(f"{exp_name.upper():<15}: {total_nan:>6} NaN totali, {features_with_nan:>3} features con NaN")

# Usage example:

# All experiments in a single dashboard (one NaN-count panel per experiment)
plot_nan_histogram_all_experiments(all_datasets)


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

def normalize_0_100(series: pd.Series) -> pd.Series:
    """Min-max scale ``series`` onto the 0-100 range.

    Returns a series of zeros on the same index when the minimum or maximum
    is NaN or when both are equal (there is no spread to scale by).
    """
    lo, hi = series.min(), series.max()
    has_spread = not (pd.isna(lo) or pd.isna(hi) or lo == hi)
    if has_spread:
        return (series - lo) / (hi - lo) * 100
    return pd.Series([0] * len(series), index=series.index)

def process_series_for_plot(series: pd.Series, normalize: bool = True):
    """Return ``series`` scaled to 0-100 when ``normalize`` is true, else unchanged."""
    return normalize_0_100(series) if normalize else series

def calculate_cpu_cumulative(df: pd.DataFrame, metric_type: str):
    """Row-wise sum of every CPU column belonging to ``metric_type``.

    A column is selected when its lowercase name contains both the lowercase
    ``metric_type`` and 'cpu', and does not contain 'minutes'.
    Returns None when no column is selected.
    """
    wanted = metric_type.lower()
    selected = []
    for column in df.columns:
        lowered = column.lower()
        if wanted in lowered and 'cpu' in lowered and 'minutes' not in lowered:
            selected.append(column)

    return df[selected].sum(axis=1) if selected else None

def calculate_tcp_non_srtt_cumulative(df: pd.DataFrame):
    """Row-wise sum of the service-related columns that are not SRTT columns.

    A column is selected when its lowercase name contains one of 'api',
    'service', 'gateway', 'customer', 'vet', 'visit' and contains neither
    'srtt' nor 'minutes'. Returns None when no column is selected.
    """
    service_terms = ('api', 'service', 'gateway', 'customer', 'vet', 'visit')

    def _is_selected(column):
        lowered = column.lower()
        mentions_service = any(term in lowered for term in service_terms)
        return mentions_service and 'srtt' not in lowered and 'minutes' not in lowered

    selected = [column for column in df.columns if _is_selected(column)]
    if selected:
        return df[selected].sum(axis=1)
    return None

def calculate_total_network_traffic(df: pd.DataFrame):
    """Row-wise sum of every column whose lowercase name contains 'srtt'.

    Columns whose name contains 'minutes' are ignored.
    Returns None when no column is selected.
    """
    selected = []
    for column in df.columns:
        lowered = column.lower()
        if 'srtt' in lowered and 'minutes' not in lowered:
            selected.append(column)

    if len(selected) == 0:
        return None
    return df[selected].sum(axis=1)

def plot_cpu_cumulative_thin(df: pd.DataFrame, time_col='minutes', normalize=True):
    """Plot each CPU metric, summed over its matching columns, against ``time_col``.

    Returns without plotting when ``time_col`` is not a column of ``df``.
    The 30-80 range of the x axis is shaded and labelled 'Stress'.
    """
    if time_col not in df.columns:
        return

    if normalize:
        title_suffix, ylabel = "Normalizzate (0-100)", "Valore normalizzato (0-100)"
    else:
        title_suffix, ylabel = "Valori Originali", "Valore"

    # Metric name -> line colour, in drawing order
    metric_colors = {
        'iowait': 'red',
        'irq': 'blue',
        'system': 'green',
        'user': 'orange',
        'utilization': 'purple',
    }

    plt.figure(figsize=(14, 8))
    plt.axvspan(30, 80, alpha=0.2, color='red', label='Stress')

    for metric, line_color in metric_colors.items():
        summed = calculate_cpu_cumulative(df, metric)
        if summed is None:
            continue  # no column for this metric
        values = process_series_for_plot(summed, normalize)
        plt.plot(df[time_col], values,
                 label=f'CPU {metric} (cumulativo)',
                 linewidth=1.5,
                 color=line_color)

    plt.title(f"Metriche CPU Cumulative - {title_suffix}", fontsize=16, fontweight='bold')
    plt.xlabel("Tempo (minuti)", fontsize=12)
    plt.ylabel(ylabel, fontsize=12)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
    plt.grid(True, alpha=0.3)

    if normalize:
        plt.ylim(0, 100)

    plt.tight_layout()
    plt.show()

def plot_memory_metrics(df: pd.DataFrame, time_col='minutes', normalize=True):
    """Plot the first 'memavailable', 'memutil' and 'memcache' column of ``df``.

    Column matching is case-insensitive and skips names containing 'minutes';
    only the first matching column of each kind is drawn. Returns without
    plotting when ``time_col`` is not a column of ``df``.
    """
    if time_col not in df.columns:
        return

    def _matching(token):
        return [name for name in df.columns
                if token in name.lower() and 'minutes' not in name.lower()]

    # (candidate columns, legend label, line colour) in drawing order
    series_specs = [
        (_matching('memavailable'), 'Memory Available', 'blue'),
        (_matching('memutil'), 'Memory Util', 'green'),
        (_matching('memcache'), 'Memory Cache', 'orange'),
    ]

    plt.figure(figsize=(14, 8))
    plt.axvspan(30, 80, alpha=0.2, color='red', label='Stress')

    for candidates, label, color in series_specs:
        if not candidates:
            continue
        # The first matching column stands in for the whole metric
        values = process_series_for_plot(df[candidates[0]], normalize)
        plt.plot(df[time_col], values,
                 label=label,
                 linewidth=1.5,
                 color=color)

    if normalize:
        title_suffix, ylabel = "Normalizzate (0-100)", "Valore normalizzato (0-100)"
    else:
        title_suffix, ylabel = "Valori Originali", "Valore"

    plt.title(f"Metriche Memory - {title_suffix}", fontsize=16, fontweight='bold')
    plt.xlabel("Tempo (minuti)", fontsize=12)
    plt.ylabel(ylabel, fontsize=12)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
    plt.grid(True, alpha=0.3)

    if normalize:
        plt.ylim(0, 100)

    plt.tight_layout()
    plt.show()

def plot_tcp_network_traffic(df: pd.DataFrame, time_col='minutes', normalize=True):
    """Plot the summed SRTT columns and the summed non-SRTT service columns.

    The two curves are labelled 'Total Network Traffic' and 'Retransmitted
    Packets'. Returns without plotting when ``time_col`` is missing from
    ``df`` or when neither curve can be computed.
    """
    if time_col not in df.columns:
        return

    srtt_total = calculate_total_network_traffic(df)
    non_srtt_total = calculate_tcp_non_srtt_cumulative(df)
    if srtt_total is None and non_srtt_total is None:
        return

    plt.figure(figsize=(14, 8))
    plt.axvspan(30, 80, alpha=0.2, color='red', label='Stress')

    curves = (
        (srtt_total, 'Total Network Traffic', 'orange'),
        (non_srtt_total, 'Retransmitted Packets', 'purple'),
    )
    for series, label, color in curves:
        if series is None:
            continue
        values = process_series_for_plot(series, normalize)
        plt.plot(df[time_col], values,
                 label=label,
                 linewidth=1.5,
                 color=color)

    if normalize:
        title_suffix, ylabel = "Normalized (0-100)", "Normalized Value (0-100)"
    else:
        title_suffix, ylabel = "Original Values", "Value"

    plt.title(f"Network Traffic Metrics - {title_suffix}", fontsize=16, fontweight='bold')
    plt.xlabel("Time (minutes)", fontsize=12)
    plt.ylabel(ylabel, fontsize=12)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
    plt.grid(True, alpha=0.3)

    if normalize:
        plt.ylim(0, 100)

    plt.tight_layout()
    plt.show()

def plot_io_readwrite(df: pd.DataFrame, time_col='minutes', normalize=True):
    """Plot the row-wise sums of the 'readbytes' and 'writebytes' columns.

    Column matching is case-insensitive and skips names containing 'minutes'.
    Returns without plotting when ``time_col`` is missing from ``df`` or when
    no read/write column is found.
    """
    if time_col not in df.columns:
        return

    def _matching(token):
        return [name for name in df.columns
                if token in name.lower() and 'minutes' not in name.lower()]

    read_cols = _matching('readbytes')
    write_cols = _matching('writebytes')
    if not (read_cols or write_cols):
        return

    plt.figure(figsize=(14, 8))
    plt.axvspan(30, 80, alpha=0.2, color='red', label='Stress')

    curves = (
        (read_cols, 'Read Bytes (cumulativo)', 'green'),
        (write_cols, 'Write Bytes (cumulativo)', 'red'),
    )
    for columns, label, color in curves:
        if not columns:
            continue
        summed = df[columns].sum(axis=1)
        plt.plot(df[time_col], process_series_for_plot(summed, normalize),
                 label=label,
                 linewidth=1.5,
                 color=color)

    if normalize:
        title_suffix, ylabel = "Normalizzate (0-100)", "Valore normalizzato (0-100)"
    else:
        title_suffix, ylabel = "Valori Originali", "Valore"

    plt.title(f"Metriche IO - Read/Write Bytes Cumulative - {title_suffix}",
              fontsize=16, fontweight='bold')
    plt.xlabel("Tempo (minuti)", fontsize=12)
    plt.ylabel(ylabel, fontsize=12)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
    plt.grid(True, alpha=0.3)

    if normalize:
        plt.ylim(0, 100)

    plt.tight_layout()
    plt.show()

def _dashboard_column_sum(df, token):
    """Row-wise sum of the columns whose lowercase name contains ``token`` but not 'minutes', or None."""
    columns = [name for name in df.columns
               if token in name.lower() and 'minutes' not in name.lower()]
    return df[columns].sum(axis=1) if columns else None


def _dashboard_first_column(df, token):
    """First column whose lowercase name contains ``token`` but not 'minutes', as a Series, or None."""
    for name in df.columns:
        lowered = name.lower()
        if token in lowered and 'minutes' not in lowered:
            return df[name]
    return None


def plot_dashboard(df: pd.DataFrame, time_col='minutes', normalize=True):
    """Show a 2x2 dashboard: CPU, memory, network and IO metrics over time.

    Parameters
    ----------
    df : pd.DataFrame
        Metrics frame; the plotted columns are selected by substring match
        on their lowercase names.
    time_col : str
        Column used for the x axis of every panel.
    normalize : bool
        When true each curve is min-max scaled to 0-100 and the y axis of
        every panel is fixed to that range.

    Prints a warning and returns without plotting when ``time_col`` is not a
    column of ``df``. The 30-80 range of the x axis is shaded and labelled
    'Stress' on every panel.
    """
    # Guard shared with the single-metric plot functions: without it a missing
    # time column raised KeyError after an empty figure had been created.
    if time_col not in df.columns:
        print(f"Attenzione: colonna '{time_col}' non trovata nel DataFrame")
        return

    title_suffix = "Normalized (0-100)" if normalize else "Original Values"
    ylabel = "Normalized Value (0-100)" if normalize else "Value"

    cpu_metrics = ['iowait', 'irq', 'system', 'user', 'utilization']
    cpu_colors = ['red', 'blue', 'green', 'orange', 'purple']

    # One entry per panel, in grid order: (title, [(series or None, label, colour), ...])
    panels = [
        ("CPU Cumulative", [
            (calculate_cpu_cumulative(df, metric), f'CPU {metric}', color)
            for metric, color in zip(cpu_metrics, cpu_colors)
        ]),
        ("Memory", [
            # Only the first matching column represents each memory metric
            (_dashboard_first_column(df, 'memavailable'), 'Memory Available', 'blue'),
            (_dashboard_first_column(df, 'memutil'), 'Memory Util', 'green'),
            (_dashboard_first_column(df, 'memcache'), 'Memory Cache', 'orange'),
        ]),
        ("Network Traffic", [
            (calculate_total_network_traffic(df), 'Total Network Traffic', 'orange'),
            (calculate_tcp_non_srtt_cumulative(df), 'Retransmitted Packets', 'purple'),
        ]),
        ("IO Read/Write", [
            (_dashboard_column_sum(df, 'readbytes'), 'Read Bytes', 'green'),
            (_dashboard_column_sum(df, 'writebytes'), 'Write Bytes', 'red'),
        ]),
    ]

    fig, axes = plt.subplots(2, 2, figsize=(20, 12))

    # axes.flat walks the grid row by row: CPU, memory, network, IO
    for ax, (panel_title, curves) in zip(axes.flat, panels):
        ax.axvspan(30, 80, alpha=0.2, color='red', label='Stress')

        for series, label, color in curves:
            if series is None:
                continue  # metric not present in this frame
            ax.plot(df[time_col], process_series_for_plot(series, normalize),
                    label=label,
                    linewidth=1.5,
                    color=color)

        ax.set_title(f"{panel_title} - {title_suffix}", fontsize=14, fontweight='bold')
        ax.set_xlabel("Minutes", fontsize=10)
        ax.set_ylabel(ylabel, fontsize=10)
        ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=8)
        ax.grid(True, alpha=0.3)
        if normalize:
            ax.set_ylim(0, 100)

    plt.tight_layout()
    plt.show()

def plot_all_simplified_metrics(df: pd.DataFrame, time_col='minutes', normalize=True):
    """Draw the CPU, memory, network and IO figures one after the other."""
    plotters = (
        plot_cpu_cumulative_thin,
        plot_memory_metrics,
        plot_tcp_network_traffic,
        plot_io_readwrite,
    )
    for draw in plotters:
        draw(df, time_col, normalize)

def compare_normalized_vs_original(df, time_col='minutes'):
    """Show the dashboard twice: first normalized, then with the original values."""
    for use_normalized in (True, False):
        plot_dashboard(df, time_col, normalize=use_normalized)
# Dashboard with normalized values (normalize defaults to True);
# requires an experiment folder named 'baseline' to have been loaded.
plot_dashboard(all_datasets['baseline'])

# Dashboard with the original values
plot_dashboard(all_datasets['baseline'], normalize=False)



In [None]:
import pandas as pd
import numpy as np

def clean_dataset(df):
    """Return a cleaned copy of ``df``; the input frame is not modified.

    Steps, in order:
    1. Drop every column whose name contains 'cpu_8', 'cpu_9', 'cpu 8' or
       'cpu 9'.
    2. In columns whose lowercase name contains one of 'api', 'service',
       'gateway', 'customer', 'vet', 'visit' and does not contain 'srtt',
       replace NaN with 0.
    3. In every other column that still has NaN: numeric columns are linearly
       interpolated, then back-filled and forward-filled; non-numeric columns
       are only back-filled and forward-filled.
    """
    df = df.copy()

    # NOTE(review): substring match, so 'cpu_8' would also catch a name such
    # as 'cpu_80' — confirm the core numbering never goes that high.
    cpu_8_9_cols = [col for col in df.columns
                    if 'cpu_8' in col or 'cpu_9' in col or 'cpu 8' in col or 'cpu 9' in col]
    if cpu_8_9_cols:
        df = df.drop(columns=cpu_8_9_cols)

    service_terms = ('api', 'service', 'gateway', 'customer', 'vet', 'visit')
    tcp_cols = [col for col in df.columns
                if any(term in col.lower() for term in service_terms)
                and 'srtt' not in col.lower()]
    for col in tcp_cols:
        df[col] = df[col].fillna(0)

    zero_filled = set(tcp_cols)
    for col in list(df.columns):
        if col in zero_filled or not df[col].isnull().any():
            continue
        filled = df[col]
        # Linear interpolation is only defined for numbers; text columns
        # fall through to the plain back/forward fill below.
        if pd.api.types.is_numeric_dtype(filled):
            filled = filled.interpolate(method='linear')
        # bfill()/ffill() replace fillna(method=...), which recent pandas
        # versions deprecate. Back-fill first covers leading gaps.
        df[col] = filled.bfill().ffill()

    return df

def clean_all_datasets(all_datasets):
    """Run ``clean_dataset`` on every frame and return the results in a new dict.

    For each dataset a one-line report is printed: shape before and after,
    number of dropped columns and number of NaN values left.
    """
    cleaned_by_name = {}
    for name in all_datasets:
        raw = all_datasets[name]
        shape_before = raw.shape
        cleaned = clean_dataset(raw)
        cleaned_by_name[name] = cleaned

        # Per-dataset report
        n_dropped = shape_before[1] - cleaned.shape[1]
        remaining_nan = cleaned.isnull().sum().sum()
        print(f"  {name}: {shape_before} → {cleaned.shape} | Dropped: {n_dropped} | NaN: {remaining_nan}")

    print("✅ Done!")
    return cleaned_by_name


# Rebind all_datasets to the cleaned frames. clean_dataset works on a copy of
# each frame, so the DataFrame objects loaded earlier are not modified in place.
all_datasets = clean_all_datasets(all_datasets)

In [None]:
def normalize_0_100(series: pd.Series) -> pd.Series:
    """Scale a series to the 0-100 range using its own minimum and maximum.

    A series whose minimum or maximum is NaN, or whose values are all equal,
    is mapped to zeros on the same index.
    """
    smallest = series.min()
    largest = series.max()

    degenerate = pd.isna(smallest) or pd.isna(largest) or smallest == largest
    if degenerate:
        zeros = [0] * len(series)
        return pd.Series(zeros, index=series.index)

    spread = largest - smallest
    return (series - smallest) / spread * 100

def process_series_for_plot(series: pd.Series, normalize: bool = True):
    """Prepare a series for plotting: 0-100 scaled if ``normalize``, else as is."""
    if not normalize:
        return series
    return normalize_0_100(series)

def calculate_cpu_cumulative(df: pd.DataFrame, metric_type: str):
    """Sum one CPU metric across all of its matching columns, row by row.

    Matching columns contain both the lowercase ``metric_type`` and 'cpu' in
    their lowercase name and do not contain 'minutes'. Returns None when
    there is no matching column.
    """
    metric_token = metric_type.lower()

    def _belongs(column):
        lowered = column.lower()
        return metric_token in lowered and 'cpu' in lowered and 'minutes' not in lowered

    matching = list(filter(_belongs, df.columns))
    if len(matching) == 0:
        return None

    return df[matching].sum(axis=1)

def calculate_tcp_non_srtt_cumulative(df: pd.DataFrame):
    """Sum, row by row, the service-related columns that are not SRTT columns.

    The result is what the plots label 'Retransmitted Packets'. A column
    matches when its lowercase name contains one of 'api', 'service',
    'gateway', 'customer', 'vet', 'visit' and neither 'srtt' nor 'minutes'.
    Returns None when nothing matches.
    """
    keywords = ['api', 'service', 'gateway', 'customer', 'vet', 'visit']

    matching = [
        column for column in df.columns
        if any(keyword in column.lower() for keyword in keywords)
        and not ('srtt' in column.lower() or 'minutes' in column.lower())
    ]

    return df[matching].sum(axis=1) if matching else None

def calculate_total_network_traffic(df: pd.DataFrame):
    """Sum, row by row, every SRTT column (the 'Total Network Traffic' curve).

    SRTT columns contain 'srtt' in their lowercase name and do not contain
    'minutes'. Returns None when there is no such column.
    """
    def _is_srtt(column):
        lowered = column.lower()
        return 'srtt' in lowered and 'minutes' not in lowered

    matching = [column for column in df.columns if _is_srtt(column)]
    return df[matching].sum(axis=1) if matching else None

def plot_cpu_cumulative_thin(df: pd.DataFrame, time_col='minutes', normalize=True):
    """Plot the cumulative CPU metrics with thin lines.

    Prints a warning and returns without plotting when ``time_col`` is not a
    column of ``df``. The 30-80 range of the x axis is shaded as 'Stress'.
    """
    if time_col not in df.columns:
        print(f"Attenzione: colonna '{time_col}' non trovata nel DataFrame")
        return

    metric_names = ['iowait', 'irq', 'system', 'user', 'utilization']
    palette = ['red', 'blue', 'green', 'orange', 'purple']

    fig, ax = plt.subplots(figsize=(14, 8))
    ax.axvspan(30, 80, alpha=0.2, color='red', label='Stress')

    for position, metric in enumerate(metric_names):
        summed = calculate_cpu_cumulative(df, metric)
        if summed is None:
            continue
        ax.plot(df[time_col], process_series_for_plot(summed, normalize),
                label=f'CPU {metric} (cumulativo)',
                linewidth=1.5,
                color=palette[position % len(palette)])

    if normalize:
        title_suffix = "Normalizzate (0-100)"
        ylabel = "Valore normalizzato (0-100)"
    else:
        title_suffix = "Valori Originali"
        ylabel = "Valore"

    ax.set_title(f"Metriche CPU Cumulative - {title_suffix}", fontsize=16, fontweight='bold')
    ax.set_xlabel("Tempo (minuti)", fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
    ax.grid(True, alpha=0.3)

    if normalize:
        ax.set_ylim(0, 100)

    plt.tight_layout()
    plt.show()

def plot_memory_metrics(df: pd.DataFrame, time_col='minutes', normalize=True):
    """Plot the memory metrics: available, utilisation and cache.

    For each of 'memavailable', 'memutil' and 'memcache' the first column
    whose lowercase name contains the token (and not 'minutes') is drawn.
    Prints a warning and returns without plotting when ``time_col`` is not a
    column of ``df``.
    """
    if time_col not in df.columns:
        print(f"Attenzione: colonna '{time_col}' non trovata nel DataFrame")
        return

    # (column-name token, legend label, line colour) in drawing order
    memory_specs = (
        ('memavailable', 'Memory Available', 'blue'),
        ('memutil', 'Memory Util', 'green'),
        ('memcache', 'Memory Cache', 'orange'),
    )
    to_draw = []
    for token, label, color in memory_specs:
        hits = [name for name in df.columns
                if token in name.lower() and 'minutes' not in name.lower()]
        if hits:
            # Only the first matching column is plotted
            to_draw.append((hits[0], label, color))

    fig, ax = plt.subplots(figsize=(14, 8))
    ax.axvspan(30, 80, alpha=0.2, color='red', label='Stress')

    for column, label, color in to_draw:
        ax.plot(df[time_col], process_series_for_plot(df[column], normalize),
                label=label,
                linewidth=1.5,
                color=color)

    if normalize:
        title_suffix = "Normalizzate (0-100)"
        ylabel = "Valore normalizzato (0-100)"
    else:
        title_suffix = "Valori Originali"
        ylabel = "Valore"

    ax.set_title(f"Metriche Memory - {title_suffix}", fontsize=16, fontweight='bold')
    ax.set_xlabel("Tempo (minuti)", fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
    ax.grid(True, alpha=0.3)

    if normalize:
        ax.set_ylim(0, 100)

    plt.tight_layout()
    plt.show()

def plot_tcp_network_traffic(df: pd.DataFrame, time_col='minutes', normalize=True):
    """Plot 'Total Network Traffic' (summed SRTT columns) and 'Retransmitted Packets'.

    Prints a message and returns without plotting when ``time_col`` is not a
    column of ``df`` or when neither curve can be computed.
    """
    if time_col not in df.columns:
        print(f"Attenzione: colonna '{time_col}' non trovata nel DataFrame")
        return

    # SRTT sum first, then the non-SRTT service-column sum
    srtt_sum = calculate_total_network_traffic(df)
    non_srtt_sum = calculate_tcp_non_srtt_cumulative(df)

    nothing_to_plot = srtt_sum is None and non_srtt_sum is None
    if nothing_to_plot:
        print("Nessuna metrica TCP/Network trovata")
        return

    fig, ax = plt.subplots(figsize=(14, 8))
    ax.axvspan(30, 80, alpha=0.2, color='red', label='Stress')

    # Each curve is drawn only when its columns exist
    for series, label, color in (
        (srtt_sum, 'Total Network Traffic', 'orange'),
        (non_srtt_sum, 'Retransmitted Packets', 'purple'),
    ):
        if series is not None:
            ax.plot(df[time_col], process_series_for_plot(series, normalize),
                    label=label,
                    linewidth=1.5,
                    color=color)

    if normalize:
        title_suffix = "Normalized (0-100)"
        ylabel = "Normalized Value (0-100)"
    else:
        title_suffix = "Original Values"
        ylabel = "Value"

    ax.set_title(f"Network Traffic Metrics - {title_suffix}", fontsize=16, fontweight='bold')
    ax.set_xlabel("Time (minutes)", fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
    ax.grid(True, alpha=0.3)

    if normalize:
        ax.set_ylim(0, 100)

    plt.tight_layout()
    plt.show()

def plot_io_readwrite(df: pd.DataFrame, time_col='minutes', normalize=True):
    """Plot the IO metrics: summed 'readbytes' and summed 'writebytes' columns.

    Prints a message and returns without plotting when ``time_col`` is not a
    column of ``df`` or when no read/write column is found.
    """
    if time_col not in df.columns:
        print(f"Attenzione: colonna '{time_col}' non trovata nel DataFrame")
        return

    read_cols, write_cols = [], []
    for name in df.columns:
        lowered = name.lower()
        if 'minutes' in lowered:
            continue
        if 'readbytes' in lowered:
            read_cols.append(name)
        if 'writebytes' in lowered:
            write_cols.append(name)

    if not read_cols and not write_cols:
        print("Nessuna metrica readbytes/writebytes trovata")
        return

    fig, ax = plt.subplots(figsize=(14, 8))
    ax.axvspan(30, 80, alpha=0.2, color='red', label='Stress')

    for columns, label, color in (
        (read_cols, 'Read Bytes (cumulativo)', 'green'),
        (write_cols, 'Write Bytes (cumulativo)', 'red'),
    ):
        if columns:
            summed = df[columns].sum(axis=1)
            ax.plot(df[time_col], process_series_for_plot(summed, normalize),
                    label=label,
                    linewidth=1.5,
                    color=color)

    if normalize:
        title_suffix = "Normalizzate (0-100)"
        ylabel = "Valore normalizzato (0-100)"
    else:
        title_suffix = "Valori Originali"
        ylabel = "Valore"

    ax.set_title(f"Metriche IO - Read/Write Bytes Cumulative - {title_suffix}",
                 fontsize=16, fontweight='bold')
    ax.set_xlabel("Tempo (minuti)", fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
    ax.grid(True, alpha=0.3)

    if normalize:
        ax.set_ylim(0, 100)

    plt.tight_layout()
    plt.show()

def plot_dashboard(df: pd.DataFrame, time_col='minutes', normalize=True):
    """Draw a 2x2 dashboard: CPU, memory, network traffic and IO panels.

    Every panel shades the x-range 30-80 (labelled 'Stress') and plots its
    series against ``df[time_col]`` after passing them through
    ``process_series_for_plot`` (defined elsewhere in this notebook).

    Panels:
        * CPU: one line per metric in iowait/irq/system/user/utilization, taken
          from ``calculate_cpu_cumulative(df, metric)``; a metric is skipped
          when that helper returns None.
        * Memory: the first column whose lower-cased name contains
          'memavailable', 'memutil' and 'memcache' respectively.
        * Network: ``calculate_total_network_traffic(df)`` and
          ``calculate_tcp_non_srtt_cumulative(df)``, each skipped when None.
        * IO: row-wise sum of the 'readbytes' and of the 'writebytes' columns.

    Args:
        df: metrics table.
        time_col: name of the column used for the x axis.
        normalize: forwarded to ``process_series_for_plot``; when truthy each
            y axis is fixed to 0-100.

    Returns:
        None. Prints a message and returns before creating any figure when
        ``time_col`` is not a column of ``df``.
    """
    # Same soft failure as plot_io_readwrite: without this guard a missing time
    # column raised KeyError only after an empty 2x2 figure had been created.
    if time_col not in df.columns:
        print(f"Attenzione: colonna '{time_col}' non trovata nel DataFrame")
        return

    _, axes = plt.subplots(2, 2, figsize=(20, 12))

    title_suffix = "Normalized (0-100)" if normalize else "Original Values"
    ylabel = "Normalized Value (0-100)" if normalize else "Value"
    time_values = df[time_col]

    def columns_containing(keyword):
        # Case-insensitive substring match, never picking up time columns.
        return [col for col in df.columns
                if keyword in col.lower() and 'minutes' not in col.lower()]

    def open_panel(ax):
        # Shade the stress window before any line is drawn.
        ax.axvspan(30, 80, alpha=0.2, color='red', label='Stress')
        return ax

    def draw_line(ax, series, label, color):
        # Apply the shared preprocessing and plot against the time axis.
        ax.plot(time_values, process_series_for_plot(series, normalize),
                label=label,
                linewidth=1.5,
                color=color)

    def close_panel(ax, heading):
        # Titles, labels, legend outside the axes, grid and optional 0-100 range.
        ax.set_title(f"{heading} - {title_suffix}", fontsize=14, fontweight='bold')
        ax.set_xlabel("Minutes", fontsize=10)
        ax.set_ylabel(ylabel, fontsize=10)
        ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=8)
        ax.grid(True, alpha=0.3)
        if normalize:
            ax.set_ylim(0, 100)

    # --- CPU panel -------------------------------------------------------
    cpu_ax = open_panel(axes[0, 0])
    cpu_styles = (
        ('iowait', 'red'),
        ('irq', 'blue'),
        ('system', 'green'),
        ('user', 'orange'),
        ('utilization', 'purple'),
    )
    for metric, color in cpu_styles:
        cumulative = calculate_cpu_cumulative(df, metric)
        if cumulative is not None:
            draw_line(cpu_ax, cumulative, f'CPU {metric}', color)
    close_panel(cpu_ax, "CPU Cumulative")

    # --- Memory panel ----------------------------------------------------
    memory_ax = open_panel(axes[0, 1])
    memory_styles = (
        ('memavailable', 'Memory Available', 'blue'),
        ('memutil', 'Memory Util', 'green'),
        ('memcache', 'Memory Cache', 'orange'),
    )
    for keyword, label, color in memory_styles:
        matches = columns_containing(keyword)
        if matches:
            # Only the first matching column of each kind is shown.
            draw_line(memory_ax, df[matches[0]], label, color)
    close_panel(memory_ax, "Memory")

    # --- Network panel ---------------------------------------------------
    network_ax = open_panel(axes[1, 0])
    total_network_traffic = calculate_total_network_traffic(df)
    retransmitted_packets = calculate_tcp_non_srtt_cumulative(df)
    if total_network_traffic is not None:
        draw_line(network_ax, total_network_traffic, 'Total Network Traffic', 'orange')
    if retransmitted_packets is not None:
        draw_line(network_ax, retransmitted_packets, 'Retransmitted Packets', 'purple')
    close_panel(network_ax, "Network Traffic")

    # --- IO panel --------------------------------------------------------
    io_ax = open_panel(axes[1, 1])
    io_styles = (
        ('readbytes', 'Read Bytes', 'green'),
        ('writebytes', 'Write Bytes', 'red'),
    )
    for keyword, label, color in io_styles:
        matches = columns_containing(keyword)
        if matches:
            draw_line(io_ax, df[matches].sum(axis=1), label, color)
    close_panel(io_ax, "IO Read/Write")

    plt.tight_layout()
    plt.show()

def plot_all_simplified_metrics(df: pd.DataFrame, time_col='minutes', normalize=True):
    """Run the four single-figure plots (CPU, memory, network, IO) in sequence.

    A heading is printed before each plot. ``df``, ``time_col`` and
    ``normalize`` are forwarded positionally to every plotting function.
    """
    if normalize:
        norm_text = "normalizzati"
    else:
        norm_text = "valori originali"
    print(f"Plotting grafici con {norm_text}...")

    # Each plot call is wrapped in a lambda so the plotting function is looked
    # up only when its turn comes, exactly as with four sequential statements.
    steps = (
        ("1. CPU Cumulative Metrics:",
         lambda: plot_cpu_cumulative_thin(df, time_col, normalize)),
        ("\n2. Memory Metrics:",
         lambda: plot_memory_metrics(df, time_col, normalize)),
        ("\n3. Network Traffic Metrics:",
         lambda: plot_tcp_network_traffic(df, time_col, normalize)),
        ("\n4. IO Read/Write Bytes:",
         lambda: plot_io_readwrite(df, time_col, normalize)),
    )
    for heading, draw in steps:
        print(heading)
        draw()

def compare_normalized_vs_original(df, time_col='minutes'):
    """Show the dashboard twice: first normalized, then with original values."""
    variants = (
        ("=== DASHBOARD NORMALIZZATA ===", True),
        ("\n=== DASHBOARD VALORI ORIGINALI ===", False),
    )
    for banner, normalized in variants:
        print(banner)
        plot_dashboard(df, time_col, normalize=normalized)

# Usage examples (executed when the cell runs; both need `all_datasets`
# from the loading cell at the top of the notebook).

# Normalized dashboard (normalize defaults to True)
plot_dashboard(all_datasets['baseline'])

# Dashboard with the original, un-normalized values
plot_dashboard(all_datasets['baseline'], normalize=False)



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def super_clean_heatmap(data, title="Clean Correlation Heatmap", show_labels=True):
    """Plot a correlation heatmap after dropping columns that only add noise.

    Cleaning steps, applied in this order to the numeric columns of ``data``:
        1. drop the 'minutes' column;
        2. drop columns whose name contains 'cpu_8', 'cpu_9', 'cpu 8' or 'cpu 9';
        3. drop columns whose variance is 0 or whose sum is 0;
        4. drop columns with more than 50% NaN;
        5. fill the remaining NaN with 0.

    Args:
        data: DataFrame with the metrics.
        title: heatmap title; the number of features is appended to it.
        show_labels: draw tick labels (on a larger figure) when True.

    Returns:
        The correlation matrix (NaN correlations replaced by 0). When cleaning
        leaves no column at all, nothing is plotted and an empty DataFrame is
        returned.
    """
    numeric_data = data.select_dtypes(include=[np.number])

    # The time axis is not a feature.
    if 'minutes' in numeric_data.columns:
        numeric_data = numeric_data.drop(columns=['minutes'])

    # NOTE(review): plain substring match, so e.g. 'cpu_80' would match too.
    cpu_markers = ('cpu_8', 'cpu_9', 'cpu 8', 'cpu 9')
    cpu_8_9_cols = [col for col in numeric_data.columns
                    if any(marker in col for marker in cpu_markers)]
    if cpu_8_9_cols:
        numeric_data = numeric_data.drop(columns=cpu_8_9_cols)
        print(f"🗑️ Removed {len(cpu_8_9_cols)} CPU 8/9 columns")

    # NOTE(review): `sum() == 0` also drops a varying column whose values
    # cancel out (e.g. [-1, 1]); harmless for non-negative metrics - confirm.
    zero_variance_cols = [col for col in numeric_data.columns
                          if numeric_data[col].var() == 0 or numeric_data[col].sum() == 0]
    if zero_variance_cols:
        numeric_data = numeric_data.drop(columns=zero_variance_cols)
        print(f"🗑️ Removed {len(zero_variance_cols)} zero-variance columns")

    nan_threshold = len(numeric_data) * 0.5  # more than half the rows missing
    high_nan_cols = [col for col in numeric_data.columns
                     if numeric_data[col].isnull().sum() > nan_threshold]
    if high_nan_cols:
        numeric_data = numeric_data.drop(columns=high_nan_cols)
        print(f"🗑️ Removed {len(high_nan_cols)} high-NaN columns")

    print(f"📊 Final dataset: {len(numeric_data.columns)} features")

    # An empty correlation matrix cannot be drawn (the colour-scale limits are
    # undefined), so stop here instead of failing inside seaborn.
    if len(numeric_data.columns) == 0:
        print("⚠️ No features left after cleaning - heatmap skipped")
        return pd.DataFrame()

    numeric_data = numeric_data.fillna(0)

    # Undefined correlations (NaN) are shown as 0.
    corr = numeric_data.corr().fillna(0)

    # Labelled heatmaps get a larger canvas so tick labels stay readable.
    figsize = (20, 18) if show_labels else (16, 14)
    fig, ax = plt.subplots(figsize=figsize)

    sns.heatmap(corr,
                cmap='RdYlBu_r',
                center=0,
                square=True,
                cbar_kws={"shrink": .6},
                xticklabels=show_labels,
                yticklabels=show_labels,
                ax=ax)

    ax.set_title(f'{title} ({len(corr)} features)', fontsize=16, pad=20)

    if show_labels:
        plt.xticks(rotation=45, ha='right', fontsize=9)
        plt.yticks(rotation=0, fontsize=9)
    else:
        ax.set_xlabel('')
        ax.set_ylabel('')

    # tight_layout recomputes the subplot margins, so the former
    # subplots_adjust(bottom=0.2, left=0.2) call had no lasting effect.
    plt.tight_layout()
    plt.show()

    return corr

def show_removed_columns(data):
    """Report which numeric columns the heatmap cleaning would drop.

    Mirrors the filters of ``super_clean_heatmap`` in the same order: the
    'minutes' column is excluded first, then CPU 8/9 columns, then
    zero-variance / zero-sum columns, then columns with more than 50% NaN.
    Each filter is evaluated on the columns that survived the previous one, so
    a column is counted once and the printed "Final features" figure equals
    the number of columns actually left.

    Args:
        data: DataFrame with the metrics.

    Returns:
        None; the report is printed.
    """
    numeric_data = data.select_dtypes(include=[np.number])

    # The heatmap never treats the time axis as a feature.
    if 'minutes' in numeric_data.columns:
        numeric_data = numeric_data.drop(columns=['minutes'])

    print("🔍 COLUMN ANALYSIS:")
    print("=" * 40)

    # CPU 8/9 - same four name patterns the heatmap uses.
    cpu_markers = ('cpu_8', 'cpu_9', 'cpu 8', 'cpu 9')
    cpu_8_9 = [col for col in numeric_data.columns
               if any(marker in col for marker in cpu_markers)]
    print(f"CPU 8/9 columns: {len(cpu_8_9)}")
    remaining = numeric_data.drop(columns=cpu_8_9)

    # Zero variance (or zero sum) among the columns still present.
    zero_var = [col for col in remaining.columns
                if remaining[col].var() == 0 or remaining[col].sum() == 0]
    print(f"Zero variance: {len(zero_var)}")
    if zero_var:
        print(f"  Examples: {zero_var[:3]}")
    remaining = remaining.drop(columns=zero_var)

    # High NaN: more than half of the rows missing.
    nan_threshold = len(remaining) * 0.5
    high_nan = [col for col in remaining.columns
                if remaining[col].isnull().sum() > nan_threshold]
    print(f"High NaN (>50%): {len(high_nan)}")
    if high_nan:
        print(f"  Examples: {high_nan[:3]}")
    remaining = remaining.drop(columns=high_nan)

    print(f"\nFinal features: {len(remaining.columns)}")

def quick_analysis(data):
    """Print a short data-quality summary of the numeric columns of ``data``.

    Reports the number of numeric columns, the total NaN count, how many
    columns contain at least one NaN and how many columns sum to zero.
    """
    numeric_data = data.select_dtypes(include=[np.number])
    nan_per_column = numeric_data.isnull().sum()

    report = (
        "📊 DATA QUALITY CHECK:",
        "-" * 30,
        f"Total columns: {len(numeric_data.columns)}",
        f"Total NaN: {nan_per_column.sum()}",
        f"Columns with NaN: {(nan_per_column > 0).sum()}",
        f"Zero-sum columns: {(numeric_data.sum() == 0).sum()}",
    )
    for line in report:
        print(line)

# QUICK FUNCTIONS
def labeled_heatmap(data):
    """Shortcut for ``super_clean_heatmap`` with tick labels shown; returns its result."""
    correlation = super_clean_heatmap(data, show_labels=True)
    return correlation

def clean_heatmap(data):
    """Shortcut for ``super_clean_heatmap`` with tick labels hidden; returns its result."""
    correlation = super_clean_heatmap(data, show_labels=False)
    return correlation

# USAGE EXAMPLES
# The prints below only display a cheat-sheet of the available calls.
print("🎨 SUPER CLEAN HEATMAP READY!")
print("=" * 35)
print("1. With labels (detailed):")
print("   labeled_heatmap(all_datasets['baseline'])")
print("")
print("2. Without labels (clean):")
print("   clean_heatmap(all_datasets['baseline'])")
print("")
print("3. Custom:")
print("   super_clean_heatmap(data, show_labels=True)")

# Actual execution: both heatmaps for the 'baseline' dataset (requires
# `all_datasets` from the loading cell). The second call is the last
# expression of the cell, so its returned correlation matrix is what a
# notebook would display as the cell output.
labeled_heatmap(all_datasets['baseline'])
clean_heatmap(all_datasets['baseline'])

## feature engineering

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, f_regression
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

class TimeSeriesFeatureEngineer:
    """Derive time-series features (lags, rolling stats, trends, ...) from a metrics table.

    The constructor copies ``data``, cleans it (see ``_clean_data``), sorts it
    by ``time_col`` and groups the numeric columns into metric families by
    substring match on the lower-cased column name ('io_', 'cpu_', 'memory_',
    'network_'). Every ``create_*`` method appends columns to ``self.features``
    and returns ``self`` so calls can be chained.

    Attributes:
        data: untouched copy of the input frame.
        features: working frame that accumulates the engineered columns.
        time_col: name of the time column.
        io_metrics, cpu_metrics, memory_metrics, network_metrics: lists of
            column names belonging to each metric family.
    """

    # Aggregations create_rolling_features knows about; other names are skipped.
    _ROLLING_STATS = ('mean', 'std', 'min', 'max')

    def __init__(self, data, time_col='minutes'):
        self.data = data.copy()
        self.time_col = time_col
        self.features = data.copy()

        self._clean_data()
        self.features = self.features.sort_values(time_col).reset_index(drop=True)

        # Identify metric categories
        numeric_cols = self.features.select_dtypes(include=[np.number]).columns
        self.io_metrics = [col for col in numeric_cols if 'io_' in col.lower()]
        self.cpu_metrics = [col for col in numeric_cols if 'cpu_' in col.lower()]
        self.memory_metrics = [col for col in numeric_cols if 'memory_' in col.lower()]
        self.network_metrics = [col for col in numeric_cols if 'network_' in col.lower()]

    # ------------------------------------------------------------------ helpers

    def _default_columns(self, cpu=0, memory=0, io=0):
        """Return up to ``cpu``/``memory``/``io`` default source columns per family.

        CPU columns must contain 'utilization', memory columns 'util' and IO
        columns 'bytes' (case-sensitive, as the original selection was).
        """
        picked = [col for col in self.cpu_metrics if 'utilization' in col][:cpu]
        picked += [col for col in self.memory_metrics if 'util' in col][:memory]
        picked += [col for col in self.io_metrics if 'bytes' in col][:io]
        return picked

    def _existing(self, columns):
        """Keep only the names that are columns of ``self.features``."""
        return [col for col in columns if col in self.features.columns]

    @staticmethod
    def _window_slope(y):
        """Least-squares slope of ``y`` against 0..len(y)-1; 0 when it cannot be computed."""
        if len(y) < 2:
            return 0
        x = np.arange(len(y))
        try:
            return stats.linregress(x, y).slope
        except Exception:
            # Best effort: a window the regression rejects contributes a flat trend.
            return 0

    def _clean_data(self):
        """Make every column except the time column numeric, then fill NaN with 0.

        For each non-numeric column (the time column is left alone):
            * a text column whose non-null values all parse as numbers is
              converted to numeric and kept;
            * otherwise, if it has at most 10 distinct values, a label-encoded
              ``<col>_encoded`` column is added;
            * the original column is then dropped.
        """
        non_numeric_cols = self.features.select_dtypes(exclude=[np.number]).columns

        for col in non_numeric_cols:
            if col == self.time_col:
                continue

            series = self.features[col]

            # Numbers stored as text must be converted *before* the column is
            # dropped; doing it afterwards (as a separate pass) never ran.
            if pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series):
                try:
                    converted = pd.to_numeric(series, errors='coerce')
                except (TypeError, ValueError):
                    converted = None
                if (converted is not None
                        and converted.notna().any()
                        and converted.notna().sum() == series.notna().sum()):
                    # Lossless: every non-null value parsed.
                    self.features[col] = converted
                    continue

            if len(series.unique()) <= 10:
                le = LabelEncoder()
                self.features[f'{col}_encoded'] = le.fit_transform(series.astype(str))

            self.features = self.features.drop(columns=[col])

        # Fill NaN values
        numeric_cols = self.features.select_dtypes(include=[np.number]).columns
        self.features[numeric_cols] = self.features[numeric_cols].fillna(0)

    # ------------------------------------------------------------ feature steps

    def create_temporal_features(self):
        """Add index, time-of-day, cyclical and normalized-time columns.

        The modulo arithmetic (1440, 60) treats the time column as minutes -
        NOTE(review): confirm that unit for other datasets.
        """
        time_values = self.features[self.time_col]

        self.features['time_index'] = range(len(self.features))
        self.features['time_minutes'] = time_values
        self.features['hour_of_day'] = (time_values % 1440) / 60
        self.features['minute_of_hour'] = time_values % 60

        # Cyclical encoding
        self.features['hour_sin'] = np.sin(2 * np.pi * self.features['hour_of_day'] / 24)
        self.features['hour_cos'] = np.cos(2 * np.pi * self.features['hour_of_day'] / 24)
        self.features['minute_sin'] = np.sin(2 * np.pi * self.features['minute_of_hour'] / 60)
        self.features['minute_cos'] = np.cos(2 * np.pi * self.features['minute_of_hour'] / 60)

        max_time = time_values.max()
        if max_time != 0:
            self.features['time_normalized'] = time_values / max_time
        else:
            # All timestamps are 0: avoid a 0/0 division.
            self.features['time_normalized'] = 0.0
        return self

    def create_lag_features(self, columns=None, lags=(1, 2, 3, 5, 10)):
        """Add ``lag_<n>_<col>`` columns holding the value ``n`` rows earlier.

        Args:
            columns: source columns; defaults to up to 3 CPU utilization,
                2 memory util and 2 IO bytes columns.
            lags: row offsets to shift by.
        """
        if columns is None:
            columns = self._default_columns(cpu=3, memory=2, io=2)

        for col in self._existing(columns):
            for lag in lags:
                self.features[f"lag_{lag}_{col}"] = self.features[col].shift(lag)
        return self

    def create_rolling_features(self, columns=None, windows=(3, 5, 10, 20),
                                stats=('mean', 'std', 'min', 'max')):
        """Add ``rolling_<window>_<stat>_<col>`` columns (``min_periods=1``).

        Args:
            columns: source columns; defaults to up to 3 CPU utilization and
                1 memory util column.
            windows: rolling window sizes, in rows.
            stats: aggregation names; only mean/std/min/max are computed, any
                other name is ignored. (Inside this method the parameter hides
                the scipy ``stats`` module, which is not needed here.)
        """
        if columns is None:
            columns = self._default_columns(cpu=3, memory=1)

        for col in self._existing(columns):
            for window in windows:
                roller = self.features[col].rolling(window=window, min_periods=1)
                for stat in stats:
                    if stat not in self._ROLLING_STATS:
                        continue
                    self.features[f"rolling_{window}_{stat}_{col}"] = getattr(roller, stat)()
        return self

    def create_diff_features(self, columns=None, periods=(1, 2, 5)):
        """Add ``diff_<p>_<col>`` and ``pct_change_<p>_<col>`` columns.

        Infinite or missing percentage changes are stored as 0.
        """
        if columns is None:
            columns = self._default_columns(cpu=3, memory=1)

        for col in self._existing(columns):
            for period in periods:
                self.features[f"diff_{period}_{col}"] = self.features[col].diff(periods=period)
                pct_change = self.features[col].pct_change(periods=period)
                pct_change = pct_change.replace([np.inf, -np.inf], 0).fillna(0)
                self.features[f"pct_change_{period}_{col}"] = pct_change
        return self

    def create_aggregated_features(self):
        """Add per-row aggregates over each metric family (sum/mean, plus max/std for CPU)."""
        if self.cpu_metrics:
            cpu_data = self.features[self.cpu_metrics]
            self.features['cpu_total'] = cpu_data.sum(axis=1, skipna=True)
            self.features['cpu_avg'] = cpu_data.mean(axis=1, skipna=True)
            self.features['cpu_max'] = cpu_data.max(axis=1, skipna=True)
            self.features['cpu_std'] = cpu_data.std(axis=1, skipna=True)

        if self.memory_metrics:
            memory_data = self.features[self.memory_metrics]
            self.features['memory_total'] = memory_data.sum(axis=1, skipna=True)
            self.features['memory_avg'] = memory_data.mean(axis=1, skipna=True)

        if self.io_metrics:
            io_data = self.features[self.io_metrics]
            self.features['io_total'] = io_data.sum(axis=1, skipna=True)
            self.features['io_avg'] = io_data.mean(axis=1, skipna=True)
        return self

    def create_interaction_features(self, max_interactions=10):
        """Add product and ratio columns for pairs of key metrics.

        Each pair yields two columns (``interaction_a_x_b`` and
        ``ratio_a_div_b``); generation stops once ``max_interactions`` columns
        have been created. Zeros in the denominator are replaced by 1e-8.
        """
        key_metrics = self._existing(self._default_columns(cpu=3, memory=1, io=2))
        pairs = [(first, second)
                 for i, first in enumerate(key_metrics)
                 for second in key_metrics[i + 1:]]

        interaction_count = 0
        for col1, col2 in pairs:
            if interaction_count >= max_interactions:
                break

            self.features[f"interaction_{col1}_x_{col2}"] = self.features[col1] * self.features[col2]
            col2_safe = self.features[col2].replace(0, 1e-8)
            self.features[f"ratio_{col1}_div_{col2}"] = self.features[col1] / col2_safe
            interaction_count += 2
        return self

    def create_statistical_features(self, columns=None, windows=(5, 10, 20)):
        """Add rolling skewness, kurtosis and 25%/75% quantile columns.

        Skewness and kurtosis need at least 3 observations in the window; the
        quantiles are computed from the first observation on.
        """
        if columns is None:
            columns = self._default_columns(cpu=2, memory=1)

        for col in self._existing(columns):
            for window in windows:
                shape_roller = self.features[col].rolling(window=window, min_periods=3)
                self.features[f"skew_{window}_{col}"] = shape_roller.apply(
                    lambda x: stats.skew(x, nan_policy='omit'), raw=True)
                self.features[f"kurt_{window}_{col}"] = shape_roller.apply(
                    lambda x: stats.kurtosis(x, nan_policy='omit'), raw=True)

                quantile_roller = self.features[col].rolling(window=window, min_periods=1)
                self.features[f"q25_{window}_{col}"] = quantile_roller.quantile(0.25)
                self.features[f"q75_{window}_{col}"] = quantile_roller.quantile(0.75)
        return self

    def create_trend_features(self, columns=None, windows=(5, 10, 20)):
        """Add rolling regression slope (``trend_``) and mean first difference (``momentum_``).

        Momentum is only produced for windows of at least 3 rows.
        """
        if columns is None:
            columns = self._default_columns(cpu=2)

        for col in self._existing(columns):
            for window in windows:
                self.features[f"trend_{window}_{col}"] = (
                    self.features[col].rolling(window=window, min_periods=2)
                    .apply(self._window_slope, raw=True)
                )

                if window >= 3:
                    self.features[f"momentum_{window}_{col}"] = (
                        self.features[col].diff().rolling(window=window - 1, min_periods=1).mean()
                    )
        return self

    def create_all_features(self):
        """Run every ``create_*`` step, then remove NaN and infinities from numeric columns.

        Missing values are forward-filled, then backward-filled, then set to 0;
        any infinity left is replaced by 0.
        """
        self.create_temporal_features()
        self.create_lag_features()
        self.create_rolling_features()
        self.create_diff_features()
        self.create_aggregated_features()
        self.create_interaction_features()
        self.create_statistical_features()
        self.create_trend_features()

        # Clean final features. ffill()/bfill() replace the deprecated
        # fillna(method=...) spelling.
        numeric_cols = self.features.select_dtypes(include=[np.number]).columns
        cleaned = self.features[numeric_cols].ffill().bfill().fillna(0)
        self.features[numeric_cols] = cleaned.replace([np.inf, -np.inf], 0)

        return self

    # ------------------------------------------------------ selection / plotting

    def select_features(self, target_col, k=50):
        """Pick the ``k`` features most related to ``target_col``.

        Uses ``SelectKBest`` with ``f_regression``; if that fails for any
        reason, falls back to ranking by absolute correlation with the target.

        Returns:
            ``(selected_features, scores)``. ``scores`` holds one value per
            candidate feature, in the order of the numeric columns of
            ``self.features`` (target and time column excluded) - not only for
            the selected ones. ``([], None)`` when the target is missing or
            there is no candidate feature.
        """
        if target_col not in self.features.columns:
            return [], None

        numeric_cols = self.features.select_dtypes(include=[np.number]).columns
        feature_cols = [col for col in numeric_cols if col not in [target_col, self.time_col]]

        if len(feature_cols) == 0:
            return [], None

        X = self.features[feature_cols]
        y = self.features[target_col]

        X = X.replace([np.inf, -np.inf], np.nan).fillna(0)
        y = y.replace([np.inf, -np.inf], np.nan).fillna(0)

        try:
            selector = SelectKBest(score_func=f_regression, k=min(k, len(feature_cols)))
            selector.fit(X, y)
            selected_features = [feature_cols[i] for i in selector.get_support(indices=True)]
            return selected_features, selector.scores_
        except Exception:
            # Deliberate best-effort fallback. The scores are returned in
            # feature_cols order (like selector.scores_); returning them sorted
            # would pair them with the wrong names in plot_feature_importance.
            correlations = X.corrwith(y).abs()
            ranked = correlations.sort_values(ascending=False)
            selected_features = ranked.head(k).index.tolist()
            return selected_features, correlations.to_numpy()

    def plot_feature_importance(self, target_col, top_n=20):
        """Bar-plot the ``top_n`` highest feature scores for ``target_col``.

        Does nothing when ``select_features`` finds no feature or no scores.
        """
        selected_features, scores = self.select_features(target_col, k=top_n * 2)

        if len(selected_features) == 0 or scores is None:
            return

        numeric_cols = self.features.select_dtypes(include=[np.number]).columns
        feature_cols = [col for col in numeric_cols if col not in [target_col, self.time_col]]

        if len(feature_cols) == 0:
            return

        # scores has one entry per candidate feature; the slices only protect
        # against a length mismatch.
        importance_df = pd.DataFrame({
            'feature': feature_cols[:len(scores)],
            'importance': scores[:len(feature_cols)]
        }).sort_values('importance', ascending=False).head(top_n)

        plt.figure(figsize=(12, 8))
        sns.barplot(data=importance_df, x='importance', y='feature')
        plt.title(f'Top {top_n} Features for {target_col}')
        plt.xlabel('F-score')
        plt.tight_layout()
        plt.show()

def create_forecasting_dataset(fe, target_metric, selected_features, forecast_horizon=5):
    """Build a chronological train/validation split for forecasting ``target_metric``.

    The label is ``target_metric`` shifted ``forecast_horizon`` rows into the
    future; rows with any missing value (including the last rows, which have no
    future label) are dropped. The first 80% of the remaining rows (at least
    one) form the training set, the rest the validation set.

    Args:
        fe: object exposing ``features`` (DataFrame) and ``time_col`` (str),
            e.g. a ``TimeSeriesFeatureEngineer``.
        target_metric: column to forecast.
        selected_features: candidate feature names; names that are not columns
            of ``fe.features``, duplicates, the time column, the target and the
            shifted label itself are ignored.
        forecast_horizon: number of rows to look ahead.

    Returns:
        dict with ``X_train``, ``X_val``, ``y_train``, ``y_val``, ``time_train``,
        ``time_val`` and ``feature_names``; or None when there is nothing to
        model (no usable feature, missing target/time column, or no complete row).
    """
    if len(selected_features) == 0:
        return None

    frame = fe.features
    # Every other failure path returns None, so a missing column does too
    # instead of surfacing as a KeyError.
    if target_metric not in frame.columns or fe.time_col not in frame.columns:
        return None

    target_col = f'{target_metric}_target'
    forecast_data = frame.copy()
    forecast_data[target_col] = forecast_data[target_metric].shift(-forecast_horizon)

    # dict.fromkeys de-duplicates while keeping order; reserved names must not
    # become features (the shifted label would leak the answer).
    reserved = {fe.time_col, target_metric, target_col}
    feature_only_cols = [col for col in dict.fromkeys(selected_features)
                         if col in forecast_data.columns and col not in reserved]
    if not feature_only_cols:
        return None

    needed = list(dict.fromkeys(feature_only_cols + [fe.time_col, target_metric, target_col]))
    modeling_data = forecast_data[needed].dropna()

    if len(modeling_data) == 0:
        return None

    X = modeling_data[feature_only_cols]
    y = modeling_data[target_col]
    time_index = modeling_data[fe.time_col]

    # Chronological split: no shuffling, validation is the most recent 20%.
    split_idx = max(1, int(len(X) * 0.8))

    return {
        'X_train': X.iloc[:split_idx],
        'X_val': X.iloc[split_idx:],
        'y_train': y.iloc[:split_idx],
        'y_val': y.iloc[split_idx:],
        'time_train': time_index.iloc[:split_idx],
        'time_val': time_index.iloc[split_idx:],
        'feature_names': X.columns.tolist()
    }

def find_best_target(fe):
    """Choose a target column from ``fe.features`` by name-based priority.

    Priority, first match in column order wins:
        1. name contains 'cpu_utilization' and 'cpu 0';
        2. name contains 'cpu' and 'utilization';
        3. name contains 'memory' and 'util';
        4. first numeric column, other than the time column, with variance > 0.

    Returns:
        The chosen column name, or None when nothing qualifies.
    """
    priority_rules = (
        lambda name: 'cpu_utilization' in name and 'cpu 0' in name,
        lambda name: 'cpu' in name and 'utilization' in name,
        lambda name: 'memory' in name and 'util' in name,
    )
    for rule in priority_rules:
        matches = [name for name in fe.features.columns if rule(name)]
        if matches:
            return matches[0]

    # Last resort: any numeric column that actually varies.
    numeric_names = fe.features.select_dtypes(include=[np.number]).columns
    varying = [
        name for name in numeric_names
        if name != fe.time_col and fe.features[name].var() > 0
    ]
    if not varying:
        return None
    return varying[0]

# MAIN EXECUTION - READY TO RUN
def run_feature_engineering(data=None):
    """Run the whole pipeline: engineer features, pick a target, select features, split.

    Args:
        data: metrics DataFrame. When None, ``all_datasets['baseline']`` from
            the notebook's global namespace is used.

    Returns:
        dict with ``fe`` (the feature engineer), ``target_metric``,
        ``selected_features`` and ``forecast_dataset``; or None when no data is
        available, no target can be chosen, or the forecasting dataset cannot
        be built. Progress and failures are reported with prints.
    """
    # Load data
    if data is None:
        try:
            data = all_datasets['baseline']
        except (NameError, KeyError):
            # Only "not loaded yet" / "no baseline experiment" are expected
            # here; anything else should surface instead of being swallowed.
            print("❌ Please provide data or ensure all_datasets['baseline'] exists")
            return None

    print(f"📊 Dataset: {data.shape[0]} samples, {data.shape[1]} columns")

    # Initialize feature engineer and create all features
    fe = TimeSeriesFeatureEngineer(data, time_col='minutes')
    fe.create_all_features()
    print(f"🔧 Created {len(fe.features.columns)} total features")

    # Find best target
    target_metric = find_best_target(fe)
    if not target_metric:
        print("❌ No suitable target metric found")
        return None

    print(f"🎯 Target: {target_metric}")

    # Select features
    selected_features, _ = fe.select_features(target_metric, k=30)
    print(f"✅ Selected {len(selected_features)} features")

    # Create forecasting dataset
    forecast_dataset = create_forecasting_dataset(fe, target_metric, selected_features)
    if not forecast_dataset:
        print("❌ Failed to create forecasting dataset")
        return None

    print(f"📈 Training: {len(forecast_dataset['X_train'])} samples")
    print(f"📈 Validation: {len(forecast_dataset['X_val'])} samples")
    print("✅ Ready for machine learning!")

    # Show feature importance
    fe.plot_feature_importance(target_metric, top_n=15)

    return {
        'fe': fe,
        'target_metric': target_metric,
        'selected_features': selected_features,
        'forecast_dataset': forecast_dataset
    }

# Run everything
# With data=None the pipeline falls back to all_datasets['baseline'].
if __name__ == "__main__":
    result = run_feature_engineering()
    
    # run_feature_engineering returns None on failure, so this block is skipped then.
    if result:
        # Access results: these become notebook-level variables for later cells.
        fe = result['fe']
        forecast_dataset = result['forecast_dataset']
        target_metric = result['target_metric']
        
        print(f"\n🎉 SUCCESS! Feature engineering complete.")
        print(f"📋 Available variables:")
        print(f"   - fe: Feature engineer object")
        print(f"   - forecast_dataset: Ready-to-use ML dataset")
        print(f"   - target_metric: '{target_metric}'")
        print(f"\n🚀 Next steps:")
        print(f"   - Train models on forecast_dataset['X_train'], forecast_dataset['y_train']")
        print(f"   - Validate on forecast_dataset['X_val'], forecast_dataset['y_val']")

LSTM

In [None]:

# Set working directory
# NOTE(review): hard-coded, user-specific absolute path - this call raises on
# any machine where the directory does not exist. Relative paths used after
# this point are resolved against it; consider a configurable base directory.
os.chdir('/home/alessandro/PGFDS/results/tuna/customers')

class OptimizedFailureDetector:
    """
    LSTM-based binary failure detector for multivariate metric time series.

    Techniques used to cope with class imbalance:
    1. Automatic selection of the decision threshold (max F1 on validation data)
    2. Class weighting
    3. Focal loss
    """

    def __init__(self, sequence_length=20, lstm_units=64, dropout_rate=0.3,
                 use_focal_loss=True, use_class_weights=True):
        """
        Args:
            sequence_length: number of consecutive rows in each input window.
            lstm_units: units of the first LSTM layer (the second uses half).
            dropout_rate: dropout rate applied after the recurrent layers.
            use_focal_loss: compile the model with focal loss instead of
                binary cross-entropy.
            use_class_weights: pass balanced class weights to ``fit``.
        """
        self.sequence_length = sequence_length
        self.lstm_units = lstm_units
        self.dropout_rate = dropout_rate
        self.scaler = StandardScaler()
        self.model = None

        # Optimization switches
        self.use_focal_loss = use_focal_loss
        self.use_class_weights = use_class_weights

        # Adaptive failure window, expressed as fractions of the experiment duration
        self.failure_start_percent = 0.2
        self.failure_end_percent = 0.6

        # Decision threshold (replaced by find_optimal_threshold)
        self.optimal_threshold = 0.5

    def focal_loss(self, alpha=0.75, gamma=2.0):
        """
        Build a binary focal-loss function.

        With many more "normal" than "failure" examples a model can minimise
        the loss by always predicting "normal". Focal loss counters this with:

        1. Alpha weighting: positives are weighted by ``alpha`` and negatives
           by ``1 - alpha`` (alpha=0.75 gives failures 3x the weight).
        2. Gamma focusing: the factor ``(1 - p_t) ** gamma`` shrinks the
           contribution of examples the model already classifies confidently,
           concentrating learning on hard/ambiguous examples.

        FL(p_t) = -alpha_t * (1 - p_t)^gamma * log(p_t)

        where p_t is the predicted probability of the true class.

        Args:
            alpha: weight of the positive (failure) class.
            gamma: focusing exponent.

        Returns:
            A callable ``loss(y_true, y_pred)`` returning the mean focal loss.
        """
        def focal_loss_fixed(y_true, y_pred):
            # Align dtype and shape with the predictions so the element-wise
            # ops below never broadcast (batch,) against (batch, 1).
            y_true = tf.reshape(tf.cast(y_true, y_pred.dtype), tf.shape(y_pred))

            # Avoid log(0)
            epsilon = tf.keras.backend.epsilon()
            y_pred = tf.clip_by_value(y_pred, epsilon, 1. - epsilon)

            is_failure = tf.equal(y_true, 1)

            # p_t = probability assigned to the true class
            p_t = tf.where(is_failure, y_pred, 1 - y_pred)

            # Alpha weighting: alpha for failures, 1 - alpha for normals
            alpha_factor = tf.ones_like(y_true) * alpha
            alpha_t = tf.where(is_failure, alpha_factor, 1 - alpha_factor)

            # Standard cross entropy
            cross_entropy = -tf.math.log(p_t)

            # Focusing term: small for easy examples (high p_t), large for hard ones
            weight = alpha_t * tf.pow((1 - p_t), gamma)

            return tf.reduce_mean(weight * cross_entropy)

        return focal_loss_fixed

    def load_and_label_data(self, unified_dir="unified"):
        """
        Load every ``<unified_dir>/<exp>/all_metrics_combined_<exp>.csv`` and
        label rows with an adaptive failure window.

        The ``baseline`` experiment is labelled entirely normal. For any other
        experiment, rows whose ``minutes`` value lies between
        ``failure_start_percent`` and ``failure_end_percent`` of the
        experiment duration are labelled ``is_failure = 1``.

        Args:
            unified_dir: directory containing one sub-directory per experiment.

        Returns:
            A single DataFrame with the added columns ``minutes``,
            ``is_failure`` and ``experiment``.

        Raises:
            ValueError: if no non-empty experiment CSV is found.
        """
        all_data = []

        print("📊 Loading data with adaptive failure windows...")

        # sorted() makes the load order independent of filesystem enumeration order
        for exp_dir in sorted(os.listdir(unified_dir)):
            exp_path = os.path.join(unified_dir, exp_dir)
            if not os.path.isdir(exp_path):
                continue

            csv_file = os.path.join(exp_path, f"all_metrics_combined_{exp_dir}.csv")
            if not os.path.exists(csv_file):
                continue

            print(f"Loading {exp_dir}...")
            df = pd.read_csv(csv_file)

            # An empty file has no duration and would divide by zero below
            if df.empty:
                print(f"  {exp_dir}: empty file, skipped")
                continue

            # Build the 'minutes' axis: use 'time' when it is fully numeric,
            # otherwise fall back to the row index.
            if 'time' in df.columns:
                df['minutes'] = pd.to_numeric(df['time'], errors='coerce')
                if df['minutes'].isna().any():
                    df['minutes'] = np.arange(len(df))
                df = df.drop(columns=['time'])
            else:
                df['minutes'] = np.arange(len(df))

            start = df['minutes'].min()
            duration = df['minutes'].max() - start

            # Adaptive labelling
            if exp_dir == 'baseline':
                df['is_failure'] = 0
                print("  Baseline: all normal")
            else:
                failure_start = start + (duration * self.failure_start_percent)
                failure_end = start + (duration * self.failure_end_percent)

                df['is_failure'] = ((df['minutes'] >= failure_start) &
                                    (df['minutes'] <= failure_end)).astype(int)

                failure_count = df['is_failure'].sum()
                print(f"  {exp_dir}: {failure_count}/{len(df)} failures ({failure_count/len(df)*100:.1f}%)")

            df['experiment'] = exp_dir
            all_data.append(df)

        if not all_data:
            raise ValueError(
                f"No 'all_metrics_combined_<experiment>.csv' files with data found under '{unified_dir}'"
            )

        return pd.concat(all_data, ignore_index=True)

    def prepare_features(self, df):
        """
        Select the numeric feature columns, excluding time and label columns.

        Args:
            df: DataFrame as returned by ``load_and_label_data``.

        Returns:
            Tuple ``(values, feature_columns)``: a 2-D array with NaN replaced
            by 0, and the list of column names in the same order.
        """
        excluded = {'minutes', 'time', 'is_failure', 'Timestamp', 'timestamp'}
        feature_columns = [col for col in df.select_dtypes(include=[np.number]).columns
                           if col not in excluded]

        print(f"Using {len(feature_columns)} features")
        return df[feature_columns].fillna(0).values, feature_columns

    def create_sequences(self, X, y, experiments):
        """
        Build sliding windows of ``sequence_length`` rows, one experiment at a time.

        Windows never span two experiments. Each window is labelled with the
        label of its last row. Experiments are processed in order of first
        appearance so the output order is reproducible across runs.

        Args:
            X: 2-D array of features, one row per time step.
            y: 1-D array of labels aligned with ``X``.
            experiments: 1-D array of experiment names aligned with ``X``.

        Returns:
            Tuple ``(X_sequences, y_sequences, experiment_info)`` where
            ``experiment_info`` is a list with the experiment name of each window.
        """
        experiments = np.asarray(experiments)

        X_sequences = []
        y_sequences = []
        experiment_info = []

        # dict.fromkeys keeps first-appearance order; a set would iterate
        # strings in a different order on every interpreter run.
        for exp_name in dict.fromkeys(experiments.tolist()):
            exp_mask = experiments == exp_name
            X_exp = X[exp_mask]
            y_exp = y[exp_mask]

            for start in range(len(X_exp) - self.sequence_length + 1):
                end = start + self.sequence_length
                X_sequences.append(X_exp[start:end])
                y_sequences.append(y_exp[end - 1])
                experiment_info.append(exp_name)

        return np.array(X_sequences), np.array(y_sequences), experiment_info

    def calculate_class_weights(self, y_train):
        """
        Compute balanced class weights for the training labels.

        With 1000 "normal" and 100 "failure" examples a model reaches ~90%
        accuracy by always predicting "normal". Balanced weights give the
        minority class a proportionally larger weight, e.g. for 90% / 10%:

        - normal weight  = 1 / (2 * 0.9) = 0.56
        - failure weight = 1 / (2 * 0.1) = 5.0

        so a mistake on a failure costs 9x more than a mistake on a normal row.

        Args:
            y_train: 1-D array of training labels.

        Returns:
            Dict mapping class label (int) to weight (float).
        """
        classes = np.unique(y_train)
        class_weights = compute_class_weight('balanced', classes=classes, y=y_train)

        # Plain int/float so the mapping has Python-native keys and values
        class_weight_dict = {int(cls): float(weight)
                             for cls, weight in zip(classes, class_weights)}

        print("\n⚖️  CLASS WEIGHTS CALCULATED:")
        for cls, weight in class_weight_dict.items():
            class_name = "Normal" if cls == 0 else "Failure"
            print(f"   {class_name}: {weight:.3f}")

        failure_weight = class_weight_dict.get(1, 1.0)
        normal_weight = class_weight_dict.get(0, 1.0)
        ratio = failure_weight / normal_weight

        print(f"   Failure examples are weighted {ratio:.1f}x more than normal")
        print("   This compensates for class imbalance")

        return class_weight_dict

    def build_optimized_model(self, input_shape):
        """
        Build and compile the stacked-LSTM classifier.

        Args:
            input_shape: ``(sequence_length, n_features)`` of one input window.

        Returns:
            The compiled model (also stored in ``self.model``).
        """
        model = Sequential([
            LSTM(self.lstm_units, return_sequences=True, input_shape=input_shape),
            BatchNormalization(),
            Dropout(self.dropout_rate),

            LSTM(self.lstm_units // 2, return_sequences=False),
            BatchNormalization(),
            Dropout(self.dropout_rate),

            Dense(32, activation='relu'),
            Dropout(self.dropout_rate / 2),

            Dense(16, activation='relu'),

            Dense(1, activation='sigmoid')
        ])

        # Choose the loss function
        if self.use_focal_loss:
            alpha, gamma = 0.75, 2.0
            print(f"\n🎯 Using FOCAL LOSS (alpha={alpha}, gamma={gamma})")
            print("   This will focus learning on hard examples and minority class")
            loss_function = self.focal_loss(alpha=alpha, gamma=gamma)
        else:
            print("\n📊 Using standard BINARY CROSSENTROPY")
            loss_function = 'binary_crossentropy'

        model.compile(
            optimizer='adam',
            loss=loss_function,
            metrics=['accuracy']
        )

        self.model = model
        return model

    def train_optimized_model(self, X_train, y_train, X_val, y_val, epochs=30):
        """
        Fit ``self.model`` with early stopping and learning-rate reduction.

        Class weights are computed from ``y_train`` and passed to ``fit`` when
        ``use_class_weights`` is set.

        Args:
            X_train, y_train: training windows and labels.
            X_val, y_val: validation windows and labels (monitored via ``val_loss``).
            epochs: maximum number of epochs.

        Returns:
            The history object returned by ``fit``.
        """
        callbacks = [
            EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
            ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)
        ]

        if self.use_class_weights:
            class_weight_dict = self.calculate_class_weights(y_train)
            print("\n🏋️ Training with CLASS WEIGHTS")
        else:
            class_weight_dict = None
            print("\n🏋️ Training WITHOUT class weights")

        history = self.model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=epochs,
            batch_size=32,
            callbacks=callbacks,
            class_weight=class_weight_dict,  # class balancing is applied here
            verbose=1
        )

        return history

    def find_optimal_threshold(self, X_val, y_val):
        """
        Pick the decision threshold that maximises F1 on the validation set.

        The default threshold 0.5 is arbitrary and ignores the relative cost
        of false alarms and missed failures. This method:

        1. computes precision/recall for every candidate threshold,
        2. computes F1 = 2 * P * R / (P + R) for each of them,
        3. stores the threshold with the highest F1 in ``self.optimal_threshold``.

        The chosen threshold follows the ``score >= threshold`` convention of
        ``precision_recall_curve``. It also prints a comparison against 0.5
        and plots the precision/recall curves.

        Args:
            X_val: validation windows.
            y_val: validation labels (0/1).

        Returns:
            The selected threshold as a float.
        """
        print("\n🔍 FINDING OPTIMAL THRESHOLD...")

        y_val = np.asarray(y_val).ravel()

        # Predicted probabilities on the validation set
        y_pred_proba = self.model.predict(X_val, verbose=0).flatten()

        precision, recall, thresholds = precision_recall_curve(y_val, y_pred_proba)

        # Metrics at the default threshold, for comparison
        default_predictions = (y_pred_proba > 0.5).astype(int)
        true_positives = np.sum((default_predictions == 1) & (y_val == 1))
        default_precision = true_positives / max(np.sum(default_predictions == 1), 1)
        default_recall = true_positives / max(np.sum(y_val == 1), 1)
        default_f1 = 2 * (default_precision * default_recall) / max(default_precision + default_recall, 1e-8)

        # precision/recall have one trailing entry (P=1, R=0) with no matching
        # threshold; only the first len(thresholds) entries are candidates.
        cand_precision = precision[:-1]
        cand_recall = recall[:-1]
        denom = cand_precision + cand_recall
        f1_scores = np.zeros_like(denom, dtype=float)
        valid = denom > 0  # also False for NaN, which leaves F1 at 0
        f1_scores[valid] = 2 * cand_precision[valid] * cand_recall[valid] / denom[valid]

        if len(thresholds) > 0:
            optimal_idx = int(np.argmax(f1_scores))
            self.optimal_threshold = float(thresholds[optimal_idx])
            optimal_f1 = f1_scores[optimal_idx]
            optimal_precision = cand_precision[optimal_idx]
            optimal_recall = cand_recall[optimal_idx]
        else:
            # No candidate threshold: keep the default and report its metrics
            self.optimal_threshold = 0.5
            optimal_f1 = default_f1
            optimal_precision = default_precision
            optimal_recall = default_recall

        print("   Default threshold (0.5):")
        print(f"     Precision: {default_precision:.3f}, Recall: {default_recall:.3f}, F1: {default_f1:.3f}")

        print(f"   Optimal threshold ({self.optimal_threshold:.3f}):")
        print(f"     Precision: {optimal_precision:.3f}, Recall: {optimal_recall:.3f}, F1: {optimal_f1:.3f}")

        improvement = ((optimal_f1 - default_f1) / default_f1 * 100) if default_f1 > 0 else 0
        print(f"   Improvement: {improvement:+.1f}% F1-score")

        # Left: precision/recall as a function of the threshold.
        # Right: precision-recall curve with both operating points.
        fig, (ax_thr, ax_pr) = plt.subplots(1, 2, figsize=(10, 4))

        ax_thr.plot(thresholds, cand_precision, 'b-', label='Precision')
        ax_thr.plot(thresholds, cand_recall, 'r-', label='Recall')
        ax_thr.axvline(self.optimal_threshold, color='green', linestyle='--',
                       label=f'Optimal ({self.optimal_threshold:.3f})')
        ax_thr.axvline(0.5, color='orange', linestyle='--', label='Default (0.5)')
        ax_thr.set_xlabel('Threshold')
        ax_thr.set_ylabel('Score')
        ax_thr.set_title('Precision/Recall vs Threshold')
        ax_thr.legend()
        ax_thr.grid(True, alpha=0.3)

        ax_pr.plot(recall, precision, 'b-')
        ax_pr.scatter(optimal_recall, optimal_precision, color='green', s=100,
                      label=f'Optimal (F1={optimal_f1:.3f})')
        ax_pr.scatter(default_recall, default_precision, color='orange', s=100,
                      label=f'Default (F1={default_f1:.3f})')
        ax_pr.set_xlabel('Recall')
        ax_pr.set_ylabel('Precision')
        ax_pr.set_title('Precision-Recall Curve')
        ax_pr.legend()
        ax_pr.grid(True, alpha=0.3)

        fig.tight_layout()
        plt.show()

        return self.optimal_threshold

    def evaluate_with_optimal_threshold(self, X_test, y_test):
        """
        Evaluate the model at ``self.optimal_threshold`` and compare with 0.5.

        Prints accuracy/precision/recall/F1 for both thresholds, the AUC when
        both classes are present, and plots the two confusion matrices.

        Args:
            X_test: test windows.
            y_test: test labels (0/1).

        Returns:
            Tuple ``(accuracy, precision, recall, f1)`` at the optimal threshold.
        """
        from sklearn.metrics import precision_score, recall_score, f1_score

        print("\n📊 EVALUATION WITH OPTIMAL THRESHOLD")
        print("="*60)

        y_test = np.asarray(y_test).ravel()
        y_pred_proba = self.model.predict(X_test, verbose=0).flatten()

        # '>=' matches the convention under which precision_recall_curve
        # defined (and find_optimal_threshold selected) the threshold.
        y_pred_optimal = (y_pred_proba >= self.optimal_threshold).astype(int)
        y_pred_default = (y_pred_proba > 0.5).astype(int)

        print("\n🔄 COMPARISON: Default vs Optimal Threshold")
        print("-" * 50)

        # Default threshold (0.5)
        acc_default = accuracy_score(y_test, y_pred_default)
        prec_default = precision_score(y_test, y_pred_default, zero_division=0)
        rec_default = recall_score(y_test, y_pred_default, zero_division=0)
        f1_default = f1_score(y_test, y_pred_default, zero_division=0)

        print(f"Default (0.5):     Acc={acc_default:.3f}, Prec={prec_default:.3f}, Rec={rec_default:.3f}, F1={f1_default:.3f}")

        # Optimal threshold
        acc_optimal = accuracy_score(y_test, y_pred_optimal)
        prec_optimal = precision_score(y_test, y_pred_optimal, zero_division=0)
        rec_optimal = recall_score(y_test, y_pred_optimal, zero_division=0)
        f1_optimal = f1_score(y_test, y_pred_optimal, zero_division=0)

        print(f"Optimal ({self.optimal_threshold:.3f}): Acc={acc_optimal:.3f}, Prec={prec_optimal:.3f}, Rec={rec_optimal:.3f}, F1={f1_optimal:.3f}")

        print("\n📈 IMPROVEMENTS:")
        print(f"   Accuracy:  {acc_optimal - acc_default:+.3f}")
        print(f"   Precision: {prec_optimal - prec_default:+.3f}")
        print(f"   Recall:    {rec_optimal - rec_default:+.3f} ⭐")
        print(f"   F1-Score:  {f1_optimal - f1_default:+.3f}")

        # AUC is only defined when both classes are present
        if len(np.unique(y_test)) > 1:
            auc_score = roc_auc_score(y_test, y_pred_proba)
            print(f"   AUC:       {auc_score:.3f} (unchanged - depends on ranking)")

        # labels=[0, 1] keeps the matrices 2x2 even when a class is absent,
        # so they always match the two tick labels below.
        cm_default = confusion_matrix(y_test, y_pred_default, labels=[0, 1])
        cm_optimal = confusion_matrix(y_test, y_pred_optimal, labels=[0, 1])

        class_names = ['Normal', 'Failure']
        fig, (ax_default, ax_optimal) = plt.subplots(1, 2, figsize=(12, 4))

        sns.heatmap(cm_default, annot=True, fmt='d', cmap='Blues',
                    xticklabels=class_names, yticklabels=class_names, ax=ax_default)
        ax_default.set_title('Default Threshold (0.5)')

        sns.heatmap(cm_optimal, annot=True, fmt='d', cmap='Reds',
                    xticklabels=class_names, yticklabels=class_names, ax=ax_optimal)
        ax_optimal.set_title(f'Optimal Threshold ({self.optimal_threshold:.3f})')

        fig.tight_layout()
        plt.show()

        return acc_optimal, prec_optimal, rec_optimal, f1_optimal

def create_failure_aware_split(X_seq, y_seq, exp_info, test_ratio=0.3):
    """
    Per-experiment split that keeps failure samples in the test set.

    For every experiment the last ``test_ratio`` fraction of its failure
    sequences and of its normal sequences goes to the test set, the rest to
    the training set. An experiment with at least one failure always
    contributes at least one failure sequence to the test set. Experiments
    are processed in order of first appearance, so the result is
    reproducible across runs.

    NOTE(review): consecutive sliding windows share rows, so train and test
    windows near the cut can overlap; this function does not leave a gap.

    Args:
        X_seq: sequence array (unused; kept for interface compatibility).
        y_seq: 1-D array of sequence labels (0 = normal, 1 = failure).
        exp_info: experiment name of each sequence, aligned with ``y_seq``.
        test_ratio: fraction of each group assigned to the test set.

    Returns:
        Tuple ``(train_indices, test_indices)`` of lists of integer indices
        into the sequence arrays.
    """
    def _tail_split(indices, n_test):
        # Split on an explicit cut position: slicing with [-n:] / [:-n]
        # breaks for n == 0 (x[-0:] is everything, x[:-0] is empty).
        n_test = min(max(n_test, 0), len(indices))
        cut = len(indices) - n_test
        return indices[:cut], indices[cut:]

    y_seq = np.asarray(y_seq)
    exp_array = np.asarray(exp_info)  # built once instead of once per experiment

    train_indices = []
    test_indices = []

    # dict.fromkeys preserves first-appearance order (a set does not)
    for exp in dict.fromkeys(exp_array.tolist()):
        exp_indices = np.where(exp_array == exp)[0]
        exp_labels = y_seq[exp_indices]

        failure_indices = exp_indices[exp_labels == 1]
        normal_indices = exp_indices[exp_labels == 0]

        if len(failure_indices) > 0:
            # Keep failures in both train and test (at least one in test)
            n_failure_test = max(1, int(len(failure_indices) * test_ratio))
            failure_train, failure_test = _tail_split(failure_indices, n_failure_test)

            n_normal_test = int(len(normal_indices) * test_ratio)
            normal_train, normal_test = _tail_split(normal_indices, n_normal_test)

            train_indices.extend(failure_train)
            train_indices.extend(normal_train)
            test_indices.extend(failure_test)
            test_indices.extend(normal_test)
        else:
            # Normal-only experiment (baseline)
            n_test = int(len(exp_indices) * test_ratio)
            exp_train, exp_test = _tail_split(exp_indices, n_test)
            test_indices.extend(exp_test)
            train_indices.extend(exp_train)

    return train_indices, test_indices

def main_optimized():
    """
    Train and compare the four loss / class-weight configurations.

    The data is loaded, scaled, windowed and split once; every configuration
    is then trained on the same splits, tuned on the validation set
    (decision threshold) and evaluated on the test set. Each trained model
    is saved as ``optimized_detector_<config>.keras`` in the working directory.

    NOTE(review): the scaler is fitted on all rows before splitting, so test
    statistics leak into the scaling. The validation set is the tail of the
    training indices, so it may contain few experiments; confirm it contains
    failures before trusting the selected threshold.

    Returns:
        Tuple ``(results, best_detector)``: ``results`` maps configuration
        name to its test metrics and threshold; ``best_detector`` is the
        trained detector of the configuration with the highest test F1.
    """
    print("=== OPTIMIZED FAILURE DETECTION TRAINING ===\n")

    # Configurations to compare
    configurations = [
        {"name": "Baseline", "focal_loss": False, "class_weights": False},
        {"name": "Class Weights Only", "focal_loss": False, "class_weights": True},
        {"name": "Focal Loss Only", "focal_loss": True, "class_weights": False},
        {"name": "Full Optimization", "focal_loss": True, "class_weights": True},
    ]

    sequence_length = 20
    lstm_units = 64

    # --- Load and prepare the data once; it is identical for every configuration ---
    data_detector = OptimizedFailureDetector(sequence_length=sequence_length, lstm_units=lstm_units)
    df = data_detector.load_and_label_data("unified")
    X, _ = data_detector.prepare_features(df)
    y = df['is_failure'].values
    experiments = df['experiment'].values

    X_scaled = data_detector.scaler.fit_transform(X)
    X_seq, y_seq, exp_info = data_detector.create_sequences(X_scaled, y, experiments)

    # Train / test split
    train_idx, test_idx = create_failure_aware_split(X_seq, y_seq, exp_info)
    X_train = X_seq[train_idx]
    y_train = y_seq[train_idx]
    X_test = X_seq[test_idx]
    y_test = y_seq[test_idx]

    # Validation split: last 20% of the training sequences
    val_split = int(len(X_train) * 0.8)
    X_train_final = X_train[:val_split]
    y_train_final = y_train[:val_split]
    X_val = X_train[val_split:]
    y_val = y_train[val_split:]

    print(f"\nDataset: {len(X_seq)} sequences, {y_seq.sum()} failures")
    print(f"Splits: Train={len(X_train_final)}, Val={len(X_val)}, Test={len(X_test)}")

    results = {}
    detectors = {}

    for config in configurations:
        print(f"\n{'='*80}")
        print(f"🧪 TESTING CONFIGURATION: {config['name'].upper()}")
        print(f"{'='*80}")

        detector = OptimizedFailureDetector(
            sequence_length=sequence_length,
            lstm_units=lstm_units,
            use_focal_loss=config['focal_loss'],
            use_class_weights=config['class_weights']
        )
        # Give every detector the fitted scaler so it can transform new data
        detector.scaler = data_detector.scaler

        # Build and train the model
        detector.build_optimized_model((X_seq.shape[1], X_seq.shape[2]))

        print(f"\n🏋️ Training {config['name']} model...")
        detector.train_optimized_model(
            X_train_final, y_train_final,
            X_val, y_val,
            epochs=25
        )

        # Select the decision threshold on the validation set
        optimal_threshold = detector.find_optimal_threshold(X_val, y_val)

        # Evaluate on the test set with that threshold
        acc, prec, rec, f1 = detector.evaluate_with_optimal_threshold(X_test, y_test)

        results[config['name']] = {
            'accuracy': acc,
            'precision': prec,
            'recall': rec,
            'f1': f1,
            'threshold': optimal_threshold
        }
        detectors[config['name']] = detector

        # Save the model
        model_name = f"optimized_detector_{config['name'].lower().replace(' ', '_')}.keras"
        detector.model.save(model_name)
        print(f"✅ Saved: {model_name}")

    # Final comparison
    print(f"\n{'='*80}")
    print("🏆 FINAL COMPARISON OF ALL CONFIGURATIONS")
    print(f"{'='*80}")

    comparison_df = pd.DataFrame.from_dict(results, orient='index')
    print(comparison_df.round(3))

    # Best configuration by test F1
    best_config = comparison_df['f1'].idxmax()
    best_f1 = comparison_df['f1'].max()

    print(f"\n🥇 BEST CONFIGURATION: {best_config}")
    print(f"   F1-Score: {best_f1:.3f}")
    print(f"   Recall: {comparison_df.loc[best_config, 'recall']:.3f}")
    print(f"   Precision: {comparison_df.loc[best_config, 'precision']:.3f}")

    # Return the detector of the best configuration, not merely the last one trained
    return results, detectors[best_config]

# Run the optimized training. main_optimized() returns a (results, detector)
# tuple, bound here to `results` and `best_detector` for use in later cells.
print("Starting optimized training with all techniques...")
results, best_detector = main_optimized()