In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import xgboost as xgb
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from typing import Tuple
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
def preprocess_data_multiclass(
    df: pd.DataFrame, column: str = 'category'
) -> Tuple[pd.DataFrame, pd.DataFrame, np.ndarray, np.ndarray, LabelEncoder]:
    """
    Preprocesses the input DataFrame for a multiclass classification task.

    The target column is label-encoded to integer codes, every label column
    (including the requested target) is removed from the feature matrix, and
    the rows are split 70/30 into training and validation sets with a fixed
    random seed.

    Args:
        df (pd.DataFrame): The input DataFrame. It is not modified.
        column (str, optional): The target column. Defaults to 'category'.
    Returns:
        Tuple[pd.DataFrame, pd.DataFrame, np.ndarray, np.ndarray, LabelEncoder]:
            Training features, validation features, training labels,
            validation labels, and the fitted LabelEncoder.
    Raises:
        KeyError: If `column`, 'is_benign', 'attack' or 'category' is missing
            from `df`.
    """
    # Encode categorical target
    le = LabelEncoder()
    y = le.fit_transform(df[column])

    # Drop every label column. The requested target is always included so it
    # can never leak into the features when `column` is not one of the three
    # known label columns.
    label_columns = ['is_benign', 'attack', 'category']
    if column not in label_columns:
        label_columns.append(column)

    # `drop` returns a new frame, so no defensive copy of `df` is needed.
    X = df.drop(columns=label_columns)
    X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.7, random_state=37)

    return X_train, X_val, y_train, y_val, le

def evaluate_model(y_true, y_pred, model_name="Model", class_names=None):
    """
    Print comprehensive model evaluation metrics with both rich text output and seaborn heatmap.

    Parameters:
    -----------
    y_true : array-like
        True labels
    y_pred : array-like
        Predicted labels
    model_name : str, optional
        Name of the model for display purposes
    class_names : list, optional
        List of class names for axis labels. When the labels are integer
        codes in ``range(len(class_names))``, the confusion matrix is built
        over all of those codes so that every name gets a row and a column
        even if a class is absent from ``y_true`` and ``y_pred``.

    Returns:
    --------
    dict
        Maps "Accuracy", "Precision", "Recall" and "F1 Score" to their values.
        Precision, recall and F1 are weighted averages over the classes.
    """
    console = Console()

    # Calculate core metrics. zero_division=0 yields the same value as the
    # default for classes that are never predicted, without the warning.
    metrics = {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred, average='weighted', zero_division=0),
        "Recall": recall_score(y_true, y_pred, average='weighted', zero_division=0),
        "F1 Score": f1_score(y_true, y_pred, average='weighted', zero_division=0)
    }

    # Create metrics table
    table = Table(show_header=True, header_style="bold white", box=None)
    table.add_column("Metric", style="cyan")
    table.add_column("Score", justify="right")

    # Add metrics rows with color coding
    for metric, value in metrics.items():
        color = "green" if value > 0.8 else "yellow" if value > 0.6 else "red"
        table.add_row(
            metric,
            f"[{color}]{value:.5f}[/{color}]"
        )

    # Keep the matrix aligned with class_names: if the labels look like
    # indices into class_names, ask for one row/column per name. Otherwise
    # fall back to the labels actually observed in the data.
    labels = None
    if class_names is not None:
        observed = np.unique(np.concatenate([np.ravel(y_true), np.ravel(y_pred)]))
        labels_are_codes = (
            observed.size > 0
            and np.issubdtype(observed.dtype, np.integer)
            and observed.min() >= 0
            and observed.max() < len(class_names)
        )
        if labels_are_codes:
            labels = np.arange(len(class_names))

    # Calculate the confusion matrix and normalize each row by its total.
    # Rows with no true samples stay at 0 instead of becoming NaN.
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_normalized = np.divide(
        cm.astype(float),
        row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals != 0
    )

    if class_names is None:
        class_names = [f"Class {i}" for i in range(cm.shape[0])]

    # Create heatmap
    fig, ax = plt.subplots(figsize=(20, 10))
    sns.heatmap(
        cm_normalized,
        annot=True,
        fmt='.2f',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
        ax=ax
    )
    ax.set_title(f'{model_name} - Normalized Confusion Matrix')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')

    # Display metrics table
    console.print("\n")
    panel = Panel(
        table,
        title=f"[bold]{model_name} - Performance Metrics[/bold]",
        border_style="white"
    )
    console.print(panel)
    console.print("\n")

    # Show the plot
    fig.tight_layout()
    plt.show()

    return metrics

def train_xgboost(df):
    """
    Grid-search, train and evaluate a multiclass XGBoost classifier.

    The data is split by preprocess_data_multiclass using its default target
    column. Hyperparameters are selected by 5-fold cross-validated accuracy on
    the training split, and the selected model is evaluated on the held-out
    split via evaluate_model (metrics table plus confusion-matrix heatmap).

    Args:
        df (pd.DataFrame): Input data containing the features and the label
            columns expected by preprocess_data_multiclass.
    Returns:
        The best estimator found by the grid search.
    """
    X_train, X_test, y_train, y_test, le = preprocess_data_multiclass(df)

    model = xgb.XGBClassifier(
        objective='multi:softprob',
        # The encoder has seen every class in the data, so its class list
        # gives the number of classes regardless of how the split fell.
        num_class=len(le.classes_),
        random_state=42,
        n_estimators=100,
        learning_rate=0.1
    )

    # 3 * 3 * 3 * 2 * 2 = 108 candidates, each fitted on 5 folds.
    param_grid = {
        'max_depth': [3, 5, 7],
        'min_child_weight': [1, 3, 5],
        'gamma': [0, 0.1, 0.2],
        'subsample': [0.8, 0.9],
        'colsample_bytree': [0.8, 0.9]
    }

    grid_search = GridSearchCV(
        model,
        param_grid,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
        verbose=1
    )

    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    y_pred = best_model.predict(X_test)
    # Keyword arguments keep the model name and class names in the right
    # slots; the encoder itself is not an argument of evaluate_model.
    evaluate_model(
        y_test,
        y_pred,
        model_name="XGBoost Multiclass",
        class_names=le.classes_
    )

    return best_model

In [None]:
# Usage
# Load the training data from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/dataset/train_sel_hclust.csv')

# Run the grid search, then print the metrics and show the confusion-matrix
# heatmap for the selected model.
best_model = train_xgboost(df)
# Persist the selected model next to the notebook.
best_model.save_model('xgboost_multiclass.json')