In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from rich import print
from rich.panel import Panel
from rich.console import Console
from rich.table import Table
import matplotlib.pyplot as plt

In [2]:
# Preprocess data: separate the feature columns from the target label
def preprocess_data(df):
    """
    Separate a labelled dataframe into a feature matrix and a target vector.

    Parameters:
    -----------
    df : pandas.DataFrame
        Input frame containing the 'is_benign', 'attack' and 'category'
        columns alongside the feature columns. It is not modified.

    Returns:
    --------
    tuple
        (X, y): X is the frame without the three label columns, y is the
        'is_benign' column.
    """
    label_columns = ['is_benign', 'attack', 'category']

    # Work on a private copy so the caller's frame is never touched
    working = df.copy()

    # Everything that is not a label column is treated as a feature
    features = working.drop(columns=label_columns)
    target = working['is_benign']

    return features, target

In [None]:
def evaluate_model(y_true, y_pred, model_name="Model"):
    """
    Print a panel with core classification metrics and the confusion matrix.

    Accuracy plus weighted-average precision, recall and F1 are computed and
    rendered next to the confusion matrix in a single rich panel.

    Parameters:
    -----------
    y_true : array-like
        True labels
    y_pred : array-like
        Predicted labels
    model_name : str, optional
        Name of the model for display purposes

    Returns:
    --------
    dict
        Mapping of metric name ("Accuracy", "Precision", "Recall",
        "F1 Score") to its value.
    """
    console = Console()

    # Calculate core metrics
    metrics = {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred, average='weighted'),
        "Recall": recall_score(y_true, y_pred, average='weighted'),
        "F1 Score": f1_score(y_true, y_pred, average='weighted')
    }

    # Create metrics table
    table = Table(show_header=True, header_style="bold white", box=None)
    table.add_column("Metric", style="cyan")
    table.add_column("Score", justify="right")

    # Add metrics rows with color coding:
    # green above 0.8, yellow above 0.6, red otherwise
    for metric, value in metrics.items():
        color = "green" if value > 0.8 else "yellow" if value > 0.6 else "red"
        table.add_row(
            metric,
            f"[{color}]{value:.5f}[/{color}]"
        )

    # confusion_matrix puts the TRUE classes on the rows and the PREDICTED
    # classes on the columns, so the corner header must read
    # "row meaning \ column meaning" = "Actual \ Predicted".
    cm = confusion_matrix(y_true, y_pred)

    # Create confusion matrix table
    cm_table = Table(show_header=True, header_style="bold white", box=None)
    cm_table.add_column("Actual \\ Predicted", style="cyan")

    # "Class i" is the positional index of the class inside the matrix,
    # not necessarily the label value itself.
    for i in range(cm.shape[1]):
        cm_table.add_column(f"Class {i}", justify="right")

    for i in range(cm.shape[0]):
        row = [f"Class {i}"] + [str(cm[i, j]) for j in range(cm.shape[1])]
        cm_table.add_row(*row)

    # Create a combined panel with two equally wide columns
    combined_table = Table.grid(expand=True)
    combined_table.add_column(justify="center", ratio=1)
    combined_table.add_column(justify="center", ratio=1)

    combined_table.add_row(table, cm_table)

    # Create and display panel
    panel = Panel(
        combined_table,
        title=f"[bold]{model_name} - Performance Metrics and Confusion Matrix[/bold]",
        border_style="white"
    )

    console.print("\n", panel, "\n")
    return metrics

In [3]:
def train_xgboost(X, y):
    """
    Tune and train an XGBoost binary classifier, then report hold-out metrics.

    The data is split 70/30 (stratified on `y`). A grid search with 5-fold
    cross-validation, scored by ROC AUC, is run on the training part, and the
    best estimator is evaluated on the test part through `evaluate_model`.

    Parameters:
    -----------
    X : array-like
        Feature matrix.
    y : array-like
        Binary labels; 0 is counted as the negative class and 1 as the
        positive class when computing the class-balance weight.

    Returns:
    --------
    estimator
        The best estimator found by the grid search.

    Raises:
    -------
    ValueError
        If the training split contains no sample labelled 1, because the
        class-balance weight cannot be computed.
    """
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    # Count each class once, without materialising filtered copies
    train_labels = np.asarray(y_train)
    n_negative = int(np.count_nonzero(train_labels == 0))
    n_positive = int(np.count_nonzero(train_labels == 1))
    if n_positive == 0:
        # Fail with an actionable message instead of a bare ZeroDivisionError
        raise ValueError(
            "Cannot compute scale_pos_weight: the training split contains "
            "no samples labelled 1"
        )

    # Initial XGBoost model with balanced class weights
    model = xgb.XGBClassifier(
        objective='binary:logistic',
        scale_pos_weight=n_negative / n_positive,
        random_state=42,
        n_estimators=100,
        learning_rate=0.1
    )

    # Define parameter grid for optimization
    param_grid = {
        'max_depth': [3, 5, 7],
        'min_child_weight': [1, 3, 5],
        'gamma': [0, 0.1, 0.2],
        'subsample': [0.8, 0.9],
        'colsample_bytree': [0.8, 0.9]
    }

    # Perform grid search with cross-validation
    # NOTE(review): n_jobs=-1 here on top of XGBoost's own multi-threading may
    # oversubscribe the CPUs - consider limiting one of the two; confirm by timing.
    grid_search = GridSearchCV(
        model,
        param_grid,
        cv=5,
        scoring='roc_auc',
        n_jobs=-1,
        verbose=1
    )

    # Fit the model
    grid_search.fit(X_train, y_train)

    # Get best model
    best_model = grid_search.best_estimator_

    # Evaluate the model on the held-out split (prints the metrics panel)
    y_pred = best_model.predict(X_test)
    evaluate_model(y_test, y_pred, "XGBoost")

    return best_model

In [None]:
# Locations of the training data and of the serialized model
TRAIN_CSV_PATH = '/kaggle/input/dataset/train_binary.csv'
MODEL_OUTPUT_PATH = 'xgboost_network_traffic_model.json'

# Read the labelled training set from disk
df = pd.read_csv(TRAIN_CSV_PATH)

# Separate the feature matrix from the target labels
X, y = preprocess_data(df)

# Run the grid search and evaluation, then persist the winning model
best_model = train_xgboost(X, y)
best_model.save_model(MODEL_OUTPUT_PATH)