# KDD99 Decision Tree Analysis with Scikit-Learn

**Comparing Custom Implementation with SKLearn**

This notebook implements the same KDD99 decision tree analysis using scikit-learn's optimized implementation, allowing us to compare performance and results with our custom CART implementation.


## Imports and Setup


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import time
import warnings
warnings.filterwarnings('ignore')

# Notebook banner: the title line followed by a 60-character rule.
print("üöÄ KDD99 Decision Tree Analysis with Scikit-Learn", "=" * 60, sep="\n")


## Data Loading and Preprocessing

Loading the same KDD99 dataset used in the original analysis.


In [None]:
# Load the KDD99 dataset (the 10% subset archive).
# NOTE(review): TensorFlow/Keras is imported here only for its get_file
# download helper; see the Keras documentation for where the file is stored.
from tensorflow.keras.utils import get_file

try:
    path = get_file('kddcup.data_10_percent.gz',
                    origin='http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz')
except Exception as exc:
    # Catch Exception rather than using a bare except so Ctrl-C and
    # interpreter shutdown are not reported as download failures. Show the
    # cause, then re-raise so the notebook stops here instead of failing in a
    # later cell with an undefined `path`.
    print(f'Error downloading: {exc}')
    raise

print(f"Dataset path: {path}")


In [None]:
# Load and setup DataFrame
# The file is read with header=None, so the columns initially carry integer
# labels; descriptive names are assigned right below.
pd_data_frame = pd.read_csv(path, header=None)

# Add column names
# 42 names in file order. The last one, 'outcome', is the class label that the
# later cells filter on and use as the prediction target.
pd_data_frame.columns = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in',
    'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
    'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login',
    'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate',
    'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
    'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'outcome'
]

# Report the raw size before any cleaning or filtering.
print(f"üìä Dataset original: {pd_data_frame.shape}")
print(f"üìä Columnas: {len(pd_data_frame.columns)}")


In [None]:
# Clean data: discard every column that contains a missing value, then remove
# duplicate rows, keeping the first occurrence of each.
pd_data_frame = pd_data_frame.dropna(axis=1).drop_duplicates(keep='first')

# Keep only the two classes studied here ('normal.' and 'back.'); the copy
# makes filtered_df independent of pd_data_frame.
filtered_df = pd_data_frame.loc[pd_data_frame['outcome'].isin(['normal.', 'back.'])].copy()
print(f"üìä Dataset filtrado (normal + backdoor): {filtered_df.shape}")

# Show how many rows each class has after filtering.
print(f"üìä Distribuci√≥n de clases:")
print(filtered_df['outcome'].value_counts())


In [None]:
# One-hot encode the nominal (categorical) features.
list_nominal_features = ["flag", "protocol_type", "service"]
df_encoded = pd.get_dummies(filtered_df, columns=list_nominal_features)

# Cast every boolean indicator column to integer with a single astype call.
df_encoded = df_encoded.astype(
    {name: int for name in df_encoded.columns if df_encoded[name].dtype == 'bool'}
)

print(f"üìä Dataset despu√©s de encoding: {df_encoded.shape}")
print(f"üìä Distribuci√≥n de clases final:")
print(df_encoded['outcome'].value_counts())


## Data Preparation for SKLearn


In [None]:
def prepare_sklearn_dataset(df_encoded):
    """
    Split the encoded frame into a feature matrix and an integer target.

    The target is encoded so that 'normal.' is class 0 and every other label
    follows in alphabetical order (so 'back.' is class 1 when those are the
    only two labels). The code that plots and reports results labels the
    classes as ['Normal', 'Backdoor'], i.e. it expects exactly this order.

    Parameters
    ----------
    df_encoded : pandas.DataFrame
        Frame holding the feature columns plus an 'outcome' label column.

    Returns
    -------
    X : pandas.DataFrame
        Every column of ``df_encoded`` except 'outcome'.
    y_encoded : numpy.ndarray
        Integer class index for each row (int64).
    label_encoder : LabelEncoder
        Encoder whose ``classes_`` follow the order described above, so
        ``label_encoder.classes_[i]`` is the original label of class ``i``.
    feature_names : list
        Column names of ``X``.
    """
    # Separate features and target.
    X = df_encoded.drop('outcome', axis=1)
    y = df_encoded['outcome']

    # Encode target: normal. -> 0, back. -> 1.
    # A fitted LabelEncoder orders classes alphabetically, which would give
    # back. -> 0 and normal. -> 1 and silently swap the class names used in
    # the plots and reports. Build the order explicitly with 'normal.' first.
    ordered_classes = sorted(pd.unique(y), key=lambda label: (label != 'normal.', label))
    class_to_index = {label: index for index, label in enumerate(ordered_classes)}
    y_encoded = y.map(class_to_index).to_numpy(dtype=np.int64)

    # Still hand back a LabelEncoder for callers that expect one; assigning
    # classes_ directly makes it agree with the mapping above.
    label_encoder = LabelEncoder()
    label_encoder.classes_ = np.array(ordered_classes, dtype=object)

    print(f"üî¢ Features shape: {X.shape}")
    print(f"üî¢ Target distribution: {np.bincount(y_encoded)}")
    print(f"üî¢ Class mapping: {class_to_index}")

    return X, y_encoded, label_encoder, X.columns.tolist()

# Build the modelling inputs once; X, y and feature_names are consumed by the
# evaluation and visualization cells below.
X, y, label_encoder, feature_names = prepare_sklearn_dataset(df_encoded)


## Single Evaluation Function with SKLearn


In [None]:
def evaluate_sklearn_tree(X, y, max_depth, min_samples_leaf=2, test_size=0.3, random_state=42):
    """
    Fit one scikit-learn decision tree (Gini criterion) and score it.

    Parameters
    ----------
    X : feature matrix accepted by scikit-learn (e.g. a DataFrame)
    y : array-like
        Class labels aligned with the rows of ``X``.
    max_depth : int
        Maximum depth passed to ``DecisionTreeClassifier``.
    min_samples_leaf : int, default 2
        Minimum samples per leaf passed to ``DecisionTreeClassifier``.
    test_size : float or int, default 0.3
        Forwarded to ``train_test_split`` (stratified on ``y``) when greater
        than 0. With 0 no split is made: the tree is trained and evaluated on
        the full data, so the scores describe fit to the training data.
    random_state : int, default 42
        Seed used for both the split and the tree.

    Returns
    -------
    dict
        Keys: 'model' (fitted classifier), 'accuracy', 'f1_score' (macro
        average), 'training_time' and 'evaluation_time' (seconds),
        'train_size', 'test_size' (row counts), 'y_test', 'y_pred'.
    """
    # Stratified train/test split when test_size > 0, otherwise reuse the
    # whole dataset for both fitting and scoring.
    if test_size > 0:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state, stratify=y
        )
    else:
        X_train, X_test, y_train, y_test = X, X, y, y

    # Build the model.
    clf = DecisionTreeClassifier(
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        random_state=random_state,
        criterion='gini'  # Same impurity measure as the custom implementation
    )

    # Train. perf_counter is a monotonic high-resolution clock, so very short
    # fits are not reported as 0 and clock adjustments cannot skew the result.
    start_time = time.perf_counter()
    clf.fit(X_train, y_train)
    training_time = time.perf_counter() - start_time

    # Predict.
    start_time = time.perf_counter()
    y_pred = clf.predict(X_test)
    evaluation_time = time.perf_counter() - start_time

    # Metrics.
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')

    return {
        'model': clf,
        'accuracy': accuracy,
        'f1_score': f1,
        'training_time': training_time,
        'evaluation_time': evaluation_time,
        'train_size': len(X_train),
        'test_size': len(X_test),
        'y_test': y_test,
        'y_pred': y_pred
    }


## Part 1: Same Dataset for Training and Testing

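Equivalent to our custom implementation's first evaluation: each tree is trained and scored on the full dataset, so these figures measure fit to the training data rather than generalization.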


In [None]:
print("="*60)
print("PART 1: SAME DATASET FOR TRAINING AND TESTING")
print("="*60)

# Maps max_depth -> the result dict returned by evaluate_sklearn_tree.
results_part1 = {}

for max_depth in [3, 4]:
    print(f"\nüå≥ Evaluating Decision Tree with max_depth={max_depth}")
    
    # test_size=0 makes evaluate_sklearn_tree skip the split and score the
    # tree on the same rows it was fitted on.
    result = evaluate_sklearn_tree(X, y, max_depth=max_depth, min_samples_leaf=2, test_size=0)
    results_part1[max_depth] = result
    
    print(f"   üìä Results:")
    print(f"      ‚Ä¢ Accuracy: {result['accuracy']:.4f} ({result['accuracy']*100:.2f}%)")
    print(f"      ‚Ä¢ F1-Score (macro): {result['f1_score']:.4f}")
    print(f"      ‚Ä¢ Training time: {result['training_time']:.4f} seconds")
    print(f"      ‚Ä¢ Evaluation time: {result['evaluation_time']:.4f} seconds")
    # Depth and leaf count actually reached by the fitted tree.
    print(f"      ‚Ä¢ Tree depth: {result['model'].get_depth()}")
    print(f"      ‚Ä¢ Number of leaves: {result['model'].get_n_leaves()}")


In [None]:
# Comparison table Part 1: one row per metric, one column per depth.
print(f"\nüìä COMPARISON TABLE - PART 1")
print("-" * 50)
print(f"{'Metric':<20} {'Depth 3':<15} {'Depth 4':<15}")
print("-" * 50)

# Floating-point metrics read straight from the result dicts.
for _label, _key in (
    ('Accuracy', 'accuracy'),
    ('F1-Score', 'f1_score'),
    ('Train Time (s)', 'training_time'),
    ('Eval Time (s)', 'evaluation_time'),
):
    print(f"{_label:<20} {results_part1[3][_key]:<15.4f} {results_part1[4][_key]:<15.4f}")

# Integer tree statistics queried from the fitted models.
for _label, _getter in (('Tree Depth', 'get_depth'), ('Num Leaves', 'get_n_leaves')):
    _depth3_value = getattr(results_part1[3]['model'], _getter)()
    _depth4_value = getattr(results_part1[4]['model'], _getter)()
    print(f"{_label:<20} {_depth3_value:<15} {_depth4_value:<15}")


## Part 2: 10 Random Splits (70%-30%)

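Equivalent to our custom implementation's second evaluation. Each depth is evaluated on 10 independent stratified 70%/30% random train/test splits (repeated hold-out, not k-fold cross-validation), and the mean and standard deviation of each metric are reported.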


In [None]:
def multiple_runs_sklearn(X, y, max_depth, n_runs=10, min_samples_leaf=2,
                          test_size=0.3, base_random_state=42):
    """
    Evaluate a tree on several random train/test splits and summarize the runs.

    Run ``i`` (0-based) calls ``evaluate_sklearn_tree`` with
    ``random_state=base_random_state + i``, so the whole experiment is
    reproducible while each run uses a different split.

    Parameters
    ----------
    X, y : features and labels forwarded to ``evaluate_sklearn_tree``.
    max_depth : int
        Maximum tree depth.
    n_runs : int, default 10
        Number of independent runs; must be at least 1.
    min_samples_leaf : int, default 2
        Forwarded to ``evaluate_sklearn_tree``.
    test_size : float, default 0.3
        Test fraction used for every split.
    base_random_state : int, default 42
        Seed of the first run.

    Returns
    -------
    dict
        Mean and standard deviation of accuracy, F1, training time and
        evaluation time ('<metric>_mean' / '<metric>_std'), plus 'best_idx'
        (0-based index of the run with the highest F1), 'best_model',
        'best_f1' and 'all_results' (the per-run result dicts).

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 1 (there would be nothing to summarize).
    """
    if n_runs < 1:
        raise ValueError(f"n_runs must be at least 1, got {n_runs}")

    results = []

    print(f"üîÑ Running {n_runs} evaluations with max_depth={max_depth}")

    for run in range(n_runs):
        result = evaluate_sklearn_tree(
            X, y,
            max_depth=max_depth,
            min_samples_leaf=min_samples_leaf,
            test_size=test_size,
            random_state=base_random_state + run
        )
        results.append(result)
        print(f"   Run {run+1:2d}: Acc={result['accuracy']:.3f}, F1={result['f1_score']:.3f}")

    # Collect the per-run metrics.
    accuracies = [r['accuracy'] for r in results]
    f1_scores = [r['f1_score'] for r in results]
    train_times = [r['training_time'] for r in results]
    eval_times = [r['evaluation_time'] for r in results]

    # Best run by F1-score; converted to a plain int so it can be used as a
    # list index and printed without NumPy scalar formatting.
    best_idx = int(np.argmax(f1_scores))

    return {
        'accuracy_mean': np.mean(accuracies),
        'accuracy_std': np.std(accuracies),
        'f1_mean': np.mean(f1_scores),
        'f1_std': np.std(f1_scores),
        'train_time_mean': np.mean(train_times),
        'train_time_std': np.std(train_times),
        'eval_time_mean': np.mean(eval_times),
        'eval_time_std': np.std(eval_times),
        'best_idx': best_idx,
        'best_model': results[best_idx]['model'],
        'best_f1': f1_scores[best_idx],
        'all_results': results
    }


In [None]:
print("="*60)
print("PART 2: 10 RANDOM SPLITS (70% TRAIN - 30% TEST)")
print("="*60)

# Execute multiple runs
# results_part2 maps max_depth -> the summary dict returned by
# multiple_runs_sklearn (mean/std metrics, best run, and all per-run results).
results_part2 = {}
for max_depth in [2, 3]:
    results_part2[max_depth] = multiple_runs_sklearn(X, y, max_depth=max_depth, n_runs=10)


In [None]:
# Results table Part 2: mean and standard deviation of each metric per depth.
print(f"\nüìä RESULTS TABLE - PART 2 (Mean ¬± Std)")
print("="*70)
print(f"{'Metric':<20} {'Depth 2':<25} {'Depth 3':<25}")
print("-"*70)

for metric in ['accuracy', 'f1', 'train_time', 'eval_time']:
    depth2_mean = results_part2[2][f'{metric}_mean']
    depth2_std = results_part2[2][f'{metric}_std']
    depth3_mean = results_part2[3][f'{metric}_mean']
    depth3_std = results_part2[3][f'{metric}_std']

    metric_name = metric.replace('_', ' ').title()
    if 'time' in metric:
        metric_name += ' (s)'

    # Build each "mean / std" cell first and pad the finished string. A
    # replacement field accepts only one format spec, so chaining precision
    # and alignment as ".4f:<12" raises ValueError at runtime.
    depth2_cell = f"{depth2_mean:.4f} ¬± {depth2_std:.4f}"
    depth3_cell = f"{depth3_mean:.4f} ¬± {depth3_std:.4f}"
    # Same column widths as the header row above.
    print(f"{metric_name:<20} {depth2_cell:<25} {depth3_cell:<25}")

# Best runs summary (run numbers are shown 1-based).
print(f"\nüèÜ BEST RUNS SUMMARY:")
for depth in [2, 3]:
    best_idx = results_part2[depth]['best_idx']
    best_f1 = results_part2[depth]['best_f1']
    print(f"   ‚Ä¢ Depth {depth}: Run {best_idx+1} with F1-Score = {best_f1:.4f}")


## Tree Visualization

Visualizing the best decision trees from our analysis.


In [None]:
def visualize_best_tree(model, feature_names, title, max_depth):
    """
    Plot a fitted decision tree and print its rules as text.

    Parameters
    ----------
    model : fitted tree passed to ``plot_tree`` and ``export_text``.
    feature_names : list of feature names used to label the split nodes.
    title : text used in the figure title and the text header.
    max_depth : value shown in the figure title only.

    The printed rule text is cut to its first 1000 characters followed by
    "..." when it is longer than that.

    NOTE(review): the class names are fixed to ['Normal', 'Backdoor'], which
    presumably must line up with encoded labels 0 and 1 in that order;
    confirm against the label encoding used to build ``y``.
    """
    plt.figure(figsize=(20, 10))
    plot_options = dict(
        feature_names=feature_names,
        class_names=['Normal', 'Backdoor'],
        filled=True,
        rounded=True,
        fontsize=10,
    )
    plot_tree(model, **plot_options)
    plt.title(f'{title} (Max Depth: {max_depth})', fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.show()

    # Text representation, truncated so large trees do not flood the output.
    print(f"\nüìù Text representation of {title}:")
    print("-" * 50)
    tree_rules = export_text(model, feature_names=feature_names, show_weights=True)
    if len(tree_rules) > 1000:
        print(tree_rules[:1000] + "...")
    else:
        print(tree_rules)

print(f"\nüå≥ TREE VISUALIZATIONS")
print("="*50)

# Visualize best trees from Part 2
# For each depth, 'best_model' is the tree from the run with the highest F1.
for depth in [2, 3]:
    best_model = results_part2[depth]['best_model']
    best_f1 = results_part2[depth]['best_f1']
    visualize_best_tree(
        best_model, 
        feature_names, 
        f'Best Tree Depth {depth} (F1={best_f1:.3f})', 
        depth
    )


## Feature Importance Analysis

Analyzing which features are most important for the decision trees.


In [None]:
def analyze_feature_importance(model, feature_names, title):
    """
    Rank the model's features by importance, print the top 10 and plot the top 15.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_``.
    feature_names : list of names, indexed like ``feature_importances_``.
    title : label appended to the printed header and the plot title.

    Returns
    -------
    indices : numpy.ndarray
        Feature indices sorted by decreasing importance.
    importances : numpy.ndarray
        The model's ``feature_importances_`` in original feature order.
    """
    importances = model.feature_importances_
    indices = np.argsort(importances)[::-1]

    # Ranked listing of the ten most important features (fewer if there are
    # not that many).
    print(f"\nüìä Feature Importance - {title}")
    print("-" * 40)
    for rank, idx in enumerate(indices[:10], start=1):
        print(f"{rank:2d}. {feature_names[idx]:<30} {importances[idx]:.4f}")

    # Horizontal bar chart of the top features, most important at the top.
    top_features = min(15, len(indices))
    top_indices = indices[:top_features]
    positions = range(top_features)
    plt.figure(figsize=(12, 8))
    plt.barh(positions, importances[top_indices])
    plt.yticks(positions, [feature_names[idx] for idx in top_indices])
    plt.xlabel('Feature Importance')
    plt.title(f'Top {top_features} Feature Importances - {title}')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

    return indices, importances

# Analyze feature importance for best models
# feature_importance_results maps depth -> {'indices', 'importances'} as
# returned by analyze_feature_importance for that depth's best Part 2 model.
feature_importance_results = {}
for depth in [2, 3]:
    indices, importances = analyze_feature_importance(
        results_part2[depth]['best_model'], 
        feature_names, 
        f'Depth {depth}'
    )
    feature_importance_results[depth] = {'indices': indices, 'importances': importances}


## Performance Comparison and Analysis

Analysis of advantages and differences between SKLearn and custom implementation.


In [None]:
def plot_confusion_matrix(y_true, y_pred, title):
    """
    Draw the confusion matrix as a heatmap and print the classification report.

    Parameters
    ----------
    y_true : array-like of true class indices (0 or 1).
    y_pred : array-like of predicted class indices (0 or 1).
    title : label appended to the plot title and the report header.

    The label set is pinned to [0, 1] so the matrix is always 2x2 and the
    report always has two rows, even when one class is missing from both
    ``y_true`` and ``y_pred``. Without this, the two fixed class names could
    be attached to the wrong cells or rejected by ``classification_report``.

    NOTE(review): the names assume index 0 is normal traffic and index 1 is
    the backdoor attack; confirm against the label encoding used for ``y``.
    """
    class_ids = [0, 1]
    class_labels = ['Normal', 'Backdoor']

    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_labels,
                yticklabels=class_labels)
    plt.title(f'Confusion Matrix - {title}')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.show()

    # Print classification report with the same label order as the heatmap.
    print(f"\nüìä Classification Report - {title}")
    print("-" * 40)
    print(classification_report(y_true, y_pred, labels=class_ids, target_names=class_labels))

# Show confusion matrices for best models
print("üîç CONFUSION MATRIX ANALYSIS")
print("="*40)

for depth in [2, 3]:
    # Look up the full result dict of the best run to get its test labels and
    # predictions (the summary itself only stores the best model and F1).
    best_result = results_part2[depth]['all_results'][results_part2[depth]['best_idx']]
    plot_confusion_matrix(
        best_result['y_test'], 
        best_result['y_pred'], 
        f'Best Depth {depth} Model'
    )


In [None]:
# Static, qualitative comparison text; nothing here is computed from the
# results of the cells above.
print(f"‚öñÔ∏è  COMPARISON: SKLEARN vs CUSTOM IMPLEMENTATION")
print("="*60)

print(f"üîç Key Observations:")
print(f"   ‚Ä¢ SKLearn trees are typically faster due to optimized C implementation")
print(f"   ‚Ä¢ Both should achieve similar accuracy/F1 scores with same parameters")
print(f"   ‚Ä¢ SKLearn provides built-in feature importance and visualization")
print(f"   ‚Ä¢ Custom implementation gives more control over splitting criteria")

print(f"\nüí° Advantages of SKLearn:")
print(f"   ‚úÖ Much faster training and inference")
print(f"   ‚úÖ Built-in pruning and optimization")
print(f"   ‚úÖ Extensive visualization tools")
print(f"   ‚úÖ Feature importance calculation")
print(f"   ‚úÖ Well-tested and optimized")
print(f"   ‚úÖ Cross-validation utilities")

print(f"\nüí° Advantages of Custom Implementation:")
print(f"   ‚úÖ Full control over splitting logic")
print(f"   ‚úÖ Custom stopping criteria")
print(f"   ‚úÖ Educational value - understanding internals")
print(f"   ‚úÖ Ability to modify Gini calculation")
print(f"   ‚úÖ Custom node features and XML export")
print(f"   ‚úÖ Integration with PyTorch tensors")


## Final Results Summary

Comprehensive summary of the analysis and conclusions.


In [None]:
print(f"\nüéØ FINAL RESULTS SUMMARY")
print("="*50)

# Find overall best configuration by mean F1 across the Part 2 runs.
# On a tie the shallower tree is chosen (>=): with equal performance the
# simpler model is the better performance-vs-complexity trade-off.
best_depth = 2 if results_part2[2]['f1_mean'] >= results_part2[3]['f1_mean'] else 3
best_stats = results_part2[best_depth]

print(f"üèÜ BEST OVERALL CONFIGURATION:")
print(f"   ‚Ä¢ Best Depth: {best_depth}")
print(f"   ‚Ä¢ Mean Accuracy: {best_stats['accuracy_mean']:.4f} ¬± {best_stats['accuracy_std']:.4f}")
print(f"   ‚Ä¢ Mean F1-Score: {best_stats['f1_mean']:.4f} ¬± {best_stats['f1_std']:.4f}")
print(f"   ‚Ä¢ Mean Training Time: {best_stats['train_time_mean']:.4f}s ¬± {best_stats['train_time_std']:.4f}s")

print(f"\nüìä COMPARISON BETWEEN DEPTHS:")
# Absolute differences between the depth-3 and depth-2 means from Part 2.
acc_diff = abs(results_part2[3]['accuracy_mean'] - results_part2[2]['accuracy_mean'])
f1_diff = abs(results_part2[3]['f1_mean'] - results_part2[2]['f1_mean'])
time_diff = abs(results_part2[3]['train_time_mean'] - results_part2[2]['train_time_mean'])

print(f"   ‚Ä¢ Accuracy difference: {acc_diff:.4f}")
print(f"   ‚Ä¢ F1-Score difference: {f1_diff:.4f}")
print(f"   ‚Ä¢ Training time difference: {time_diff:.4f}s")

# Static proposal text; the Jensen-Shannon distance is not computed in the
# cells of this notebook shown above.
print(f"\nüí° OPTIMIZATION PROPOSAL WITH JENSEN-SHANNON DISTANCE:")
print("-" * 60)
print(f"The Jensen-Shannon distance from Part 1 analysis could be used for:")
print(f"   1. üéØ Pre-select most discriminative features before training")
print(f"   2. ‚ö° Reduce search space in feature selection")
print(f"   3. üîç Prioritize splits on features with higher class separability")
print(f"   4. ‚öôÔ∏è  Implement early stopping based on low JS distances")
print(f"   5. üìä Use JS as alternative criterion to Gini for more informative splits")

# Closing remarks; only best_depth is interpolated, the rest is fixed text.
print(f"\nüéâ CONCLUSIONS:")
print(f"   ‚Ä¢ SKLearn Decision Trees provide excellent performance on KDD99 dataset")
print(f"   ‚Ä¢ {best_depth}-depth trees show optimal balance of performance vs complexity")
print(f"   ‚Ä¢ Feature importance analysis reveals key discriminative features")
print(f"   ‚Ä¢ Results are comparable to custom implementation with better efficiency")

print(f"\n‚úÖ SKLearn Decision Tree Analysis Complete!")
print(f"üìÑ This notebook demonstrates how the same KDD99 analysis can be")
print(f"   performed using scikit-learn's optimized implementation.")
