# Advanced SVM Threat Detection Model - Target: 0.99 Accuracy
## Network Flow Threat Classification using Advanced Machine Learning Techniques

**Project Goal:** Achieve 99% accuracy in threat detection using advanced SVM techniques

**Key Improvements:**
- Advanced feature engineering and selection
- Comprehensive outlier detection and removal
- Extensive hyperparameter optimization
- Multiple scaling strategies
- Advanced sampling techniques for class imbalance
- Ensemble methods

## 1. Import Required Libraries

In [None]:
# Data manipulation
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, StratifiedKFold

# Feature selection
from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_classif, RFE, VarianceThreshold
from sklearn.decomposition import PCA

# Models
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier, VotingClassifier

# Imbalanced data handling
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE, SVMSMOTE
from imblearn.combine import SMOTETomek

# Metrics and evaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, precision_score, recall_score

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Global plotting defaults: seaborn grid theme and a wide default figure.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# The original banner contained a mis-decoded check-mark emoji; restored here.
print("✅ All libraries imported successfully!")

## 2. Load and Initial Data Exploration

In [None]:
# Read the raw network-flow CSV into a DataFrame.
df = pd.read_csv("Dataset-Brief 1 Cyber.csv")

# Headline numbers: size, raw class balance and per-column statistics.
print(f"Dataset Shape: {df.shape}")
print("\nOriginal Label Distribution:")
print(df['Label'].value_counts())
print("\nBasic Statistics:")
print(df.describe())

# Data-quality counters: total NaN cells and fully duplicated rows.
missing_total = df.isnull().sum().sum()
duplicate_total = df.duplicated().sum()
print(f"\nMissing Values: {missing_total}")
print(f"Duplicate Rows: {duplicate_total}")

# Last expression of the cell: the notebook renders the first rows.
df.head()

## 3. Advanced Data Preprocessing

In [None]:
# Identifier columns are dropped before modelling.
id_columns = ['Flow ID', 'Src IP', 'Dst IP', 'Timestamp']
df_clean = df.drop(columns=id_columns)

# Split the frame into the target column and the feature matrix.
labels_original = df_clean['Label']
features = df_clean.drop(columns=['Label'])

# Columns that are zero in every row are removed.
all_zero_mask = (features == 0).all()
zero_cols = features.columns[all_zero_mask]
print(f"Columns with all zeros ({len(zero_cols)}): {list(zero_cols)}")
features = features.drop(columns=zero_cols)

# +/-inf become NaN so they are imputed together with genuine gaps,
# each column being filled with its own median.
features = features.replace([np.inf, -np.inf], np.nan)
nan_total = features.isnull().sum().sum()
if nan_total > 0:
    print(f"\nHandling {nan_total} NaN values...")
    features = features.fillna(features.median())

print(f"\nShape after initial cleaning: {features.shape}")
print(f"Features remaining: {features.shape[1]}")

## 4. Label Engineering - Create 3 Main Categories

In [None]:
def group_label(lbl):
    """
    Collapse a raw traffic label into one of three coarse categories.

    Returns 'Normal', 'Network_Attacks' or 'Malware_CodeAttacks' for the
    labels listed in the table below, and 'Unknown' for anything else.

    NOTE(review): 'Analysis' is bucketed with benign traffic here - confirm
    this is intended for this dataset.
    """
    # (category, raw labels belonging to it), checked in this order.
    category_members = (
        ('Normal', ('Benign', 'Analysis')),
        ('Network_Attacks', ('DoS', 'Exploits', 'Generic', 'Fuzzers', 'Reconnaissance')),
        ('Malware_CodeAttacks', ('Backdoor', 'Shellcode', 'Worms')),
    )
    for category, members in category_members:
        if lbl in members:
            return category
    return 'Unknown'

# Derive the coarse category of every row from its raw label.
df_clean['MainLabel'] = labels_original.map(group_label)

# Report the grouped class balance as absolute counts, then as percentages.
main_label = df_clean['MainLabel']
print("Label Distribution after Grouping:")
print(main_label.value_counts())
print("\nPercentage Distribution:")
print(main_label.value_counts(normalize=True) * 100)

## 5. Advanced Feature Engineering

In [None]:
# Work on a copy so the cleaned `features` frame stays untouched.
features_engineered = features.copy()

# 1. Ratio / rate features. Each is added only when both source columns
#    exist; the "+ 1" in the denominator avoids division by zero.
ratio_specs = {
    'Fwd_Bwd_Packet_Ratio': ('Total Fwd Packet', 'Total Bwd packets'),
    'Packet_Rate': ('Total Fwd Packet', 'Flow Duration'),
    'Bwd_Packet_Rate': ('Total Bwd packets', 'Flow Duration'),
}
for new_name, (numerator, denominator) in ratio_specs.items():
    if numerator in features.columns and denominator in features.columns:
        features_engineered[new_name] = features[numerator] / (features[denominator] + 1)

# 2. log1p copies of right-skewed columns (skew > 1). Only columns without
#    negative values are transformed.
skewed_features = features_engineered.columns[features_engineered.skew() > 1]
print(f"\nApplying log transformation to {len(skewed_features)} skewed features...")
for col in skewed_features:
    if (features_engineered[col] >= 0).all():
        features_engineered[f'{col}_log'] = np.log1p(features_engineered[col])

# 3. sqrt copies of the five highest-variance columns among those above the
#    90th variance percentile. The previous `[:5]` slice took the first five
#    in column order, which is not the "top 5" the cell intends; `nlargest`
#    ranks by variance explicitly.
column_variances = features_engineered.var()
high_variance = column_variances[column_variances > column_variances.quantile(0.9)]
variance_cols = high_variance.nlargest(5).index
for col in variance_cols:
    if (features_engineered[col] >= 0).all():
        features_engineered[f'{col}_sqrt'] = np.sqrt(features_engineered[col])

print(f"\nShape after feature engineering: {features_engineered.shape}")
print(f"Added {features_engineered.shape[1] - features.shape[1]} new features")

## 6. Outlier Detection and Removal

In [None]:
from sklearn.ensemble import IsolationForest

# Flag anomalous rows with an Isolation Forest configured for 5% contamination.
# NOTE(review): this runs on the full dataset before the train/test split, so
# rows are also removed from what later becomes the test set - confirm that
# this is acceptable for the reported accuracy.
print("Detecting outliers using Isolation Forest...")
iso_forest = IsolationForest(contamination=0.05, random_state=42, n_jobs=-1)
outlier_pred = iso_forest.fit_predict(features_engineered)

# Rows predicted as 1 are treated as inliers and kept.
inlier_mask = outlier_pred == 1
n_total = len(features_engineered)
n_outliers = (~inlier_mask).sum()
n_inliers = inlier_mask.sum()
print(f"\nOutliers detected: {n_outliers} ({n_outliers/n_total*100:.2f}%)")
print(f"Samples retained: {n_inliers} ({n_inliers/n_total*100:.2f}%)")

# Filter features and grouped labels with the same mask so they stay aligned.
features_clean = features_engineered.loc[inlier_mask]
labels_clean = df_clean.loc[inlier_mask, 'MainLabel']

print(f"\nFinal shape after outlier removal: {features_clean.shape}")
print("\nLabel distribution after outlier removal:")
print(labels_clean.value_counts())

## 7. Feature Selection - Remove Low Variance and Highly Correlated Features

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Step 1: drop columns whose variance falls under the 0.01 threshold, then
# rebuild a DataFrame so column names and the row index survive.
selector = VarianceThreshold(threshold=0.01)
kept_values = selector.fit_transform(features_clean)
kept_columns = features_clean.columns[selector.get_support()]
features_var = pd.DataFrame(kept_values, columns=kept_columns, index=features_clean.index)
n_after_variance = features_var.shape[1]
print(f"Features after variance threshold: {n_after_variance}")
print(f"Removed {features_clean.shape[1] - n_after_variance} low-variance features")

# Step 2: for every pair with |correlation| > 0.95, drop the later column.
# Only the strict upper triangle is inspected, so each pair is seen once.
corr_matrix = features_var.corr().abs()
above_diagonal = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper_triangle = corr_matrix.where(above_diagonal)
too_correlated = (upper_triangle > 0.95).any().to_numpy()
to_drop = upper_triangle.columns[too_correlated].tolist()

print(f"\nHighly correlated features to remove: {len(to_drop)}")
features_uncorr = features_var.drop(columns=to_drop)
print(f"Features after correlation filtering: {features_uncorr.shape[1]}")

## 8. Label Encoding and Train-Test Split

In [None]:
# Turn the category names into integer class ids.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels_clean)

print("Class mapping:")
for idx, name in enumerate(label_encoder.classes_):
    print(f"  {idx} = {name}")

# Stratified 75/25 split so both parts keep the class proportions.
split_options = dict(test_size=0.25, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(features_uncorr, y, **split_options)

n_train_rows, n_feature_cols = X_train.shape
print(f"\nTraining samples: {n_train_rows}")
print(f"Testing samples: {X_test.shape[0]}")
print(f"Features: {n_feature_cols}")

## 9. Advanced Feature Scaling - Test Multiple Scalers

In [None]:
# Candidate scalers, compared with a quick RBF SVM.
scalers = {
    'StandardScaler': StandardScaler(),
    'MinMaxScaler': MinMaxScaler(),
    'RobustScaler': RobustScaler()
}

scaler_results = {}

print("Testing different scalers with a quick SVM model...\n")

# The scaler is a modelling choice, so it is picked on a validation split
# carved out of the training data. Scoring candidates on X_test (as this cell
# used to do) lets the test set steer the pipeline and inflates the final
# reported accuracy.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train
)

for scaler_name, scaler in scalers.items():
    # Fit the scaler on the fitting part only, then apply it to validation.
    X_fit_scaled = scaler.fit_transform(X_fit)
    X_val_scaled = scaler.transform(X_val)

    quick_svm = SVC(kernel='rbf', C=10, gamma='scale', class_weight='balanced', random_state=42)
    quick_svm.fit(X_fit_scaled, y_fit)

    acc = accuracy_score(y_val, quick_svm.predict(X_val_scaled))
    scaler_results[scaler_name] = acc

    print(f"{scaler_name}: {acc:.4f}")

# Select best scaler by validation accuracy.
best_scaler_name = max(scaler_results, key=scaler_results.get)
best_scaler = scalers[best_scaler_name]

print(f"\n✅ Best scaler: {best_scaler_name} with validation accuracy {scaler_results[best_scaler_name]:.4f}")

# Refit the winner on the full training split; the test split is only transformed.
X_train_scaled = best_scaler.fit_transform(X_train)
X_test_scaled = best_scaler.transform(X_test)

## 10. Advanced Feature Selection with SelectKBest

In [None]:
from functools import partial

# Candidate feature counts; 'all' means no SelectKBest step.
k_values = [30, 40, 50, 60, 70, 'all']
k_results = {}

print("Testing different numbers of features using SelectKBest...\n")

# mutual_info_classif is randomized unless seeded. Pinning random_state makes
# the selector scored in the loop and the final selector agree on which
# columns they keep.
mi_scorer = partial(mutual_info_classif, random_state=42)
n_available = X_train_scaled.shape[1]

# k is tuned on a validation split of the training data, never on the test
# set, so the final test accuracy stays an unbiased estimate.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train_scaled, y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train
)

for k in k_values:
    if k == 'all':
        X_fit_selected, X_val_selected = X_fit, X_val
    else:
        # k is capped at the number of available columns.
        selector = SelectKBest(mi_scorer, k=min(k, n_available))
        X_fit_selected = selector.fit_transform(X_fit, y_fit)
        X_val_selected = selector.transform(X_val)

    quick_svm = SVC(kernel='rbf', C=10, gamma='scale', class_weight='balanced', random_state=42)
    quick_svm.fit(X_fit_selected, y_fit)

    acc = accuracy_score(y_val, quick_svm.predict(X_val_selected))
    k_results[k] = acc

    print(f"k={k}: {acc:.4f}")

# Select best k by validation accuracy.
best_k = max(k_results, key=k_results.get)
print(f"\n✅ Best k: {best_k} with validation accuracy {k_results[best_k]:.4f}")

# Refit the chosen selection on the full training split and apply it to test.
if best_k == 'all':
    X_train_final = X_train_scaled
    X_test_final = X_test_scaled
    selected_features = features_uncorr.columns.tolist()
else:
    selector_final = SelectKBest(mi_scorer, k=min(best_k, n_available))
    X_train_final = selector_final.fit_transform(X_train_scaled, y_train)
    X_test_final = selector_final.transform(X_test_scaled)
    selected_features = features_uncorr.columns[selector_final.get_support()].tolist()

print(f"\nFinal feature count: {X_train_final.shape[1]}")
# The list is in original column order, not ranked by score.
print(f"\nTop 10 selected features:")
for i, feat in enumerate(selected_features[:10], 1):
    print(f"{i}. {feat}")

## 11. Handle Class Imbalance - Test Multiple SMOTE Variants

In [None]:
# Candidate over-samplers for the minority classes.
smote_variants = {
    'SMOTE': SMOTE(random_state=42, k_neighbors=5),
    'BorderlineSMOTE': BorderlineSMOTE(random_state=42, k_neighbors=5),
    'SVMSMOTE': SVMSMOTE(random_state=42, k_neighbors=5),
    'ADASYN': ADASYN(random_state=42, n_neighbors=5)
}

smote_results = {}

print("Testing different SMOTE variants...\n")

# Variants are compared on a validation split of the training data. Only the
# fitting part is resampled; the validation part keeps its real distribution.
# Scoring on X_test here would let the test set pick the sampler.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train_final, y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train
)

for smote_name, smote in smote_variants.items():
    try:
        X_fit_resampled, y_fit_resampled = smote.fit_resample(X_fit, y_fit)
    except (ValueError, RuntimeError) as e:
        # Samplers reject data they cannot resample (e.g. too few minority
        # samples for the requested neighbours); skip that variant.
        print(f"{smote_name}: Failed - {str(e)}")
        continue

    quick_svm = SVC(kernel='rbf', C=10, gamma='scale', class_weight='balanced', random_state=42)
    quick_svm.fit(X_fit_resampled, y_fit_resampled)

    acc = accuracy_score(y_val, quick_svm.predict(X_val))
    smote_results[smote_name] = acc

    print(f"{smote_name}: {acc:.4f} (Samples: {X_fit_resampled.shape[0]})")

# Select best SMOTE variant by validation accuracy.
if smote_results:
    best_smote_name = max(smote_results, key=smote_results.get)
    best_smote = smote_variants[best_smote_name]
    print(f"\n✅ Best SMOTE variant: {best_smote_name} with validation accuracy {smote_results[best_smote_name]:.4f}")

    # Resample the full training split with the winner.
    X_train_resampled, y_train_resampled = best_smote.fit_resample(X_train_final, y_train)

    print(f"\nClass distribution after {best_smote_name}:")
    unique, counts = np.unique(y_train_resampled, return_counts=True)
    for cls, count in zip(unique, counts):
        print(f"  Class {label_encoder.classes_[cls]}: {count}")
else:
    print("\nNo SMOTE variant succeeded, using original training data")
    # Define these on the fallback path too: the summary cell reads
    # best_smote_name and would otherwise raise NameError.
    best_smote_name = 'None'
    best_smote = None
    X_train_resampled = X_train_final
    y_train_resampled = y_train

## 12. Extensive Hyperparameter Tuning - RandomizedSearchCV followed by GridSearchCV

In [None]:
# Phase 1: Randomized Search for broad exploration
print("Phase 1: Randomized Search for initial hyperparameter exploration...\n")

from scipy.stats import uniform, loguniform

# Search space. The ten numeric gamma candidates are drawn once, up front;
# the draw is seeded so the candidate list (and therefore the whole search)
# is reproducible between runs.
gamma_candidates = list(loguniform(0.0001, 1).rvs(10, random_state=42))

random_param_dist = {
    'C': loguniform(0.1, 1000),
    'gamma': ['scale', 'auto'] + gamma_candidates,
    'kernel': ['rbf', 'poly', 'sigmoid'],
    'class_weight': ['balanced', None],
    'degree': [2, 3, 4]  # Only used for poly kernel
}

random_search = RandomizedSearchCV(
    estimator=SVC(random_state=42),
    param_distributions=random_param_dist,
    n_iter=50,  # Number of parameter combinations to try
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42,
    scoring='accuracy'
)

random_search.fit(X_train_resampled, y_train_resampled)

print(f"\n✅ Best parameters from RandomizedSearch: {random_search.best_params_}")
print(f"✅ Best CV score: {random_search.best_score_:.4f}")

# Report (only) how the refit best estimator does on the held-out test split.
y_pred_random = random_search.best_estimator_.predict(X_test_final)
acc_random = accuracy_score(y_test, y_pred_random)
print(f"✅ Test accuracy: {acc_random:.4f}")

In [None]:
# Phase 2: Grid Search for fine-tuning around best parameters
print("\nPhase 2: Grid Search for fine-tuning hyperparameters...\n")

# Centre of the fine grid: the winner of the randomized search.
best_kernel = random_search.best_params_['kernel']
best_C = random_search.best_params_['C']
best_gamma = random_search.best_params_['gamma']

# gamma: when the winner is a keyword ('scale'/'auto') try a few fixed
# numeric values as well; otherwise bracket the numeric winner.
if isinstance(best_gamma, str):
    gamma_range = ['scale', 'auto', 0.001, 0.01, 0.1]
else:
    gamma_range = [best_gamma/10, best_gamma/2, best_gamma, best_gamma*2, best_gamma*10, 'scale', 'auto']

# C: one decade either side of the winner.
C_range = [best_C / d for d in (10, 5, 2)] + [best_C] + [best_C * m for m in (2, 5, 10)]

grid_param = {
    'C': C_range,
    'gamma': gamma_range,
    'kernel': [best_kernel],
    'class_weight': ['balanced']
}

# degree only matters for the polynomial kernel.
if best_kernel == 'poly':
    grid_param['degree'] = [2, 3, 4, 5]

grid_search = GridSearchCV(
    estimator=SVC(random_state=42),
    param_grid=grid_param,
    cv=5,  # More folds for better validation
    verbose=2,
    n_jobs=-1,
    scoring='accuracy'
)

grid_search.fit(X_train_resampled, y_train_resampled)

print(f"\n✅ Best parameters from GridSearch: {grid_search.best_params_}")
print(f"✅ Best CV score: {grid_search.best_score_:.4f}")

# The refit best estimator becomes the single-SVM candidate for later cells.
best_svm = grid_search.best_estimator_
y_pred_grid = best_svm.predict(X_test_final)
acc_grid = accuracy_score(y_test, y_pred_grid)
print(f"✅ Test accuracy: {acc_grid:.4f}")

## 13. Ensemble Method - Bagging for Additional Boost

In [None]:
# Bag ten copies of the tuned SVM, each trained on 80% of the rows (sampled
# with replacement) and 80% of the columns.
print("Creating Bagging ensemble with best SVM...\n")

bagging_svm = BaggingClassifier(
    estimator=best_svm,
    n_estimators=10,
    max_samples=0.8,
    max_features=0.8,
    bootstrap=True,
    random_state=42,
    n_jobs=-1
)

bagging_svm.fit(X_train_resampled, y_train_resampled)

# Score the ensemble on the held-out test split.
y_pred_bagging = bagging_svm.predict(X_test_final)
acc_bagging = accuracy_score(y_test, y_pred_bagging)

print(f"✅ Bagging SVM accuracy: {acc_bagging:.4f}")

# Keep whichever candidate scores higher; a tie goes to the single SVM.
if acc_bagging > acc_grid:
    print(f"\n🎉 Bagging improved accuracy by {(acc_bagging - acc_grid)*100:.2f}%")
    final_model = bagging_svm
    final_pred = y_pred_bagging
    final_accuracy = acc_bagging
else:
    print(f"\n✅ Single SVM performs better, using it as final model")
    final_model = best_svm
    final_pred = y_pred_grid
    final_accuracy = acc_grid

## 14. Comprehensive Model Evaluation

In [None]:
# Map integer class ids back to category names for readable reports.
y_test_labels = label_encoder.inverse_transform(y_test)
y_pred_labels = label_encoder.inverse_transform(final_pred)
class_names = list(label_encoder.classes_)

print("="*80)
print("FINAL MODEL PERFORMANCE")
print("="*80)
print(f"\n🎯 ACCURACY: {final_accuracy:.4f} ({final_accuracy*100:.2f}%)")
print(f"\n📊 Classification Report:\n")
print(classification_report(
    y_test_labels,
    y_pred_labels,
    labels=class_names,
    digits=4
))

# Support-weighted averages across classes, kept as globals because the
# summary and save cells read them.
precision = precision_score(y_test, final_pred, average='weighted')
recall = recall_score(y_test, final_pred, average='weighted')
f1 = f1_score(y_test, final_pred, average='weighted')

print(f"\n📈 Weighted Metrics:")
print(f"  Precision: {precision:.4f}")
print(f"  Recall: {recall:.4f}")
print(f"  F1-Score: {f1:.4f}")

## 15. Cross-Validation for Robust Evaluation

In [None]:
from imblearn.pipeline import make_pipeline as make_imb_pipeline
from scipy import stats

# Perform stratified k-fold cross-validation
print("Performing 10-fold stratified cross-validation...\n")

# Cross-validate on the real (un-resampled) training data and let the
# over-sampler run inside each fold. Validating on data that was resampled
# beforehand scores the model on synthetic points interpolated from rows in
# the fitting folds, which overstates accuracy.
cv_steps = ([best_smote] if smote_results else []) + [best_svm]
cv_pipeline = make_imb_pipeline(*cv_steps)

cv_scores = cross_val_score(
    cv_pipeline,  # cloned per fold, so best_svm itself is left untouched
    X_train_final,
    y_train,
    cv=10,
    scoring='accuracy',
    n_jobs=-1
)

# 95% interval for the mean accuracy: Student-t critical value times the
# standard error of the fold scores (mean +/- 1.96*std describes the spread
# of individual folds, not the uncertainty of the mean).
cv_mean = cv_scores.mean()
cv_sem = cv_scores.std(ddof=1) / np.sqrt(len(cv_scores))
ci_half_width = stats.t.ppf(0.975, df=len(cv_scores) - 1) * cv_sem

print(f"Cross-Validation Scores: {cv_scores}")
print(f"\n📊 CV Statistics:")
print(f"  Mean Accuracy: {cv_mean:.4f}")
print(f"  Std Deviation: {cv_scores.std():.4f}")
print(f"  Min Accuracy: {cv_scores.min():.4f}")
print(f"  Max Accuracy: {cv_scores.max():.4f}")
print(f"  95% Confidence Interval: [{cv_mean - ci_half_width:.4f}, {cv_mean + ci_half_width:.4f}]")

## 16. Confusion Matrix Visualization

In [None]:
# Raw counts: rows are actual categories, columns are predicted categories.
cm = confusion_matrix(y_test_labels, y_pred_labels, labels=class_names)

# Same matrix with every row rescaled to percentages of that actual class.
cm_percent = cm.astype('float') / cm.sum(axis=1, keepdims=True) * 100

# One heatmap per view: (matrix, annotation format, colour-bar label, title).
heatmap_specs = [
    (cm, "d", 'Count', "Confusion Matrix - Advanced SVM Model"),
    (cm_percent, ".2f", 'Percentage (%)', "Confusion Matrix (Normalized) - Advanced SVM Model"),
]

for matrix, number_format, colorbar_label, heading in heatmap_specs:
    plt.figure(figsize=(10, 8))
    sns.heatmap(
        matrix,
        annot=True,
        fmt=number_format,
        cmap="Blues",
        xticklabels=class_names,
        yticklabels=class_names,
        cbar_kws={'label': colorbar_label}
    )

    plt.title(f"{heading}\nAccuracy: {final_accuracy:.4f}", fontsize=14, fontweight='bold')
    plt.xlabel("Predicted Category", fontsize=12)
    plt.ylabel("Actual Category", fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()

## 17. Performance Metrics Visualization

In [None]:
# Classification report as a nested dict; `report` is reused by the summary cell.
report = classification_report(
    y_test_labels,
    y_pred_labels,
    labels=class_names,
    output_dict=True
)

# One row per class, one column per metric.
metric_keys = {'Precision': 'precision', 'Recall': 'recall', 'F1-Score': 'f1-score'}
metrics_df = pd.DataFrame(
    {title: [report[cls][key] for cls in class_names] for title, key in metric_keys.items()},
    index=class_names
)

# Grouped bar chart: three bars (precision / recall / F1) per class.
fig, ax = plt.subplots(figsize=(12, 6))
metrics_df.plot(kind='bar', ax=ax, width=0.8)

plt.title(f"Performance Metrics by Class - Advanced SVM Model\nOverall Accuracy: {final_accuracy:.4f}",
          fontsize=14, fontweight='bold')
plt.xlabel("Threat Category", fontsize=12)
plt.ylabel("Score", fontsize=12)
plt.ylim(0, 1.05)
plt.legend(title='Metrics', fontsize=10)
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

# Same numbers as a table.
print("\n📊 Metrics Summary by Class:")
print(metrics_df.round(4))

## 18. Feature Importance Analysis

In [None]:
# Permutation importance is used here to rank features for the tuned SVM:
# it measures how much accuracy drops when one feature column is shuffled.
from sklearn.inspection import permutation_importance

print("Calculating feature importance using permutation importance...\n")

# Work on at most 2000 test rows to keep the computation short. The rows are
# drawn from a seeded generator so the ranking is reproducible between runs.
sample_size = min(2000, X_test_final.shape[0])
sample_rng = np.random.default_rng(42)
sample_indices = sample_rng.choice(X_test_final.shape[0], size=sample_size, replace=False)

perm_importance = permutation_importance(
    best_svm,
    X_test_final[sample_indices],
    y_test[sample_indices],
    n_repeats=10,
    random_state=42,
    n_jobs=-1
)

# selected_features is in the same column order as X_test_final, so the
# names line up with the importance arrays.
importance_df = pd.DataFrame({
    'feature': selected_features,
    'importance': perm_importance.importances_mean,
    'std': perm_importance.importances_std
}).sort_values('importance', ascending=False)

# Horizontal bar chart of the 20 largest mean importances with std error bars.
plt.figure(figsize=(12, 8))
top_features = importance_df.head(20)
plt.barh(range(len(top_features)), top_features['importance'], xerr=top_features['std'])
plt.yticks(range(len(top_features)), top_features['feature'])
plt.xlabel('Importance (Decrease in Accuracy)', fontsize=12)
plt.ylabel('Feature', fontsize=12)
plt.title('Top 20 Most Important Features - Permutation Importance', fontsize=14, fontweight='bold')
plt.gca().invert_yaxis()  # most important feature at the top
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

print("\nTop 20 Most Important Features:")
print(importance_df.head(20).to_string(index=False))

## 19. Model Summary and Results

In [None]:
# Text recap of the whole run: data sizes, pipeline choices, model and scores.
print("="*80)
print("ADVANCED SVM THREAT DETECTION MODEL - FINAL SUMMARY")
print("="*80)

print("\n📊 DATASET INFORMATION:")
print(f"  Original samples: {df.shape[0]}")
print(f"  Samples after outlier removal: {features_clean.shape[0]}")
print(f"  Original features: {df.shape[1] - 1}")
print(f"  Features after engineering: {features_engineered.shape[1]}")
print(f"  Final selected features: {X_train_final.shape[1]}")

# best_smote_name only exists when at least one over-sampler succeeded;
# fall back to a placeholder instead of raising NameError.
smote_summary = best_smote_name if smote_results else 'None (original training data)'

print("\n🔧 PREPROCESSING PIPELINE:")
print(f"  1. Removed identifier columns: Flow ID, Src IP, Dst IP, Timestamp")
print(f"  2. Removed zero-variance columns: {len(zero_cols)}")
print(f"  3. Feature engineering: Added ratio, log, and sqrt features")
print(f"  4. Outlier removal: Isolation Forest (5% contamination)")
print(f"  5. Variance threshold filtering")
print(f"  6. Correlation-based feature removal (threshold: 0.95)")
print(f"  7. Best scaler: {best_scaler_name}")
print(f"  8. Feature selection: SelectKBest (k={best_k})")
print(f"  9. Best SMOTE variant: {smote_summary}")

print("\n🎯 MODEL CONFIGURATION:")
print(f"  Best parameters: {grid_search.best_params_}")
# Identity check: final_model is one of the two candidate objects.
print(f"  Model type: {'Bagging SVM Ensemble' if final_model is bagging_svm else 'Single SVM'}")

print("\n🏆 PERFORMANCE METRICS:")
print(f"  Test Accuracy: {final_accuracy:.4f} ({final_accuracy*100:.2f}%)")
print(f"  Cross-Validation Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
print(f"  Weighted Precision: {precision:.4f}")
print(f"  Weighted Recall: {recall:.4f}")
print(f"  Weighted F1-Score: {f1:.4f}")

print("\n📈 PER-CLASS PERFORMANCE:")
for cls in class_names:
    cls_report = report[cls]
    print(f"  {cls}:")
    print(f"    Precision: {cls_report['precision']:.4f}")
    print(f"    Recall: {cls_report['recall']:.4f}")
    print(f"    F1-Score: {cls_report['f1-score']:.4f}")
    print(f"    Support: {int(cls_report['support'])}")

# Compare against the 0.99 target and print next steps when it is missed.
if final_accuracy >= 0.99:
    print("\n" + "="*80)
    print("🎉🎉🎉 SUCCESS! TARGET ACCURACY OF 0.99 ACHIEVED! 🎉🎉🎉")
    print("="*80)
else:
    print(f"\n⚠️ Current accuracy: {final_accuracy:.4f}")
    print(f"   Gap to target: {(0.99 - final_accuracy)*100:.2f}%")
    print("\n💡 Recommendations for further improvement:")
    print("   1. Collect more training data")
    print("   2. Try deep learning models (Neural Networks)")
    print("   3. Ensemble with other models (XGBoost, Random Forest)")
    print("   4. More advanced feature engineering")
    print("   5. Fine-tune class weights based on misclassifications")

## 20. Save Model and Results

In [None]:
import joblib
from datetime import datetime

# One timestamp shared by every artifact of this run.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Save the model
model_filename = f"svm_threat_detection_model_{final_accuracy:.4f}_{timestamp}.pkl"
joblib.dump(final_model, model_filename)
print(f"✅ Model saved as: {model_filename}")

# Save the scaler
scaler_filename = f"scaler_{timestamp}.pkl"
joblib.dump(best_scaler, scaler_filename)
print(f"✅ Scaler saved as: {scaler_filename}")

# The scaler was fitted on every column of features_uncorr, i.e. before
# SelectKBest. Persist that column order, and the fitted selector when one
# was used, so the saved scaler and model can be applied to new data.
scaler_features_filename = f"scaler_input_features_{timestamp}.txt"
with open(scaler_features_filename, 'w', encoding='utf-8') as f:
    f.writelines(f"{col}\n" for col in features_uncorr.columns)
print(f"✅ Scaler input features saved as: {scaler_features_filename}")

if best_k != 'all':
    selector_filename = f"feature_selector_{timestamp}.pkl"
    joblib.dump(selector_final, selector_filename)
    print(f"✅ Feature selector saved as: {selector_filename}")

# Save the label encoder
encoder_filename = f"label_encoder_{timestamp}.pkl"
joblib.dump(label_encoder, encoder_filename)
print(f"✅ Label encoder saved as: {encoder_filename}")

# Save selected features (one name per line, in model input order)
features_filename = f"selected_features_{timestamp}.txt"
with open(features_filename, 'w', encoding='utf-8') as f:
    f.writelines(f"{feat}\n" for feat in selected_features)
print(f"✅ Selected features saved as: {features_filename}")

# Save performance summary. Explicit UTF-8 keeps the "±" sign portable
# regardless of the platform's default encoding.
summary_filename = f"performance_summary_{final_accuracy:.4f}_{timestamp}.txt"
with open(summary_filename, 'w', encoding='utf-8') as f:
    f.write("ADVANCED SVM THREAT DETECTION MODEL - PERFORMANCE SUMMARY\n")
    f.write("="*80 + "\n\n")
    f.write(f"Accuracy: {final_accuracy:.4f}\n")
    f.write(f"Cross-Validation Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}\n")
    f.write(f"Precision: {precision:.4f}\n")
    f.write(f"Recall: {recall:.4f}\n")
    f.write(f"F1-Score: {f1:.4f}\n\n")
    f.write(f"Best Parameters: {grid_search.best_params_}\n\n")
    f.write("Classification Report:\n")
    f.write(classification_report(y_test_labels, y_pred_labels, labels=class_names))

print(f"✅ Performance summary saved as: {summary_filename}")

print("\n" + "="*80)
print("ALL RESULTS SAVED SUCCESSFULLY!")
print("="*80)

print("\n" + "="*80)
print("ALL RESULTS SAVED SUCCESSFULLY!")
print("="*80)