# SVM Network Threat Detection - Targeting 99% Accuracy
## Simple and Effective Approach

**Goal:** Achieve 99% accuracy using SVM with focused, proven techniques

**Strategy:**
1. Clean data properly
2. Select best features
3. Handle class imbalance
4. Optimize SVM hyperparameters
5. Validate thoroughly

## Step 1: Import Libraries and Load Data

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

# Reaching this line means every import above resolved.
# The check-mark was stored mis-decoded (UTF-8 bytes read as Mac Roman); restored here.
print("✅ Libraries loaded successfully!")

In [None]:
# Read the CSV into a DataFrame; the path is relative to the working directory
DATASET_PATH = "Dataset-Brief 1 Cyber.csv"
df = pd.read_csv(DATASET_PATH)

# Sanity check: table dimensions and how often each raw label occurs
dataset_shape = df.shape
print(f"Dataset shape: {dataset_shape}")
print("\nLabel distribution:")
raw_label_counts = df['Label'].value_counts()
print(raw_label_counts)

# Final expression of the cell: the notebook renders the first rows
df.head()

## Step 2: Data Preprocessing

In [None]:
# Flow identifiers, endpoint addresses and timestamps are dropped as non-predictive
non_predictive_columns = ['Flow ID', 'Src IP', 'Dst IP', 'Timestamp']
df_clean = df.drop(columns=non_predictive_columns)

# Everything except the target column is a candidate feature
features = df_clean.drop(columns=['Label'])

# A column whose every value is 0 cannot help separate the classes
is_all_zero = (features == 0).all()
zero_cols = features.columns[is_all_zero]
print(f"Removing {len(zero_cols)} zero-only columns: {list(zero_cols)}")
features = features.drop(columns=zero_cols)

# Treat +/-inf as missing, then impute every gap with that column's median
features = features.replace([np.inf, -np.inf], np.nan)
column_medians = features.median()
features = features.fillna(column_medians)

print(f"\nCleaned features shape: {features.shape}")

## Step 3: Group Labels into 3 Categories

In [None]:
# Group the raw traffic labels into 3 main categories (unlisted labels fall back to 'Unknown')
def group_label(lbl):
    """Map a raw traffic label to its coarse category.

    Returns 'Normal', 'Network_Attacks' or 'Malware_CodeAttacks' for the
    labels listed below, and 'Unknown' for anything else (the match is
    exact and case-sensitive).
    """
    # NOTE(review): 'Analysis' is grouped with 'Benign' as 'Normal' — confirm
    # this is intended by the dataset brief and not a mislabelled attack class.
    category_members = (
        ('Normal', ('Benign', 'Analysis')),
        ('Network_Attacks',
         ('DoS', 'Exploits', 'Generic', 'Fuzzers', 'Reconnaissance')),
        ('Malware_CodeAttacks', ('Backdoor', 'Shellcode', 'Worms')),
    )
    for category, members in category_members:
        if lbl in members:
            return category
    return 'Unknown'

# Translate every raw label into its coarse category
raw_labels = df_clean['Label']
labels = raw_labels.map(group_label)

# Absolute counts per category
print("Label distribution after grouping:")
category_counts = labels.value_counts()
print(category_counts)

# The same distribution expressed as percentages
print("\nPercentages:")
category_percentages = labels.value_counts(normalize=True) * 100
print(category_percentages)

## Step 4: Remove Outliers (Key Improvement!)

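Removing outliers can improve accuracy by filtering out noisy samples. Note that the filter below runs before the train/test split, so the held-out test set is filtered as well.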

In [None]:
from sklearn.ensemble import IsolationForest

# Flag roughly 5% of the rows as outliers with an Isolation Forest.
# NOTE(review): this is fitted on the full feature table, before any
# train/test split, so rows are also removed from what later becomes the
# test set — confirm that is acceptable for the reported accuracy.
print("Detecting outliers...")
iso_forest = IsolationForest(contamination=0.05, random_state=42, n_jobs=-1)
outlier_pred = iso_forest.fit_predict(features)

# fit_predict marks inliers with 1; keep only those rows
inlier_mask = outlier_pred == 1
outlier_count = (~inlier_mask).sum()
outlier_percent = outlier_count / len(features) * 100

features_clean = features.loc[inlier_mask]
labels_clean = labels.loc[inlier_mask]

kept_rows, kept_columns = features_clean.shape
print(f"Removed {outlier_count} outliers ({outlier_percent:.2f}%)")
print(f"Clean dataset: {kept_rows} samples, {kept_columns} features")

## Step 5: Feature Selection - Keep Only Best Features

Using too many features adds noise. We'll select the most important ones.

In [None]:
# Encode the category names as integers for scikit-learn
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(labels_clean)

print("Class mapping:")
for idx, name in enumerate(label_encoder.classes_):
    print(f"  {idx} = {name}")

# Keep up to 50 features ranked by the ANOVA F-test. The request is capped at
# the number of available columns so a narrower table cannot break the selector.
# NOTE(review): the selector is fitted on all rows, before the train/test
# split in the next step, so the test rows influence which features are kept.
n_features_to_keep = min(50, features_clean.shape[1])
print("\nSelecting best features...")
selector = SelectKBest(f_classif, k=n_features_to_keep)
X_selected = selector.fit_transform(features_clean, y_encoded)

# Names of the retained columns, in their original column order
support_mask = selector.get_support()
selected_features = features_clean.columns[support_mask].tolist()

# Rank the retained columns by F-score so "Top 10" really shows the ten
# strongest features rather than the first ten in column order.
# NaN scores (e.g. from constant columns) are pushed to the bottom.
selected_scores = np.nan_to_num(selector.scores_[support_mask], nan=-np.inf)
ranked_features = [
    feat for feat, _ in sorted(
        zip(selected_features, selected_scores),
        key=lambda pair: pair[1],
        reverse=True,
    )
]

print(f"Selected {X_selected.shape[1]} best features")
print("\nTop 10 features:")
for i, feat in enumerate(ranked_features[:10], 1):
    print(f"  {i}. {feat}")

## Step 6: Train-Test Split

In [None]:
# Hold out 25% of the rows for testing; stratifying on the encoded labels
# keeps the class proportions the same in both parts
split_parts = train_test_split(
    X_selected,
    y_encoded,
    test_size=0.25,
    random_state=42,
    stratify=y_encoded,
)
X_train, X_test, y_train, y_test = split_parts

n_train_rows = X_train.shape[0]
n_test_rows = X_test.shape[0]
print(f"Training samples: {n_train_rows}")
print(f"Testing samples: {n_test_rows}")

## Step 7: Feature Scaling

SVM requires scaled features for best performance.

In [None]:
# Standardize features (mean=0, std=1).
# The scaler learns its statistics from the training rows only and then
# applies that same transformation to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# The check-mark was stored mis-decoded (UTF-8 bytes read as Mac Roman); restored here.
print("✅ Features scaled successfully")

## Step 8: Handle Class Imbalance with SMOTE

SMOTE creates synthetic samples for minority classes.

In [None]:
# Oversample the scaled training rows with SMOTE; the test rows are not touched
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)

# Report how many rows each class has after resampling
print("Class distribution after SMOTE:")
class_ids, class_sizes = np.unique(y_train_balanced, return_counts=True)
for class_id, class_size in zip(class_ids, class_sizes):
    class_name = label_encoder.classes_[class_id]
    print(f"  {class_name}: {class_size}")

total_balanced_rows = X_train_balanced.shape[0]
print(f"\nTotal training samples after SMOTE: {total_balanced_rows}")

## Step 9: Hyperparameter Tuning with GridSearchCV

Finding the best SVM parameters is crucial for 99% accuracy.

In [None]:
# Cell-local import, in the same style as the notebook's other late imports
from imblearn.pipeline import Pipeline as ImbPipeline

# SMOTE runs inside the pipeline so that, within cross-validation, it is fitted
# on the training part of each split only. Searching on data that was
# resampled beforehand puts synthetic points derived from validation rows into
# the training folds, which inflates the cross-validation score.
svm_pipeline = ImbPipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('svc', SVC(random_state=42)),
])

# Define parameter grid (the "svc__" prefix routes each setting to the SVC step)
param_grid = {
    'svc__C': [10, 50, 100, 200, 500],
    'svc__gamma': ['scale', 'auto', 0.001, 0.01, 0.1],
    'svc__kernel': ['rbf'],
    'svc__class_weight': ['balanced'],
}

# Count combinations from the grid itself so the message stays correct
# whenever a parameter list is edited
n_combinations = int(np.prod([len(values) for values in param_grid.values()]))

print("Starting GridSearchCV...")
print(f"Testing {n_combinations} parameter combinations")
print("This may take 5-10 minutes...\n")

# Grid search with 5-fold cross-validation
grid_search = GridSearchCV(
    estimator=svm_pipeline,
    param_grid=param_grid,
    cv=5,
    verbose=2,
    n_jobs=-1,
    scoring='accuracy'
)

# Fit on the scaled but NOT yet resampled training rows; the pipeline resamples per fold
grid_search.fit(X_train_scaled, y_train)

# The check-marks were stored mis-decoded (UTF-8 bytes read as Mac Roman); restored here.
print(f"\n✅ Best parameters: {grid_search.best_params_}")
print(f"✅ Best cross-validation score: {grid_search.best_score_:.4f}")

## Step 10: Evaluate Model Performance

In [None]:
# Get best model and predict
best_svm = grid_search.best_estimator_
y_pred = best_svm.predict(X_test_scaled)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

print("="*60)
print("MODEL PERFORMANCE")
print("="*60)
print(f"\nüéØ ACCURACY: {accuracy:.4f} ({accuracy*100:.2f}%)")

if accuracy >= 0.99:
    print("\nüéâ SUCCESS! Achieved 99%+ accuracy!")
else:
    print(f"\n‚ö†Ô∏è Current: {accuracy*100:.2f}% | Target: 99.00%")
    print(f"   Gap: {(0.99 - accuracy)*100:.2f}%")

In [None]:
# Detailed classification report
y_test_labels = label_encoder.inverse_transform(y_test)
y_pred_labels = label_encoder.inverse_transform(y_pred)
class_names = list(label_encoder.classes_)

print("\nüìä CLASSIFICATION REPORT:\n")
print(classification_report(y_test_labels, y_pred_labels, labels=class_names, digits=4))

## Step 11: Confusion Matrix Visualization

In [None]:
# Create confusion matrix
cm = confusion_matrix(y_test_labels, y_pred_labels, labels=class_names)

# Plot
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=class_names, yticklabels=class_names)
plt.title(f'Confusion Matrix\nAccuracy: {accuracy:.4f}', fontsize=14, fontweight='bold')
plt.xlabel('Predicted', fontsize=12)
plt.ylabel('Actual', fontsize=12)
plt.tight_layout()
plt.show()

# Normalized confusion matrix (percentages)
cm_percent = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] * 100

plt.figure(figsize=(10, 8))
sns.heatmap(cm_percent, annot=True, fmt='.2f', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.title(f'Normalized Confusion Matrix (%)\nAccuracy: {accuracy:.4f}', 
          fontsize=14, fontweight='bold')
plt.xlabel('Predicted', fontsize=12)
plt.ylabel('Actual', fontsize=12)
plt.tight_layout()
plt.show()

## Step 12: Cross-Validation for Robust Evaluation

In [None]:
# 10-fold cross-validation
print("Performing 10-fold cross-validation...\n")
cv_scores = cross_val_score(best_svm, X_train_balanced, y_train_balanced, 
                            cv=10, scoring='accuracy', n_jobs=-1)

print(f"Cross-validation scores: {cv_scores}")
print(f"\nMean CV Accuracy: {cv_scores.mean():.4f}")
print(f"Std Deviation: {cv_scores.std():.4f}")
print(f"Min: {cv_scores.min():.4f} | Max: {cv_scores.max():.4f}")

## Step 13: Performance Metrics Visualization

In [None]:
# Get metrics for each class
from sklearn.metrics import precision_score, recall_score, f1_score

# Calculate per-class metrics
report = classification_report(y_test_labels, y_pred_labels, 
                              labels=class_names, output_dict=True)

metrics_df = pd.DataFrame({
    'Precision': [report[cls]['precision'] for cls in class_names],
    'Recall': [report[cls]['recall'] for cls in class_names],
    'F1-Score': [report[cls]['f1-score'] for cls in class_names]
}, index=class_names)

# Plot
fig, ax = plt.subplots(figsize=(12, 6))
metrics_df.plot(kind='bar', ax=ax, width=0.8)
plt.title(f'Performance Metrics by Class\nOverall Accuracy: {accuracy:.4f}', 
          fontsize=14, fontweight='bold')
plt.xlabel('Threat Category', fontsize=12)
plt.ylabel('Score', fontsize=12)
plt.ylim(0, 1.05)
plt.legend(title='Metrics', fontsize=10)
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

print("\nMetrics Summary:")
print(metrics_df.round(4))

## Step 14: Final Summary

In [None]:
# Recap of the whole run: data sizes, techniques, best parameters and scores.
# Emoji and the plus-minus sign in the messages below were stored mis-decoded
# (UTF-8 bytes read as Mac Roman); they are restored to the intended characters.
print("="*70)
print("FINAL MODEL SUMMARY")
print("="*70)

print("\n📊 DATASET:")
print(f"  Original samples: {df.shape[0]}")
print(f"  After outlier removal: {features_clean.shape[0]}")
print(f"  Features selected: {X_selected.shape[1]} (from {features.shape[1]})")

print("\n🔧 TECHNIQUES APPLIED:")
print("  1. Removed outliers with Isolation Forest (5%)")
print("  2. Selected top 50 features using ANOVA F-test")
print("  3. Standardized features with StandardScaler")
print("  4. Balanced classes with SMOTE")
print("  5. Optimized SVM with GridSearchCV")

print("\n🎯 BEST PARAMETERS:")
for param, value in grid_search.best_params_.items():
    print(f"  {param}: {value}")

print("\n🏆 PERFORMANCE:")
print(f"  Test Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"  CV Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

print("\n📈 PER-CLASS RESULTS:")
for cls in class_names:
    print(f"  {cls}:")
    print(f"    Precision: {report[cls]['precision']:.4f}")
    print(f"    Recall: {report[cls]['recall']:.4f}")
    print(f"    F1-Score: {report[cls]['f1-score']:.4f}")

# Verdict against the 99% target, with tuning suggestions when it is missed
if accuracy >= 0.99:
    print("\n" + "="*70)
    print("🎉 SUCCESS! TARGET OF 99% ACCURACY ACHIEVED! 🎉")
    print("="*70)
else:
    print(f"\n⚠️ Current accuracy: {accuracy*100:.2f}%")
    print(f"   Target: 99.00%")
    print(f"   Gap: {(0.99 - accuracy)*100:.2f}%")
    print("\n💡 To improve further:")
    print("   - Try different C values: [100, 500, 1000]")
    print("   - Adjust gamma: [0.001, 0.01, 0.1, 1]")
    print("   - Experiment with k in SelectKBest: [40, 50, 60]")
    print("   - Try polynomial kernel: kernel='poly', degree=3")

## Step 15: Save Model (Optional)

In [None]:
import joblib
from datetime import datetime

# Save model and preprocessing objects.
# A shared timestamp ties the four files of one run together.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Each file name is built exactly once and reused for both the dump and the
# report, so the printed names cannot drift from the files actually written.
artifacts = [
    ("SVM model", f'svm_model_{accuracy:.4f}_{timestamp}.pkl', best_svm),
    ("Scaler", f'scaler_{timestamp}.pkl', scaler),
    ("Feature selector", f'feature_selector_{timestamp}.pkl', selector),
    ("Label encoder", f'label_encoder_{timestamp}.pkl', label_encoder),
]

for _, artifact_path, artifact in artifacts:
    joblib.dump(artifact, artifact_path)

# The check-mark was stored mis-decoded (UTF-8 bytes read as Mac Roman); restored here.
print("✅ Model and preprocessing objects saved!")
for description, artifact_path, _ in artifacts:
    print(f"   - {description}: {artifact_path}")