In [None]:
# Cell 1 - imports

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split


# Seed passed as random_state to the stratified train/test split in Cell 6.
random_seed = 42


In [None]:
# Cell 2 - Load Data

# Directory prefix (with trailing slash) that is prepended to every file name.
data_dir_name = "data/"
# One CSV per capture session, named by weekday and traffic scenario.
file_list = [
    "Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv",
    "Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv",
    "Friday-WorkingHours-Morning.pcap_ISCX.csv",
    "Monday-WorkingHours.pcap_ISCX.csv",
    "Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv",
    "Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv",
    "Tuesday-WorkingHours.pcap_ISCX.csv",
    "Wednesday-workingHours.pcap_ISCX.csv",
]

# Read every CSV and stack the rows into a single DataFrame.
# ignore_index=True gives the result one fresh 0..n-1 index instead of
# repeating each file's own row numbers.
data_frames = [pd.read_csv(data_dir_name + file_name) for file_name in file_list]
combined_data = pd.concat(data_frames, ignore_index=True)


print("Combined data shape:", combined_data.shape)
print("Columns in the dataset:", combined_data.columns.tolist())
# Display first few rows of the combined dataset
print(combined_data.head())

In [None]:
# Cell 3 - Visualize the distribution of the target variable

# Strip surrounding whitespace from the column names, in case the raw CSV
# headers carry padding, so that 'Label' can be addressed by its exact name.
combined_data.columns = combined_data.columns.str.strip()

# Apply the same clean-up to the label values themselves.
combined_data['Label'] = combined_data['Label'].str.strip()

# Number of records per traffic label.
label_counts = combined_data['Label'].value_counts()

# Bar chart of the class distribution.
plt.figure(figsize=(12, 6))
sns.barplot(x=label_counts.index, y=label_counts.values)
plt.xticks(rotation=45, ha='right')
# combined_data holds every file loaded in Cell 2, not only the Wednesday
# capture, so the title refers to the combined data set.
plt.title("Distribution of Traffic Types in Combined Dataset")
plt.xlabel("Traffic Label")
plt.ylabel("Number of Records")
plt.grid(True)
plt.tight_layout()
plt.show()

# Raw counts, to read off the class ratios.
print(label_counts)

In [None]:
# Cell 4

# Identifier / bookkeeping columns removed before modelling to avoid data
# leakage. Names that are not present in the frame are simply skipped.
columns_to_drop = ['Flow ID', 'Source IP', 'Destination IP', 'Timestamp', 'Fwd Header Length.1']  # adjust as needed

present_cols = [name for name in columns_to_drop if name in combined_data.columns]
combined_data.drop(columns=present_cols, inplace=True)

# Remove columns that carry no information: at most one distinct value.
nunique = combined_data.nunique()
constant_cols = [name for name, n_distinct in nunique.items() if n_distinct <= 1]
combined_data.drop(columns=constant_cols, inplace=True)

# Report what is left.
print(f"Remaining columns: {combined_data.columns.tolist()}")
print(f"Data shape after dropping columns: {combined_data.shape}")

In [None]:
# Cell 5 - Handle Missing Values

# Count null cells per column and show only the columns that have any.
missing_values = combined_data.isna().sum()
has_missing = missing_values > 0
print("Missing values in each column:\n", missing_values[has_missing])
# Simplest policy: discard every row that contains at least one null.
combined_data.dropna(axis=0, how='any', inplace=True)
print("Data shape after dropping missing values:", combined_data.shape)


In [None]:
# Cell 6 - Data Split
# 80/20 train/test split. Stratifying on 'Label' keeps each label's share
# (its proportion, not an equal count) the same in both partitions.
label_column = combined_data['Label']
train_data, test_data = train_test_split(
    combined_data,
    test_size=0.2,
    random_state=random_seed,
    stratify=label_column,
)

for caption, part in (("Training data shape:", train_data),
                      ("Testing data shape:", test_data)):
    print(caption, part.shape)
for caption, part in (("Training data label distribution:\n", train_data),
                      ("Testing data label distribution:\n", test_data)):
    print(caption, part['Label'].value_counts())




In [None]:
# Cell 7 - OCSVM Training Block (Fixed)

from sklearn.svm import OneClassSVM
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

print("="*60)
print("DATA CLEANING AND PREPROCESSING")
print("="*60)

# Binary target: 0 for rows labelled 'BENIGN', 1 for every other label.
# Note that this adds an 'Attack' column to combined_data itself.
combined_data['Attack'] = (combined_data['Label'] != 'BENIGN').astype(int)

# Features are every column except the two target columns.
X = combined_data.drop(['Label', 'Attack'], axis=1)
y = combined_data['Attack']

print(f"Initial data shape: {X.shape}")

# NOTE(review): the imputation values (Step 3) and the capping percentiles
# (Step 5) are computed on the full data set, before the train/test split that
# follows this stage, so test rows influence them -- confirm this is acceptable.

# Step 1: Check for and handle infinite values
print("\nStep 1: Checking for infinite values...")
inf_mask = np.isinf(X.values)
inf_count = np.sum(inf_mask)
print(f"Total infinite values found: {inf_count}")

if inf_count > 0:
    # Reuse the mask instead of scanning the whole matrix a second time.
    inf_cols = X.columns[inf_mask.any(axis=0)]
    print(f"Columns with infinite values: {list(inf_cols)}")

    # Turn +/-inf into NaN so that Step 3 imputes them like any other gap.
    X = X.replace([np.inf, -np.inf], np.nan)
    print("Infinite values replaced with NaN")

# Step 2: Check for extremely large values (report only; capping is Step 5)
print("\nStep 2: Checking for extremely large values...")
large_threshold = 1e10
large_mask = np.abs(X.values) > large_threshold
large_count = np.sum(large_mask)
print(f"Values larger than {large_threshold}: {large_count}")

if large_count > 0:
    large_cols = X.columns[large_mask.any(axis=0)]
    print(f"Columns with extremely large values: {list(large_cols)}")

# Step 3: Impute NaN values column by column
print("\nStep 3: Handling NaN values...")
initial_nan_count = X.isnull().sum().sum()
print(f"Initial NaN values: {initial_nan_count}")

if initial_nan_count > 0:
    print("NaN values per column:")
    nan_cols = X.isnull().sum()
    print(nan_cols[nan_cols > 0])

    print("\nApplying comprehensive NaN handling...")

    # Fill order per column: median, then mean, then 0.
    # The filled column is assigned back explicitly. Calling
    # X[col].fillna(..., inplace=True) is a chained in-place assignment, which
    # newer pandas versions warn about and which does not modify X under
    # copy-on-write.
    for col in nan_cols[nan_cols > 0].index:
        median_val = X[col].median()
        if not np.isnan(median_val):
            X[col] = X[col].fillna(median_val)
            print(f"  Filled NaNs in '{col}' with median: {median_val:.2f}")
        else:
            # If median is also NaN, use mean
            mean_val = X[col].mean()
            if not np.isnan(mean_val):
                X[col] = X[col].fillna(mean_val)
                print(f"  Filled NaNs in '{col}' with mean: {mean_val:.2f}")
            else:
                # If both median and mean are NaN, use 0
                X[col] = X[col].fillna(0)
                print(f"  Filled NaNs in '{col}' with 0")

# Step 4: Remove any remaining problematic rows
print("\nStep 4: Final NaN cleanup...")
remaining_nans = X.isnull().sum().sum()
print(f"Remaining NaN values: {remaining_nans}")

if remaining_nans > 0:
    print("Dropping rows with remaining NaN values...")
    initial_rows = len(X)
    # Keep X and y aligned by selecting the same surviving index labels in both.
    clean_indices = X.dropna().index
    X = X.loc[clean_indices]
    y = y.loc[clean_indices]
    print(f"Dropped {initial_rows - len(X)} rows with NaN values")
    print(f"Remaining data shape: {X.shape}")

# Step 5: Cap extremely large values using percentiles
print("\nStep 5: Capping extreme values...")
for col in X.columns:
    # 1st / 99th percentiles of the column
    q01 = X[col].quantile(0.01)
    q99 = X[col].quantile(0.99)

    # Only columns whose percentiles themselves exceed +/-1e8 are clipped.
    if q99 > 1e8 or q01 < -1e8:
        print(f"Capping extreme values in '{col}': [{q01:.2e}, {q99:.2e}]")
        X[col] = X[col].clip(lower=q01, upper=q99)

# Step 6: Final validation
print("\nStep 6: Final data validation...")
# Materialise the matrix once and reuse it for every check below.
final_values = X.values
has_inf = np.isinf(final_values).any()
nan_total = X.isnull().sum().sum()
print(f"Final data shape: {X.shape}")
print(f"Any infinite values: {has_inf}")
print(f"Any NaN values: {nan_total}")
print(f"Data range: [{final_values.min():.2e}, {final_values.max():.2e}]")

# Ensure we have valid data
assert not has_inf, "Still have infinite values!"
assert nan_total == 0, "Still have NaN values!"
del final_values
print("✓ Data validation passed!")

# Split data
print("\n" + "="*60)
print("DATA SPLITTING")
print("="*60)

# 70/30 split, stratified on the binary target so both partitions keep the
# same attack share. The notebook-wide random_seed from Cell 1 is used instead
# of a second hard-coded literal, so the seed is configured in one place.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=random_seed, stratify=y
)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print(f"Training set class distribution:\n{y_train.value_counts()}")

# The one-class model is fitted on benign traffic only: keep the training
# rows whose target is 0. The copy detaches the subset from X_train.
X_train_benign = X_train[y_train == 0].copy()
print(f"OCSVM training data (benign only) shape: {X_train_benign.shape}")

# Feature Scaling
print("\n" + "="*60)
print("FEATURE SCALING")
print("="*60)

# StandardScaler is the first choice; any failure switches to RobustScaler.
# Either way the scaler is fitted on the benign training rows only and then
# applied unchanged to the test rows.
scaler = StandardScaler()

print("Applying StandardScaler...")
try:
    X_train_benign_scaled = scaler.fit_transform(X_train_benign)
    X_test_scaled = scaler.transform(X_test)

    print("✓ Feature scaling completed successfully!")
    scaled_lo = X_train_benign_scaled.min()
    scaled_hi = X_train_benign_scaled.max()
    print(f"Scaled training data shape: {X_train_benign_scaled.shape}")
    print(f"Scaled data range: [{scaled_lo:.2f}, {scaled_hi:.2f}]")
    print(f"Scaled data mean: {X_train_benign_scaled.mean():.2f}")
    print(f"Scaled data std: {X_train_benign_scaled.std():.2f}")

    # A non-finite value (inf or NaN) in the scaled training matrix counts as
    # a failure and triggers the fallback below.
    if not np.isfinite(X_train_benign_scaled).all():
        raise ValueError("Scaled data contains inf or NaN values")

except Exception as scaling_error:
    print(f"StandardScaler failed: {scaling_error}")
    print("Trying RobustScaler...")

    # Refit from scratch with the fallback scaler; both outputs are replaced.
    scaler = RobustScaler()
    X_train_benign_scaled = scaler.fit_transform(X_train_benign)
    X_test_scaled = scaler.transform(X_test)
    print("✓ RobustScaler applied successfully!")

# OCSVM Training
print("\n" + "="*60)
print("TRAINING ONE-CLASS SVM MODEL")
print("="*60)

# Keyword arguments handed to OneClassSVM (no random_state entry).
ocsvm_params = dict(
    kernel='rbf',
    gamma='scale',
    nu=0.05,            # Fraction of training errors
    shrinking=True,
    cache_size=1000,    # Increased cache for better performance
    verbose=False,
    max_iter=-1,        # No limit on iterations
)

print("OCSVM Parameters:")
for param in ocsvm_params:
    print(f"  {param}: {ocsvm_params[param]}")

print("\nInitializing One-Class SVM...")
ocsvm_model = OneClassSVM(**ocsvm_params)

n_benign, n_features = X_train_benign_scaled.shape
print(f"Training on {n_benign:,} benign samples...")
print(f"Number of features: {n_features}")

# NOTE(review): the full benign set is tried first and the subsample below is
# used only if fit() raises -- confirm that a full fit is feasible at this
# data size, since a slow fit never triggers the fallback.
try:
    print("Training in progress...")
    ocsvm_model.fit(X_train_benign_scaled)

    print("✓ OCSVM training completed successfully!")
    n_sv = ocsvm_model.n_support_[0]
    print(f"Number of support vectors: {n_sv:,}")
    print(f"Support vector ratio: {n_sv / len(X_train_benign_scaled):.4f}")

except Exception as training_error:
    print(f"Training failed: {training_error}")
    print("Trying with smaller sample size...")

    # Fallback: refit the same estimator on at most 100,000 rows drawn
    # without replacement, with the global NumPy RNG seeded for repeatability.
    sample_size = min(100000, len(X_train_benign_scaled))
    np.random.seed(42)
    sample_indices = np.random.choice(len(X_train_benign_scaled), sample_size, replace=False)
    X_sample = X_train_benign_scaled[sample_indices]

    print(f"Training on reduced sample: {sample_size:,} samples")
    ocsvm_model.fit(X_sample)
    print("✓ Training completed on reduced dataset!")

# Model Evaluation
print("\n" + "="*60)
print("MODEL EVALUATION")
print("="*60)

# Evaluation is best-effort: an error here is reported but does not stop the
# statements that follow (saving the model, printing the summary).
try:
    print("Making predictions on test set...")
    # Score the test set once. OneClassSVM.predict() is the sign of
    # decision_function() (positive -> inlier +1, otherwise outlier -1), so the
    # labels are derived from the scores instead of evaluating the kernel over
    # the whole test set a second time.
    decision_scores = ocsvm_model.decision_function(X_test_scaled)
    y_pred_ocsvm = np.where(decision_scores > 0, 1, -1)

    # Convert OCSVM predictions: 1 (inlier/normal) -> 0 (benign), -1 (outlier/anomaly) -> 1 (attack)
    y_pred_binary = np.where(y_pred_ocsvm == 1, 0, 1)

    print("Calculating performance metrics...")

    # Basic metrics (positive class = 1 = attack)
    accuracy = accuracy_score(y_test, y_pred_binary)
    precision = precision_score(y_test, y_pred_binary, zero_division=0)
    recall = recall_score(y_test, y_pred_binary, zero_division=0)
    f1 = f1_score(y_test, y_pred_binary, zero_division=0)

    print(f"\n📊 OCSVM Performance Metrics:")
    print(f"Accuracy:  {accuracy:.4f}")
    print(f"Precision: {precision:.4f} (Attack detection precision)")
    print(f"Recall:    {recall:.4f} (Attack detection rate)")
    print(f"F1-Score:  {f1:.4f}")

    # Detailed classification report
    print(f"\n📋 Detailed Classification Report:")
    print(classification_report(y_test, y_pred_binary,
                              target_names=['Benign', 'Attack'],
                              zero_division=0))

    # Confusion matrix. labels=[0, 1] pins the layout to 2x2 (rows = actual,
    # columns = predicted) so the indexing and the 4-way unpack below are
    # valid even if one class is absent from both y_test and the predictions.
    cm = confusion_matrix(y_test, y_pred_binary, labels=[0, 1])
    print(f"\n🎯 Confusion Matrix:")
    print("           Predicted")
    print("         Benign  Attack")
    print(f"Actual Benign   {cm[0,0]:6d}  {cm[0,1]:6d}")
    print(f"       Attack   {cm[1,0]:6d}  {cm[1,1]:6d}")

    # Additional metrics; each ratio falls back to 0 when its denominator is 0.
    tn, fp, fn, tp = cm.ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0

    print(f"\n📈 Additional Metrics:")
    print(f"True Positives (Attacks detected):     {tp:,}")
    print(f"True Negatives (Benign correctly):    {tn:,}")
    print(f"False Positives (False alarms):       {fp:,}")
    print(f"False Negatives (Missed attacks):     {fn:,}")
    print(f"Specificity (True negative rate):     {specificity:.4f}")
    print(f"False Positive Rate:                  {fpr:.4f}")
    print(f"False Negative Rate:                  {fnr:.4f}")

    # Decision scores (computed once at the top of this block)
    print(f"\n🎲 Decision Scores Statistics:")
    print(f"Range:  [{np.min(decision_scores):.4f}, {np.max(decision_scores):.4f}]")
    print(f"Mean:   {np.mean(decision_scores):.4f}")
    print(f"Median: {np.median(decision_scores):.4f}")
    print(f"Std:    {np.std(decision_scores):.4f}")

    # Prediction distribution
    pred_counts = pd.Series(y_pred_binary).value_counts().sort_index()
    print(f"\n📊 Prediction Distribution:")
    print(f"Predicted Benign:  {pred_counts.get(0, 0):,}")
    print(f"Predicted Attack:  {pred_counts.get(1, 0):,}")

except Exception as e:
    print(f"❌ Evaluation failed: {e}")
    print("Model was trained but evaluation encountered an error.")

# Save the model
print("\n" + "="*60)
print("SAVING MODEL AND SCALER")
print("="*60)

# Output paths (relative to the working directory) for the three artifacts.
model_filename = 'ocsvm_intrusion_model.pkl'
scaler_filename = 'ocsvm_feature_scaler.pkl'
feature_names_file = 'feature_names.pkl'

# Saving is best-effort: the first failure is reported and the rest skipped.
try:
    import pickle

    # (object to pickle, destination path, caption used in the log line),
    # written in this order: model, scaler, feature-name list.
    artifacts = (
        (ocsvm_model, model_filename, "Model"),
        (scaler, scaler_filename, "Scaler"),
        (list(X.columns), feature_names_file, "Feature names"),
    )
    for payload, target_path, caption in artifacts:
        with open(target_path, 'wb') as sink:
            pickle.dump(payload, sink)
        print(f"✓ {caption} saved to: {target_path}")

except Exception as save_error:
    print(f"❌ Error saving files: {save_error}")

# Closing recap of the row / feature counts at each stage.
# NOTE(review): this banner is printed unconditionally, including when the
# evaluation or saving steps above reported an error in their except branch.
summary_lines = [
    "\n🎉 OCSVM Training Pipeline Completed Successfully!",
    "📝 Summary:",
    f"   • Original data: {len(combined_data):,} samples",
    f"   • Clean data: {len(X):,} samples",
    f"   • Training samples (benign): {len(X_train_benign):,}",
    f"   • Test samples: {len(X_test):,}",
    f"   • Features: {X.shape[1]}",
    "   • Model: One-Class SVM (RBF kernel)",
]
for summary_line in summary_lines:
    print(summary_line)
