In [None]:
#!which python
# Print the kernel's working directory; the CSV files below are opened by relative name.
import os
working_dir = os.getcwd()
print(working_dir)

Import Libraries

In [None]:
import xgboost as xgb             #model algorithm
import shap                       #XAI
import sklearn                    #machine learning toolkit ; build and test ML models
import pandas as pd               #panel data system ; load and prepare data
import numpy as np                #numerical python ; fast numeric calculations
import matplotlib.pyplot as plt   # python plotting ; visualisation tool 
import seaborn as sns             # statistical visualisation tool
import warnings

# Hide deprecation/future notices only; other warning categories keep
# whatever filter settings are already in effect.
for _noisy_category in (DeprecationWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=_noisy_category)

# DEBUGGING: uncomment the next line to re-enable warnings
#warnings.filterwarnings("default")

#Optional 
#print("XGBoost version:", xgb.__version__)
#print("SHAP version:", shap.__version__)
#print("scikit-learn version:", sklearn.__version__)

Load the raw CSVs and run a light EDA

In [None]:
# Read the three raw captures: normal traffic plus the two attack sources.
normal_df, attack1_df, attack2_df = (
    pd.read_csv(csv_name)
    for csv_name in ("Normal_data.csv", "OVS.csv", "metasploitable-2.csv")
)

In [None]:
# Light per-file EDA: shape, schema, label distribution, missing-value count,
# and (for the attack files) whether the columns match the normal capture.
for df, name in zip([normal_df, attack1_df, attack2_df],
                    ["Normal", "OVS", "Metasploitable2"]):
    print(f"--- {name} ---")
    print(df.shape)
    # DataFrame.info() prints its report itself and returns None, so it is
    # called bare; wrapping it in print() emitted an extra "None" line.
    df.info()
    print("Unique labels:", df['Label'].unique())
    print(df['Label'].value_counts())
    print(df.isnull().sum().sum(), "missing values")

    if name != "Normal":
        # Index.equals compares both the column names and their order.
        same_cols = df.columns.equals(normal_df.columns)
        print(f"Columns match Normal dataset: {same_cols}")
    print("\n" + "-"*40 + "\n")

Filter attack datasets, combine datasets, and encode labels

In [None]:
#Filter DDoS/DoS attack traffic from each attack dataset
# Labels are compared after trimming whitespace and lower-casing, so variants
# such as "DDoS " or "dos" are not silently dropped. The stored Label values
# themselves are left unchanged here.
# .copy() makes each result an independent frame, so assigning to its columns
# later does not raise pandas' SettingWithCopyWarning.
dos_labels = ["dos", "ddos"]
attack1_df = attack1_df[
    attack1_df["Label"].astype(str).str.strip().str.lower().isin(dos_labels)
].copy()
attack2_df = attack2_df[
    attack2_df["Label"].astype(str).str.strip().str.lower().isin(dos_labels)
].copy()

#Data integrity check-are attack types filtered ?
print("OVS.csv:")
display(attack1_df.head(10))
print("metasploitable-2.csv:")
display(attack2_df.head(10))

In [None]:
#Shape structure-Data integrity check
# One (rows, cols) tuple per line: OVS first, then metasploitable-2.
print(attack1_df.shape, attack2_df.shape, sep="\n")

In [None]:
# Normalise label text (trim whitespace, lower-case) on every frame.
# assign() returns a new frame rather than writing into a filtered slice,
# which avoids pandas' SettingWithCopyWarning on the attack frames.
normal_df, attack1_df, attack2_df = [
    frame.assign(Label=frame['Label'].astype(str).str.strip().str.lower())
    for frame in (normal_df, attack1_df, attack2_df)
]

combined_df = pd.concat([normal_df, attack1_df, attack2_df], ignore_index=True)

# Binary target: 0 = normal traffic, 1 = DoS or DDoS attack.
label_map = {'normal': 0, 'dos': 1, 'ddos': 1}

# Series.map turns any label missing from label_map into NaN; fail loudly
# here instead of carrying NaN targets into the split/model cells.
unmapped = sorted(set(combined_df['Label']) - set(label_map))
if unmapped:
    raise ValueError(f"Unexpected Label values with no encoding: {unmapped}")

combined_df['Label'] = combined_df['Label'].map(label_map).astype(int)

print(combined_df['Label'].unique())
print(combined_df['Label'].value_counts())

Full EDA on combined dataset 

In [None]:
# === FULL EDA START ===
# NOTE: the marker line above is searched for by the EDA-export cell; keep it verbatim.

#--BASIC INFO
print("\n--- Basic Info ---")
print(combined_df.shape)
print(combined_df.dtypes.value_counts())
combined_df.info()
display(combined_df.head(20))

#--MISSING VALUES
print("\n--- Missing Values ---")
# Total count of missing cells across the whole frame
total_missing = combined_df.isnull().sum().sum()
print(f"Total missing values in combined_df: {total_missing}")
# If anything is missing, list per-column counts (worst first)
if total_missing > 0:
    print("\nMissing values by column:")
    print(combined_df.isnull().sum().sort_values(ascending=False))

#--CLASS DISTRIBUTIONS
print("\n--- Class Distribution ---")
class_counts = combined_df['Label'].value_counts().sort_index()  # ensures 0 then 1
class_percent = combined_df['Label'].value_counts(normalize=True).sort_index() * 100
majority = class_counts.max()
minority = class_counts.min()
ratio = round(majority / minority, 2)
num_neg = class_counts.get(0, 0)
num_pos = class_counts.get(1, 0)
# negatives / positives; undefined (NaN) when there are no positive rows,
# instead of dividing by zero.
scale_pos_weight = round(num_neg / num_pos, 2) if num_pos else float("nan")


summary = pd.DataFrame({
    "Count": class_counts,
    "Percentage (%)": class_percent.round(2)
})
print(" CLASS DISTRIBUTION SUMMARY\n")
print(summary)
print(f"\n Imbalance Ratio (Majority : Minority) = {ratio} : 1")
print(f" Recommended scale_pos_weight for XGBoost = {scale_pos_weight}")

#--SUMMARY STATS
print("\n--- Descriptive Statistics ---")
display(combined_df.describe().T)
display(combined_df.nunique().sort_values(ascending=False).head(20))

#--CORRELATION
print("\n--- Correlation Analysis ---")
corr = combined_df.select_dtypes(include=[np.number]).corr()
corr_label = corr['Label'].sort_values(ascending=False)
print("\nTop correlated features with Label:")
print(corr_label.head(10))
print("\nLeast correlated features:")
print(corr_label.tail(10))

plt.figure(figsize=(10,8))
sns.heatmap(corr, cmap='coolwarm', center=0, square=True)
plt.title('Correlation Heatmap')
plt.tight_layout()
plt.show()

# --- SAMPLING FOR PLOTTING ---
print("\n--- Sampling for Visualization ---")
# Stratified 10% sample per class. GroupBy.sample keeps the 'Label' column in
# the result; groupby(...).apply(lambda x: x.sample(...)) depends on apply
# operating on the grouping column, which pandas has deprecated, and without
# 'Label' the hue/x arguments below would fail.
plot_sample = combined_df.groupby('Label').sample(frac=0.1, random_state=42)

# First ten numeric feature columns (the target itself is excluded)
num_cols = [c for c in plot_sample.select_dtypes(include=[np.number]).columns if c != 'Label'][:10]

# Per-class density histograms; common_norm=False normalises each class separately
for col in num_cols:
    plt.figure()
    sns.histplot(data=plot_sample, x=col, hue='Label', bins=40, kde=False, stat='density', common_norm=False)
    plt.title(f'Distribution: {col}')
    plt.tight_layout()
    plt.show()

# Boxplots by class for the first five of those features
for col in num_cols[:5]:
    plt.figure()
    sns.boxplot(data=plot_sample, x='Label', y=col)
    plt.title(f'Boxplot by Label: {col}')
    plt.tight_layout()
    plt.show()

# --- DATASET SUMMARY ---
print(" Dataset Summary")
print(f" Total Missing values: {combined_df.isnull().sum().sum()}")
print(f"Duplicate rows: {combined_df.duplicated().sum()}")
print("Numeric columns:", len(combined_df.select_dtypes(include=[np.number]).columns))
print("Non-numeric columns:", len(combined_df.select_dtypes(exclude=[np.number]).columns))


In [None]:
#RUN ONLY FOR FULL EDA OUTPUTS
# Writes a standalone notebook made of: a safety prelude, the EDA cell(s)
# copied from this notebook, and a ydata-profiling report cell.

import nbformat
from nbformat.v4 import new_notebook, new_code_cell

# --- Configure these for your project ---
notebook_path = "SDN_MLModel.ipynb"          # <-- rename to your current .ipynb
new_notebook_path = "EDA_only_with_profile.ipynb"   # output notebook name
data_csv = "combined_traffic_.csv"           # fallback CSV to load if combined_df is undefined
marker = "# === FULL EDA START ==="

# --- Load the current notebook ---
with open(notebook_path, "r", encoding="utf-8") as f:
    nb = nbformat.read(f, as_version=4)

eda_cells = []
eda_started = False

for cell in nb.cells:
    # Never copy this export cell itself: it also contains the marker text
    # (in the `marker` string) and would otherwise re-run inside the
    # generated notebook.
    if cell.cell_type == "code" and "import nbformat" in cell.source:
        continue
    if not eda_started:
        if cell.cell_type == "code" and marker in cell.source:
            eda_started = True
            eda_cells.append(cell)
        continue
    # The EDA block ends at the next markdown cell (the next section heading).
    # Copying everything up to the end of the notebook would pull the
    # cleaning and modelling cells into the "EDA only" output.
    if cell.cell_type == "markdown":
        break
    eda_cells.append(cell)

if not eda_cells:
    raise RuntimeError(f"Marker not found: {marker}. Add it just before your EDA block.")

# Safety prelude: ensure combined_df exists (or load from CSV)
prelude_code = f"""# Safety prelude: ensure combined_df exists
try:
    combined_df
except NameError:
    import pandas as pd
    print("combined_df not found in scope. Loading from CSV: '{data_csv}'")
    combined_df = pd.read_csv("{data_csv}")
print("combined_df rows/cols:", combined_df.shape)
"""

# Profiling cell (appended to the end)
profile_code = """# Generate ydata-profiling HTML report
try:
    from ydata_profiling import ProfileReport
except Exception as e:
    print("ydata-profiling not available. Install it with:\\n  pip install ydata-profiling")
    raise

profile = ProfileReport(combined_df, title="InSDN Combined Dataset Report", explorative=True)
profile.to_file("EDA_Report.html")
print("EDA_Report.html generated.")
"""

# Build new notebook: prelude + EDA cells + profiling
cells_out = [new_code_cell(prelude_code)] + eda_cells + [new_code_cell(profile_code)]
eda_nb = new_notebook(cells=cells_out)
eda_nb.metadata = nb.metadata  # preserve kernel / language info

with open(new_notebook_path, "w", encoding="utf-8") as f:
    nbformat.write(eda_nb, f)

print(f"Created '{new_notebook_path}' with {len(cells_out)} cells (including prelude + profiling).")

Data Cleaning

In [None]:
# --- DATA CLEANING ---

# 1. Remove exact duplicate rows and report how many were dropped
before = len(combined_df)
combined_df = combined_df.drop_duplicates()
after = len(combined_df)
print(f"Duplicates removed: {before - after}")

# 2. Remove identifier-style columns, but only those actually present
drop_cols = ['Flow ID', 'Src IP', 'Dst IP', 'Timestamp']
existing_cols = [col_name for col_name in drop_cols if col_name in combined_df.columns]

if not existing_cols:
    print("No identifier columns found to drop.")
else:
    combined_df = combined_df.drop(columns=existing_cols)
    print(f"Dropped columns: {existing_cols}")

# 3. Show the encoded label counts (0 = Normal, 1 = Attack)
print("\nLabel value counts after encoding:")
label_counts = combined_df['Label'].value_counts()
print(label_counts)

# 4. Count any missing cells that remain
remaining_missing = combined_df.isnull().sum().sum()
print(f"\nMissing values remaining: {remaining_missing}")

In [None]:
# ===== FIX LEAKAGE: Content-level dedup + rebuild pipeline =====

# Imported in this cell because it runs before the later cells that import
# these names; without them a fresh-kernel "Run All" stops here with NameError.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 1) Deduplicate by content (hash of all columns, index ignored) & reset index
before = combined_df.shape[0]
combined_df = combined_df.loc[
    ~pd.util.hash_pandas_object(combined_df, index=False).duplicated()
].reset_index(drop=True)
after = combined_df.shape[0]
print(f"Content-level duplicates removed: {before - after}")
print(f"Shape after content-dedup: {combined_df.shape}")

# 2) Rebuild X/y from the cleaned dataframe
y = combined_df['Label']
X = combined_df.drop(columns=['Label'])

# 3) Re-scale features from scratch
# NOTE(review): the scaler is fitted on all rows before the train/test split
# below, so test-row statistics influence the scaling. The cross-validation
# cell fits the scaler per fold instead; confirm which is intended here.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

# 4) Recompute correlation reduction (|corr| >= 0.9)
# Only the strict upper triangle is kept, so for each correlated pair the
# later column is the one flagged for removal.
corr = X_scaled_df.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
high_corr_features = [c for c in upper.columns if any(upper[c] >= 0.9)]

print(f"Correlated features to drop (≥0.9): {len(high_corr_features)}")
X_refined_df = X_scaled_df.drop(columns=high_corr_features)
print(f"Final feature matrix after reduction: {X_refined_df.shape}")

# 5) Re-split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X_refined_df, y,
    test_size=0.2, stratify=y, random_state=42
)

print(f"Training set: {X_train.shape}")
print(f"Testing set:  {X_test.shape}")

# 6) Re-check for identical feature rows shared by train and test.
#    Rows are compared on the pruned feature columns only, so rows that
#    differed only in a dropped column can still be counted here.
train_hashes = pd.util.hash_pandas_object(X_train, index=False)
test_hashes  = pd.util.hash_pandas_object(X_test,  index=False)
overlap_count = np.intersect1d(train_hashes.values, test_hashes.values).size
print(f"Row overlap between train and test: {overlap_count}")

Preprocessing

In [None]:
from sklearn.preprocessing import StandardScaler

# --- FEATURE SCALING ---

# 1. Split the frame into target (y) and feature matrix (X)
y = combined_df['Label']
X = combined_df.drop(columns=['Label'])

# 2. Standardize every feature column, then wrap the array back into a
#    DataFrame so the original column names are kept
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

print(f"Scaled feature matrix shape: {X_scaled_df.shape}")
print(f"Scaled feature sample:\n{X_scaled_df.head(5)}")

# --- CORRELATION REDUCTION ---

# 3. Absolute pairwise correlations between the scaled features
corr_matrix = X_scaled_df.corr().abs()

# 4. Keep only the strict upper triangle so each pair is examined once, then
#    flag every column that has |corr| >= 0.9 with an earlier column
strict_upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper_tri = corr_matrix.where(strict_upper_mask)
is_redundant = (upper_tri >= 0.9).any(axis=0)
high_corr_features = list(upper_tri.columns[is_redundant])

print(f"\nHighly correlated features to drop (|corr| ≥ 0.9):")
print(high_corr_features)

# 5. Remove the flagged columns to obtain the refined feature matrix
X_refined_df = X_scaled_df.drop(columns=high_corr_features)
print(f"\nFinal feature matrix shape after correlation reduction: {X_refined_df.shape}")

In [None]:
from sklearn.model_selection import train_test_split

# --- DATA PARTITIONING ---

# Features: the scaled, correlation-pruned matrix. Target: the y already
# defined from the Label column in an earlier cell.
X = X_refined_df

# 1. Stratified 80/20 split so both parts keep the overall class ratio
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# 2. Class share (in percent) inside each split
train_dist = y_train.value_counts(normalize=True) * 100
test_dist = y_test.value_counts(normalize=True) * 100

n_train, n_train_features = X_train.shape
n_test, n_test_features = X_test.shape
print(f"Training set: {n_train} samples, {n_train_features} features")
print(f"Testing set:  {n_test} samples, {n_test_features} features")

print("\nClass distribution (%):")
class_share = pd.DataFrame({
    'Train (%)': train_dist.round(2),
    'Test (%)': test_dist.round(2)
})
print(class_share)

In [None]:
#CONFIRMS NO DATA LEAKAGE
# Hash every feature row (index ignored) and count the distinct hash values
# that occur in both the training and the test split.

train_hashes, test_hashes = (
    pd.util.hash_pandas_object(split, index=False) for split in (X_train, X_test)
)

shared_hashes = np.intersect1d(train_hashes.values, test_hashes.values)
overlap_count = shared_hashes.size
print(f"Row overlap between train and test: {overlap_count}")

Model Build

In [None]:
# --- BASELINE XGBOOST TRAINING & EVALUATION ---

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    RocCurveDisplay,
)

# 1. Baseline classifier: only objective, eval metric, seed and n_jobs are set
baseline_params = dict(
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1,
)
xgb_model = xgb.XGBClassifier(**baseline_params)

# 2. Fit on the training split
xgb_model.fit(X_train, y_train)

# 3. Hard predictions plus the predicted probability of class 1
y_pred = xgb_model.predict(X_test)
y_pred_proba = xgb_model.predict_proba(X_test)[:, 1]

# 4. Score on the held-out split (ROC-AUC uses probabilities, the rest use labels)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# 5. Tabulate the metrics
metric_scores = {
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1,
    'ROC-AUC': roc_auc,
}
results = pd.DataFrame({
    'Metric': list(metric_scores.keys()),
    'Score': list(metric_scores.values()),
})
print("\nModel Performance Metrics:\n")
print(results.to_string(index=False))

# 6. Confusion matrix
cm = confusion_matrix(y_test, y_pred)
class_names = ['Normal (0)', 'Attack (1)']
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot(cmap='Blues', values_format='d')
plt.title("Confusion Matrix - Baseline XGBoost")
plt.show()

# 7. ROC curve drawn from the fitted estimator
RocCurveDisplay.from_estimator(xgb_model, X_test, y_test)
plt.title("ROC Curve - Baseline XGBoost")
plt.show()

In [None]:

#LEAKAGE SANITY CHECKS

# Invariants
assert 'Label' not in X_refined_df.columns, "Label leaked into features!"
print("No 'Label' column in features")

# Train/test overlap (should be 0): distinct row hashes present in both splits
train_hash = pd.util.hash_pandas_object(X_train, index=False).values
test_hash  = pd.util.hash_pandas_object(X_test,  index=False).values
overlap = np.intersect1d(train_hash, test_hash).size
print(f"Row overlap between train and test: {overlap}")

# Binary-like columns suspiciously similar to Label
# X_refined_df holds standardized values, so a 0/1 flag no longer takes the
# values 0 and 1 there and this scan could never find it. The unscaled
# columns of combined_df (same column names) are inspected instead.
raw_features = combined_df[list(X_refined_df.columns)]
assert len(raw_features) == len(y), "combined_df and y are out of sync; re-run the preprocessing cells."
bin_like = [c for c in raw_features.columns if set(np.unique(raw_features[c])).issubset({0,1})]
if bin_like:
    # Fraction of rows where the feature value equals the label, compared by position
    eq_rates = {c: (raw_features[c].values == y.values).mean() for c in bin_like}
    sus = sorted(eq_rates.items(), key=lambda x: x[1], reverse=True)[:10]
    print("Binary-like columns most similar to Label:", sus)
else:
    print("No binary-like feature columns detected.")

# Single-feature “too perfect” scorers (AUC per feature on TRAIN)
_auc = {}
_auc_skipped = {}
for c in X_train.columns:
    try:
        _auc[c] = roc_auc_score(y_train, X_train[c])
    except Exception as err:
        # Stay best-effort, but record the failure instead of hiding it
        _auc_skipped[c] = str(err)

if _auc_skipped:
    print(f"AUC could not be computed for {len(_auc_skipped)} feature(s): {list(_auc_skipped)}")

# Rank by distance from 0.5 so strong inverse predictors (AUC near 0) surface too
top_auc = sorted(_auc.items(), key=lambda x: abs(x[1]-0.5), reverse=True)[:10]
print("Top single-feature AUCs on TRAIN (watch for ~1.0 or ~0.0):")
for k,v in top_auc:
    print(f"{k}: {v:.4f}")

In [None]:
#XGBOOST FEATURE IMPORTANCE
# Pair the fitted model's importances with the training column names, largest first.

importance_by_feature = pd.Series(xgb_model.feature_importances_, index=X_train.columns)
fi = importance_by_feature.sort_values(ascending=False)

top20 = fi.head(20)
print("Top 20 features by importance:\n")
print(top20)

# Horizontal bars, sorted ascending so the most important feature is drawn at the top
fig, ax = plt.subplots(figsize=(8,6))
top20.sort_values().plot(kind='barh', ax=ax)
ax.set_title("Top 20 Feature Importances (XGBoost)")
plt.tight_layout()
plt.show()

In [None]:
#SHAP EXPLAINABILITY

# Cap the explained rows at 5000 test samples to keep the run fast
sample_size = min(5000, len(X_test))
shap_sample = X_test.sample(n=sample_size, random_state=42)

# Tree explainer over the fitted baseline model
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(shap_sample)

# Global summary plot across all sampled rows
shap.summary_plot(shap_values, shap_sample, show=True)

# Dependence plot for the feature ranked first in `fi` (the importance series)
top_feat = fi.index[0]
shap.dependence_plot(top_feat, shap_values, shap_sample, show=True)

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Columns that survived correlation pruning
selected_cols = list(X_refined_df.columns)

# Take those columns UN-SCALED from combined_df; scaling happens inside the
# pipeline so the scaler is fitted per fold
X_unscaled = combined_df[selected_cols]
y_binary   = combined_df['Label']

metric_names = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

fold_classifier = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1
)
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('clf', fold_classifier),
])

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_validate(
    pipe, X_unscaled, y_binary,
    cv=cv,
    scoring=metric_names,
    n_jobs=-1
)

# One column per metric, one row per fold
cv_results = pd.DataFrame({metric: scores[f'test_{metric}'] for metric in metric_names})

fold_mean = cv_results.mean().round(6).astype(str)
fold_std = cv_results.std().round(6).astype(str)

print("\n5-Fold CV Results (mean ± std):\n")
print(fold_mean + " ± " + fold_std)

In [None]:
# --- SHAP INTERACTION ANALYSIS  ---

# 0) Cap the explained rows at 4000 test samples for speed
interaction_sample_size = min(4000, len(X_test))
shap_sample = X_test.sample(n=interaction_sample_size, random_state=42)

# 1) Tree explainer over the fitted XGBoost model
explainer = shap.TreeExplainer(xgb_model)

# 2) Per-feature SHAP values, used by the dependence plot below
shap_values = explainer.shap_values(shap_sample)

# 3) Pairwise interaction values, used by the interaction summaries
interaction_values = explainer.shap_interaction_values(shap_sample)

# When SHAP hands back a list, keep its first element
if isinstance(interaction_values, list):
    interaction_values = interaction_values[0]
if isinstance(shap_values, list):
    shap_values = shap_values[0]

# 4) Global interaction summary plot
shap.summary_plot(interaction_values, shap_sample, show=True)

# 5) Rank feature pairs by mean |interaction| across the sampled rows
abs_inter = np.abs(interaction_values).mean(axis=0)
np.fill_diagonal(abs_inter, 0.0)   # a feature paired with itself is not an interaction

feature_names = list(shap_sample.columns)
# Strict upper triangle = every unordered pair (i < j) exactly once, in row-major order
upper_rows, upper_cols = np.triu_indices(len(feature_names), k=1)
pairs = [
    (feature_names[a], feature_names[b], float(abs_inter[a, b]))
    for a, b in zip(upper_rows, upper_cols)
]

pair_table = pd.DataFrame(pairs, columns=["Feature A", "Feature B", "Mean |Interaction|"])
top_pairs = pair_table.sort_values("Mean |Interaction|", ascending=False).head(10)

print("\nTop 10 feature–feature interactions (by mean |interaction|):\n")
print(top_pairs.to_string(index=False))

# 6) Dependence plot for the strongest pair: SHAP values of feature A,
#    with feature B passed as the interaction_index
strongest_pair = top_pairs.iloc[0]
topA, topB = strongest_pair["Feature A"], strongest_pair["Feature B"]
shap.dependence_plot(topA, shap_values, shap_sample, interaction_index=topB, show=True)

# 7) Heatmap restricted to the M features with the largest summed interaction strength
M = min(12, len(feature_names))
total_inter_strength = abs_inter.sum(axis=0)
top_idx = np.argsort(-total_inter_strength)[:M]
top_names = [feature_names[i] for i in top_idx]

plt.figure(figsize=(8, 6))
plt.imshow(abs_inter[np.ix_(top_idx, top_idx)], aspect='auto', cmap='coolwarm')
plt.colorbar(label='Mean |interaction|', fraction=0.046, pad=0.04)
plt.xticks(range(M), top_names, rotation=90)
plt.yticks(range(M), top_names)
plt.title("Top Feature–Feature Interaction Strengths", fontsize=13)
plt.tight_layout()
plt.show()

# 8) Shape sanity checks: rows match the sample, columns match the features
n_rows, n_feats = shap_sample.shape
assert n_rows == shap_values.shape[0], "Mismatch: samples vs shap_values rows."
assert n_feats == shap_values.shape[1], "Mismatch: features vs shap_values cols."
assert (
    interaction_values.shape[0] == n_rows
    and interaction_values.shape[1] == n_feats
    and interaction_values.shape[2] == n_feats
), "Interaction tensor shape is unexpected."