In [None]:
pip install pandas numpy scikit-learn lightgbm




In [None]:
# Cell 0 — Imports
import re
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# If you haven't installed imbalanced-learn yet, run the next cell once:
# !pip install imbalanced-learn
from imblearn.under_sampling import RandomUnderSampler


In [None]:
file_path = "/content/drive/MyDrive/Bootcamp_ML_Data_Science/Capstone_Project/model_with_out_source_data/data_extraction/merged_shuffled_20250822_185836.csv"


In [None]:
head_df = pd.read_csv(file_path, nrows=5000)

  head_df = pd.read_csv(file_path, nrows=5000)


In [None]:
target_col = "Label"
selected_attacks = ["SSDP", "Evil_Twin", "Krack", "Deauth", "(Re)Assoc", "RogueAP"]

In [None]:
cols = head_df.columns.tolist()
assert target_col in cols, "Label not found!"

allow_prefixes = ("frame.", "radiotap.", "wlan.", "wlan_radio.")

block_exact = set([
    "frame.number", "frame.time", "frame.time_epoch",
    "radiotap.mactime", "radiotap.present.tsft", "radiotap.timestamp.ts",
    "wlan.bssid","wlan.da","wlan.ra","wlan.sa","wlan.ta","wlan.ssid","wlan.tag","wlan.tag.length",
    "wlan.analysis.kck","wlan.analysis.kek","wlan.rsn.ie.gtk.key","wlan.rsn.ie.igtk.key","wlan.rsn.ie.pmkid",
    "wlan.fixed.timestamp","wlan_rsna_eapol.keydes.msgnr","wlan_rsna_eapol.keydes.data",
    "wlan_rsna_eapol.keydes.data_len","wlan_rsna_eapol.keydes.key_info.key_mic","wlan_rsna_eapol.keydes.nonce",
])

block_patterns = [
    r"(?i)\b(bssid|ssid|mac|addr|oui|vendor|station|ra|ta|sa|da)\b",
    r"(?i)^(ip\.|ipv6\.|arp|tcp\.|udp\.|dns|http|json|ssh|tls|smb2?|nbns|nbss|ldap|dhcp|mdns|ssdp)\b",
    r"(?i)(payload|data\.data|llc|eapol|key|nonce|pmkid|gtk|igtk|kck|kek)",
    r"(?i)(pcap|source_file|capture|interface)",
    r"(?i)(start_tsf|end_tsf|timestamp)",
]

def allowed(col: str) -> bool:
    if col == target_col:
        return True
    low = col.lower()
    if not low.startswith(allow_prefixes):
        return False
    if col in block_exact:
        return False
    for pat in block_patterns:
        if re.search(pat, low):
            return False
    return True

In [None]:
# Cell 3 — STEP 2: Compute keep_cols and load full data with only those
keep_cols = [c for c in cols if allowed(c)]
if target_col not in keep_cols:
    keep_cols.append(target_col)

print("عدد الأعمدة قبل:", len(cols), "| بعد:", len(keep_cols))
print("أول 30 عمود:", keep_cols[:30])

df = pd.read_csv(file_path, usecols=keep_cols)
df.replace("?", np.nan, inplace=True)


عدد الأعمدة قبل: 255 | بعد: 39
أول 30 عمود: ['frame.encap_type', 'frame.len', 'frame.time_delta', 'frame.time_delta_displayed', 'frame.time_relative', 'radiotap.channel.flags.cck', 'radiotap.channel.flags.ofdm', 'radiotap.channel.freq', 'radiotap.datarate', 'radiotap.dbm_antsignal', 'radiotap.length', 'radiotap.rxflags', 'radiotap.vendor_oui', 'wlan.duration', 'wlan.country_info.fnm', 'wlan.country_info.code', 'wlan.fc.ds', 'wlan.fc.frag', 'wlan.fc.order', 'wlan.fc.moredata', 'wlan.fc.protected', 'wlan.fc.pwrmgt', 'wlan.fc.type', 'wlan.fc.retry', 'wlan.fc.subtype', 'wlan.fcs.bad_checksum', 'wlan.fixed.beacon', 'wlan.fixed.capabilities.ess', 'wlan.fixed.capabilities.ibss', 'wlan.fixed.reason_code']


  df = pd.read_csv(file_path, usecols=keep_cols)


In [None]:
# Cell 4 — STEP 3: Filter to Normal + selected attacks, map to binary (0/1)
mask = (df[target_col] == "Normal") | (df[target_col].isin(selected_attacks))
df = df.loc[mask].copy()

# Binary target: 0 = Normal, 1 = Attack
df["Binary_Label"] = (df[target_col] != "Normal").astype(int)

# Separate features/target
feature_cols = [c for c in df.columns if c not in [target_col, "Binary_Label"]]
X = df[feature_cols].copy()
y = df["Binary_Label"].copy()

print("Class counts (overall):")
print(y.value_counts().rename({0: "Normal", 1: "Attack"}))


Class counts (overall):
Binary_Label
Normal    473728
Attack     69710
Name: count, dtype: int64


In [None]:
# Cell ROS-1 — Install imbalanced-learn (run once if needed)
# !pip install imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# Target 85% Normal / 15% Attack:
desired_attack_prop = 0.15
ratio = desired_attack_prop / (1 - desired_attack_prop)  # ≈ 0.17647

ros = RandomOverSampler(sampling_strategy=ratio, random_state=42)
X_bal, y_bal = ros.fit_resample(X, y)

print("Overall counts AFTER ROS to ~85/15:")
print(y_bal.value_counts().rename({0: "Normal", 1: "Attack"}))


Overall counts AFTER ROS to ~85/15:
Binary_Label
Normal    473728
Attack     83599
Name: count, dtype: int64


In [None]:
# Cell SPLIT — Train/Test split AFTER ROS (keep stratification)
from sklearn.model_selection import train_test_split

X_tr, X_te, y_tr, y_te = train_test_split(
    X_bal, y_bal, test_size=0.2, random_state=42, stratify=y_bal
)

print("Train counts:")
print(y_tr.value_counts().rename({0: "Normal", 1: "Attack"}))
print("\nTest counts:")
print(y_te.value_counts().rename({0: "Normal", 1: "Attack"}))


Train counts:
Binary_Label
Normal    378982
Attack     66879
Name: count, dtype: int64

Test counts:
Binary_Label
Normal    94746
Attack    16720
Name: count, dtype: int64


In [None]:
X_tr.shape

(445861, 31)

In [None]:
# Cell PRE-5 — Coerce mixed columns (numeric vs categorical) + drop almost-all-NaN
import pandas as pd
import numpy as np
import re

def coerce_mixed_columns(df_train, df_test, hex_ok=True, thresh=0.8):
    df_train = df_train.copy()
    df_test  = df_test.copy()
    num_cols, cat_cols = [], []
    for c in df_train.columns:
        s_tr, s_te = df_train[c], df_test[c]
        if s_tr.dtype == object or str(s_tr.dtype).startswith("string"):
            st_tr = s_tr.astype(str)
            if hex_ok:
                st_tr_num = pd.to_numeric(st_tr.str.replace(r"^\s*0x", "", regex=True), errors="coerce")
            else:
                st_tr_num = pd.to_numeric(st_tr, errors="coerce")
            if st_tr_num.notna().mean() >= thresh:
                st_te = s_te.astype(str)
                st_te_num = pd.to_numeric(st_te.str.replace(r"^\s*0x", "", regex=True), errors="coerce")
                df_train[c] = st_tr_num.astype("float32")
                df_test[c]  = st_te_num.astype("float32")
                num_cols.append(c)
            else:
                df_train[c] = st_tr.astype("string")
                df_test[c]  = s_te.astype("string")
                cat_cols.append(c)
        else:
            num_cols.append(c)
    return df_train, df_test, num_cols, cat_cols

X_tr, X_te, num_cols, cat_cols = coerce_mixed_columns(X_tr, X_te, hex_ok=True, thresh=0.8)

# Drop columns that are almost all NaN in TRAIN (mirror on TEST)
na_ratio = X_tr.isna().mean()
drop_almost = na_ratio[na_ratio > 0.98].index.tolist()
if drop_almost:
    X_tr.drop(columns=drop_almost, inplace=True, errors="ignore")
    X_te.drop(columns=[c for c in drop_almost if c in X_te.columns], inplace=True, errors="ignore")
    num_cols = [c for c in num_cols if c not in drop_almost]
    cat_cols = [c for c in cat_cols if c not in drop_almost]

print("بعد التوحيد → رقمية:", len(num_cols), "فئوية:", len(cat_cols))


بعد التوحيد → رقمية: 29 فئوية: 2


In [None]:
# Cell PRE-5b — Inspect columns with missing values
null_counts = X_tr.isna().sum()
null_counts = null_counts[null_counts > 0].sort_values(ascending=False)

print("Features with missing values (train):")
print(null_counts)

print("\nTotal columns with nulls:", len(null_counts))


Features with missing values (train):
wlan.seq             211062
radiotap.datarate    198785
dtype: int64

Total columns with nulls: 2


In [None]:
# Cell PRE-5c — Show percentage of missing values
null_ratio = (X_tr.isna().mean() * 100).sort_values(ascending=False)
null_ratio = null_ratio[null_ratio > 0]

print("Features with missing values (% of rows):")
print(null_ratio.round(2))


Features with missing values (% of rows):
wlan.seq             47.34
radiotap.datarate    44.58
dtype: float64


In [None]:
# Cell 6a — Impute numeric with median (fit on TRAIN)
from sklearn.impute import SimpleImputer
import pandas as pd

num_imputer = SimpleImputer(strategy="median")

if num_cols:
    X_tr_num = pd.DataFrame(
        num_imputer.fit_transform(X_tr[num_cols]),
        columns=num_cols, index=X_tr.index
    )
    X_te_num = pd.DataFrame(
        num_imputer.transform(X_te[num_cols]),
        columns=num_cols, index=X_te.index
    )
else:
    X_tr_num = pd.DataFrame(index=X_tr.index)
    X_te_num = pd.DataFrame(index=X_te.index)

print("Numeric imputed — shapes:", X_tr_num.shape, X_te_num.shape)


Numeric imputed — shapes: (445861, 29) (111466, 29)


In [None]:
# Cell 6b — Standardize numeric (z-score), fit on TRAIN
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler(with_mean=True, with_std=True)

if num_cols:
    X_tr_num_scaled = pd.DataFrame(
        scaler.fit_transform(X_tr_num),
        columns=num_cols, index=X_tr.index
    )
    X_te_num_scaled = pd.DataFrame(
        scaler.transform(X_te_num),
        columns=num_cols, index=X_te.index
    )
else:
    X_tr_num_scaled = X_tr_num.copy()
    X_te_num_scaled = X_te_num.copy()

print("Numeric standardized — shapes:", X_tr_num_scaled.shape, X_te_num_scaled.shape)


Numeric standardized — shapes: (445861, 29) (111466, 29)


In [None]:
# Cell 6c — Align categoricals across train/test (no encoding yet)
import pandas as pd

def _align_cats(tr: pd.Series, te: pd.Series, na_token="__NA__"):
    tr = tr.astype("string").fillna(na_token)
    te = te.astype("string").fillna(na_token)
    cats = pd.Index(sorted(pd.unique(pd.concat([tr, te], ignore_index=True))), name=tr.name)
    tr = tr.astype(pd.CategoricalDtype(categories=cats, ordered=False))
    te = te.astype(pd.CategoricalDtype(categories=cats, ordered=False))
    return tr, te

cat_tr_list, cat_te_list = [], []
for c in cat_cols:
    s_tr, s_te = _align_cats(X_tr[c], X_te[c])
    cat_tr_list.append(s_tr.to_frame())
    cat_te_list.append(s_te.to_frame())

X_tr_cat = pd.concat(cat_tr_list, axis=1) if cat_tr_list else pd.DataFrame(index=X_tr.index)
X_te_cat = pd.concat(cat_te_list, axis=1) if cat_te_list else pd.DataFrame(index=X_te.index)

print("Categoricals aligned — shapes:", X_tr_cat.shape, X_te_cat.shape)


Categoricals aligned — shapes: (445861, 2) (111466, 2)


In [None]:
# Cell 6d — Final preprocessed frames (numeric scaled + categorical aligned)
import pandas as pd

X_tr_pre = pd.concat([X_tr_num_scaled, X_tr_cat], axis=1)
X_te_pre = pd.concat([X_te_num_scaled, X_te_cat], axis=1)

print("Final shapes:")
print("X_tr_pre:", X_tr_pre.shape, "| X_te_pre:", X_te_pre.shape)
print("y_tr:", y_tr.shape, "| y_te:", y_te.shape)

print("\nDtypes summary (train):")
print(X_tr_pre.dtypes.value_counts())

# Quick NA check after imputation (numerics should be clean)
print("\nAny NaNs left?  train:", X_tr_pre.isna().any().any(), " | test:", X_te_pre.isna().any().any())

# Peek a few columns
print("\nSample columns:", list(X_tr_pre.columns[:]))


Final shapes:
X_tr_pre: (445861, 31) | X_te_pre: (111466, 31)
y_tr: (445861,) | y_te: (111466,)

Dtypes summary (train):
float64     29
category     1
category     1
Name: count, dtype: int64

Any NaNs left?  train: False  | test: False

Sample columns: ['frame.encap_type', 'frame.len', 'frame.time_delta', 'frame.time_delta_displayed', 'frame.time_relative', 'radiotap.channel.flags.cck', 'radiotap.channel.flags.ofdm', 'radiotap.channel.freq', 'radiotap.datarate', 'radiotap.dbm_antsignal', 'radiotap.length', 'radiotap.rxflags', 'wlan.duration', 'wlan.fc.ds', 'wlan.fc.frag', 'wlan.fc.order', 'wlan.fc.moredata', 'wlan.fc.protected', 'wlan.fc.pwrmgt', 'wlan.fc.type', 'wlan.fc.retry', 'wlan.fc.subtype', 'wlan_radio.duration', 'wlan.seq', 'wlan_radio.channel', 'wlan_radio.data_rate', 'wlan_radio.frequency', 'wlan_radio.signal_dbm', 'wlan_radio.phy', 'wlan.country_info.fnm', 'wlan.country_info.code']


In [None]:
# Show all column names in order
print("\nAll feature columns (train):")
for i, c in enumerate(X_tr_pre.columns, start=1):
    print(f"{i:3d}. {c}")

# Or, if you want as a DataFrame (easier to scroll in Jupyter):
pd.DataFrame({"Feature": X_tr_pre.columns.tolist()})



All feature columns (train):
  1. frame.encap_type
  2. frame.len
  3. frame.time_delta
  4. frame.time_delta_displayed
  5. frame.time_relative
  6. radiotap.channel.flags.cck
  7. radiotap.channel.flags.ofdm
  8. radiotap.channel.freq
  9. radiotap.datarate
 10. radiotap.dbm_antsignal
 11. radiotap.length
 12. radiotap.rxflags
 13. wlan.duration
 14. wlan.fc.ds
 15. wlan.fc.frag
 16. wlan.fc.order
 17. wlan.fc.moredata
 18. wlan.fc.protected
 19. wlan.fc.pwrmgt
 20. wlan.fc.type
 21. wlan.fc.retry
 22. wlan.fc.subtype
 23. wlan_radio.duration
 24. wlan.seq
 25. wlan_radio.channel
 26. wlan_radio.data_rate
 27. wlan_radio.frequency
 28. wlan_radio.signal_dbm
 29. wlan_radio.phy
 30. wlan.country_info.fnm
 31. wlan.country_info.code


Unnamed: 0,Feature
0,frame.encap_type
1,frame.len
2,frame.time_delta
3,frame.time_delta_displayed
4,frame.time_relative
5,radiotap.channel.flags.cck
6,radiotap.channel.flags.ofdm
7,radiotap.channel.freq
8,radiotap.datarate
9,radiotap.dbm_antsignal


In [None]:
# LGB-0 — Install (only if needed)
# !pip install lightgbm
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import (
    classification_report, confusion_matrix,
    roc_auc_score, average_precision_score, precision_recall_fscore_support
)


In [None]:
# LGB-1 — Build LightGBM Datasets (native categorical support)
# cat_cols should be the SAME categorical column names you had earlier.
# If you don't still have `cat_cols`, we can infer from dtypes:
cat_cols_in_pre = [c for c in X_tr_pre.columns if str(X_tr_pre[c].dtype) == "category"]

print("Categorical columns LightGBM will treat as categorical:", cat_cols_in_pre)

lgb_train = lgb.Dataset(
    X_tr_pre, label=y_tr,
    categorical_feature=cat_cols_in_pre, free_raw_data=False
)
lgb_valid = lgb.Dataset(
    X_te_pre, label=y_te,
    categorical_feature=cat_cols_in_pre, reference=lgb_train, free_raw_data=False
)


Categorical columns LightGBM will treat as categorical: ['wlan.country_info.fnm', 'wlan.country_info.code']


In [None]:
# LGB-2 — Train (fixed for new LightGBM API)
from lightgbm import early_stopping, log_evaluation

params = {
    "objective": "binary",
    "metric": ["auc", "binary_logloss"],
    "learning_rate": 0.05,
    "num_leaves": 64,
    "min_data_in_leaf": 50,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 1,
    "lambda_l2": 2.0,
    "verbosity": -1,
}

model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_valid],
    valid_names=["train", "valid"],
    num_boost_round=1000,
    callbacks=[
        early_stopping(stopping_rounds=75, verbose=True),
        log_evaluation(period=50)
    ]
)

print("Best iteration:", model.best_iteration)


Training until validation scores don't improve for 75 rounds
[50]	train's auc: 0.99999	train's binary_logloss: 0.0235319	valid's auc: 0.999992	valid's binary_logloss: 0.0233851
[100]	train's auc: 0.999993	train's binary_logloss: 0.00380283	valid's auc: 0.999994	valid's binary_logloss: 0.00375417
[150]	train's auc: 0.999994	train's binary_logloss: 0.00195624	valid's auc: 0.999994	valid's binary_logloss: 0.00203091
[200]	train's auc: 0.999996	train's binary_logloss: 0.00162455	valid's auc: 0.999995	valid's binary_logloss: 0.00182128
[250]	train's auc: 0.999997	train's binary_logloss: 0.00146958	valid's auc: 0.999995	valid's binary_logloss: 0.00179399
[300]	train's auc: 0.999998	train's binary_logloss: 0.00137063	valid's auc: 0.999994	valid's binary_logloss: 0.00181892
Early stopping, best iteration is:
[245]	train's auc: 0.999997	train's binary_logloss: 0.00148164	valid's auc: 0.999995	valid's binary_logloss: 0.00179493
Best iteration: 245


In [None]:
# LGB-3 — Evaluate @ default threshold 0.50
import numpy as np
from sklearn.metrics import (
    classification_report, confusion_matrix,
    roc_auc_score, average_precision_score
)

best_iter = model.best_iteration if getattr(model, "best_iteration", None) else model.current_iteration()
y_prob = model.predict(X_te_pre, num_iteration=best_iter)
y_pred = (y_prob >= 0.50).astype(int)

print("\nConfusion matrix (thr=0.50):")
print(confusion_matrix(y_te, y_pred))

print("\nClassification report (thr=0.50):")
print(classification_report(y_te, y_pred, digits=4))

roc = roc_auc_score(y_te, y_prob)
pr_auc = average_precision_score(y_te, y_prob)
print(f"ROC-AUC: {roc:.4f} | PR-AUC: {pr_auc:.4f}")



Confusion matrix (thr=0.50):
[[94681    65]
 [   12 16708]]

Classification report (thr=0.50):
              precision    recall  f1-score   support

           0     0.9999    0.9993    0.9996     94746
           1     0.9961    0.9993    0.9977     16720

    accuracy                         0.9993    111466
   macro avg     0.9980    0.9993    0.9986    111466
weighted avg     0.9993    0.9993    0.9993    111466

ROC-AUC: 1.0000 | PR-AUC: 1.0000


In [None]:
# LGB-4 — Threshold sweep to maximize F1 for Attack (class=1)
from sklearn.metrics import precision_recall_fscore_support

best_thr, best_f1 = 0.5, -1.0
thr_grid = np.linspace(0.10, 0.90, 33)
for thr in thr_grid:
    pred = (y_prob >= thr).astype(int)
    _, _, f1, _ = precision_recall_fscore_support(y_te, pred, average="binary", zero_division=0)
    if f1 > best_f1:
        best_f1, best_thr = f1, thr

print(f"Best F1 for Attack ≈ {best_f1:.4f} at threshold = {best_thr:.2f}")

# Metrics at the optimal threshold
y_pred_opt = (y_prob >= best_thr).astype(int)
print("\nConfusion matrix (optimal threshold):")
print(confusion_matrix(y_te, y_pred_opt))
print("\nClassification report (optimal threshold):")
print(classification_report(y_te, y_pred_opt, digits=4))


Best F1 for Attack ≈ 0.9978 at threshold = 0.40

Confusion matrix (optimal threshold):
[[94677    69]
 [    6 16714]]

Classification report (optimal threshold):
              precision    recall  f1-score   support

           0     0.9999    0.9993    0.9996     94746
           1     0.9959    0.9996    0.9978     16720

    accuracy                         0.9993    111466
   macro avg     0.9979    0.9995    0.9987    111466
weighted avg     0.9993    0.9993    0.9993    111466



In [None]:
# LGB-5 — Top features (by gain)
importances = model.feature_importance(importance_type="gain")
feat_names = X_tr_pre.columns.tolist()
topk = max(30, len(feat_names))
fi_sorted = sorted(zip(feat_names, importances), key=lambda x: x[1], reverse=True)[:topk]

print(f"\nTop {topk} features (by gain):")
for i, (f, imp) in enumerate(fi_sorted, 1):
    print(f"{i:2d}. {f:40s} {imp:.2f}")



Top 30 features (by gain):
 1. frame.time_relative                      1294698.46
 2. wlan_radio.signal_dbm                    476506.52
 3. wlan_radio.duration                      391373.77
 4. radiotap.dbm_antsignal                   289065.31
 5. radiotap.channel.freq                    275291.66
 6. frame.len                                111145.24
 7. wlan.fc.subtype                          65087.00
 8. wlan.duration                            61635.82
 9. wlan.fc.protected                        35384.21
10. wlan.fc.type                             18731.22
11. wlan_radio.channel                       16019.36
12. wlan_radio.data_rate                     15168.32
13. wlan.seq                                 9864.91
14. radiotap.datarate                        7848.58
15. wlan_radio.phy                           6991.60
16. frame.time_delta                         3587.90
17. radiotap.length                          3351.10
18. wlan.country_info.fnm                    2001.69

In [None]:
best_thr

np.float64(0.4)

In [None]:
# LGB-6 — Save bundle for inference (preprocessors + model)
from joblib import dump

cat_cols_in_pre = [c for c in X_tr_pre.columns if str(X_tr_pre[c].dtype) == "category"]
bundle = {
    "imputer": num_imputer,
    "scaler": scaler,
    "num_cols": num_cols,
    "cat_cols": cat_cols_in_pre,
    "feature_order": X_tr_pre.columns.tolist(),
    "best_threshold": float(best_thr),
    "model": model,
    "best_iteration": int(best_iter),
}

dump(bundle, "binary_ids_lightgbm_bundle.joblib")
print("Saved -> binary_ids_lightgbm_bundle.joblib")


Saved -> binary_ids_lightgbm_bundle.joblib
