In [None]:
!pip -q install lightgbm==4.3.0

import json, re, numpy as np, pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.utils.class_weight import compute_class_weight
from lightgbm import LGBMClassifier
import joblib

# Let pandas render up to 200 columns when a DataFrame is displayed.
pd.set_option("display.max_columns", 200)

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Location of the merged capture CSV and the name of its ground-truth column.
target_col = "Label"
file_path = "/content/merged_shuffled_20250822_185836.csv"

# Read a 5000-row sample only; it is used here just to learn the column names.
head_df = pd.read_csv(file_path, nrows=5000)
cols = list(head_df.columns)
if target_col not in cols:
    raise AssertionError("Label not found!")

# A feature column is only considered when its name starts with one of these prefixes.
allow_prefixes = ("frame.", "radiotap.", "wlan.", "wlan_radio.")

# Column names rejected outright, grouped by the kind of field they name.
block_exact = {
    # frame counters / capture times
    "frame.number", "frame.time", "frame.time_epoch",
    # radiotap timing fields
    "radiotap.mactime", "radiotap.present.tsft", "radiotap.timestamp.ts",
    # addresses, SSID and tag fields
    "wlan.bssid", "wlan.da", "wlan.ra", "wlan.sa", "wlan.ta",
    "wlan.ssid", "wlan.tag", "wlan.tag.length",
    # key-related fields
    "wlan.analysis.kck", "wlan.analysis.kek",
    "wlan.rsn.ie.gtk.key", "wlan.rsn.ie.igtk.key", "wlan.rsn.ie.pmkid",
    # fixed timestamp and EAPOL key-descriptor fields
    "wlan.fixed.timestamp",
    "wlan_rsna_eapol.keydes.msgnr",
    "wlan_rsna_eapol.keydes.data",
    "wlan_rsna_eapol.keydes.data_len",
    "wlan_rsna_eapol.keydes.key_info.key_mic",
    "wlan_rsna_eapol.keydes.nonce",
}

# Regexes searched against the column name; any hit rejects the column.
block_patterns = [
    # whole-word address / identifier tokens
    r"(?i)\b(bssid|ssid|mac|addr|oui|vendor|station|ra|ta|sa|da)\b",
    # names that begin with a higher-layer protocol prefix
    r"(?i)^(ip\.|ipv6\.|arp|tcp\.|udp\.|dns|http|json|ssh|tls|smb2?|nbns|nbss|ldap|dhcp|mdns|ssdp)\b",
    # payload and key-material substrings
    r"(?i)(payload|data\.data|llc|eapol|key|nonce|pmkid|gtk|igtk|kck|kek)",
    # capture-provenance substrings
    r"(?i)(pcap|source_file|capture|interface)",
    # timestamp substrings
    r"(?i)(start_tsf|end_tsf|timestamp)",
]

def allowed(col: str) -> bool:
    """Decide whether a CSV column is kept.

    The target column is always kept. Any other column must start with one of
    `allow_prefixes`, must not be listed in `block_exact`, and must not match
    any regex in `block_patterns`. Apart from the target-column comparison,
    every check is made on the lower-cased name, so the decision does not
    depend on how the column happens to be capitalised.

    Args:
        col: Column name as it appears in the CSV header.

    Returns:
        True if the column should be kept, False otherwise.
    """
    if col == target_col:
        return True
    low = col.lower()
    if not low.startswith(allow_prefixes):
        return False
    # Compare the lower-cased name: the block_exact entries are lower-case, and the
    # prefix and regex checks are already case-insensitive.
    if low in block_exact:
        return False
    return not any(re.search(pat, low) for pat in block_patterns)

# Run every header name through the filter, preserving the CSV column order.
keep_cols = list(filter(allowed, cols))
# Make sure the label column is in the selection.
if target_col not in keep_cols:
    keep_cols += [target_col]

print("عدد الأعمدة قبل:", len(cols), "| بعد:", len(keep_cols))
print("أول 30 عمود:", keep_cols[:30])

NameError: name 'pd' is not defined

In [None]:
from sklearn.model_selection import train_test_split

# Load only the selected columns. low_memory=False makes pandas infer each column's
# dtype from the whole file rather than chunk by chunk; chunked inference is what
# produces mixed-type object columns and the warning this read otherwise emits.
df = pd.read_csv(file_path, usecols=keep_cols, low_memory=False)
# Treat the "?" placeholder as a missing value.
df = df.replace("?", np.nan)

print("شكل البيانات:", df.shape)
print("عدد الأعمدة:", len(df.columns))
# A bare `df.head(3)` in the middle of a cell is never rendered, so display it explicitly.
display(df.head(3))

# Binary target: 0 for rows labelled "normal" (case-insensitive), 1 for everything else.
y_bin = (df[target_col].astype(str).str.lower() != "normal").astype(int)
X = df.drop(columns=[target_col])

# Stratified 80/20 split so both parts keep the same class ratio.
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y_bin, test_size=0.2, stratify=y_bin, random_state=42
)

print("Train size:", X_tr.shape, "Test size:", X_te.shape)

# Dtype-based inventory of the raw columns (before any string-to-number coercion).
cat_cols = X_tr.select_dtypes(include=["object","category"]).columns.tolist()
num_cols = X_tr.select_dtypes(include=[np.number]).columns.tolist()
print("أعمدة رقمية:", len(num_cols), "| فئوية:", len(cat_cols))

  df = pd.read_csv(file_path, usecols=keep_cols)


شكل البيانات: (578678, 39)
عدد الأعمدة: 39
Train size: (462942, 38) Test size: (115736, 38)
أعمدة رقمية: 34 | فئوية: 4


In [None]:
# Type coercion: turn numeric-looking text columns into real numbers before encoding
_HEX_TOKEN = r"^[+-]?0[xX][0-9a-fA-F]+$"


def _hex_to_float(token):
    """Convert one `0x...` token to a float; NaN if it is too large to represent."""
    try:
        return float(int(token, 16))
    except OverflowError:
        return float("nan")


def _parse_numeric(series, hex_ok):
    """Parse a text Series into float64, turning unparseable entries into NaN.

    With `hex_ok`, tokens of the form `0x<hex digits>` are interpreted in base 16.
    Removing the `0x` prefix and parsing the rest as decimal would read `0x10` as
    10, `0x1e5` as 100000 (scientific notation) and `0xff` as NaN.
    """
    text = series.astype(str).str.strip()
    values = pd.to_numeric(text, errors="coerce").astype("float64").to_numpy(copy=True)
    if hex_ok:
        hex_mask = text.str.match(_HEX_TOKEN, na=False).astype(bool).to_numpy()
        if hex_mask.any():
            values[hex_mask] = [_hex_to_float(tok) for tok in text.to_numpy()[hex_mask]]
    return pd.Series(values, index=series.index)


def coerce_mixed_columns(df_train, df_test, hex_ok=True, thresh=0.8):
    """Split columns into numeric and categorical, converting numeric-looking text.

    Every object/string column of `df_train` is parsed as numbers. If at least
    `thresh` of the training entries parse, the column becomes float32 in both
    frames (entries that do not parse become NaN); otherwise it is cast to the
    pandas "string" dtype in both frames and reported as categorical. Columns of
    any other dtype are left untouched and reported as numeric. The decision is
    made on the training frame only and then applied identically to the test frame.

    Args:
        df_train: Training features; not modified (a copy is returned).
        df_test: Test features with the same columns; not modified.
        hex_ok: Interpret `0x...` tokens as base-16 integers. Applied to both frames.
        thresh: Minimum fraction of parseable training entries for a text column
            to be treated as numeric.

    Returns:
        (df_train, df_test, num_cols, cat_cols): converted copies of the two
        frames plus the numeric and categorical column names, in column order.
    """
    df_train = df_train.copy()
    df_test = df_test.copy()
    num_cols, cat_cols = [], []
    for c in df_train.columns:
        s_tr, s_te = df_train[c], df_test[c]
        is_textual = pd.api.types.is_object_dtype(s_tr.dtype) or isinstance(s_tr.dtype, pd.StringDtype)
        if not is_textual:
            num_cols.append(c)
            continue
        tr_num = _parse_numeric(s_tr, hex_ok)
        if tr_num.notna().mean() >= thresh:
            df_train[c] = tr_num.astype("float32")
            # Same parser and same hex_ok setting as for the training frame.
            df_test[c] = _parse_numeric(s_te, hex_ok).astype("float32")
            num_cols.append(c)
        else:
            df_train[c] = s_tr.astype("string")
            df_test[c] = s_te.astype("string")
            cat_cols.append(c)
    return df_train, df_test, num_cols, cat_cols

X_tr, X_te, num_cols, cat_cols = coerce_mixed_columns(X_tr, X_te, hex_ok=True, thresh=0.8)

# Discard features that are missing in more than 98% of the training rows.
na_ratio = X_tr.isna().mean()
drop_almost = [name for name, frac in na_ratio.items() if frac > 0.98]
if drop_almost:
    X_tr = X_tr.drop(columns=drop_almost, errors="ignore")
    X_te = X_te.drop(columns=drop_almost, errors="ignore")
    # Keep the column inventories in sync with the frames.
    num_cols = [name for name in num_cols if name not in drop_almost]
    cat_cols = [name for name in cat_cols if name not in drop_almost]
print("بعد التوحيد → رقمية:", len(num_cols), "فئوية:", len(cat_cols))

# Preprocessing: impute numeric columns; impute and ordinal-encode categorical ones
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder

# Numeric branch: fill gaps with the training median.
num_tf = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])

# Categorical branch: fill gaps with the most frequent value, then map each
# category to an integer; categories unseen during fit are encoded as -1.
cat_tf = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ordenc", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
])

# The categorical branch is only added when there is at least one such column;
# any column not named in either list is dropped.
prep = ColumnTransformer(
    transformers=[
        ("num", num_tf, num_cols),
        *([("cat", cat_tf, cat_cols)] if cat_cols else []),
    ],
    remainder="drop",
)

# Fit on the training split only, then apply the fitted transform to both splits.
prep.fit(X_tr)
X_tr_t, X_te_t = prep.transform(X_tr), prep.transform(X_te)

# LightGBM with early stopping
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import lightgbm as lgb
from lightgbm import LGBMClassifier

# Early stopping needs a monitoring set. Using the test split for that would pick
# the number of trees on the very data later reported as the held-out score, so a
# stratified 10% validation slice is carved out of the training data instead.
# (The preprocessor was fitted on all training rows, including this slice; no
# test rows are involved.)
X_fit_t, X_val_t, y_fit, y_val = train_test_split(
    X_tr_t, y_tr, test_size=0.1, stratify=y_tr, random_state=42
)

# "balanced" class weights, expanded to one weight per fitted row.
classes = np.unique(y_fit)
cw = compute_class_weight(class_weight="balanced", classes=classes, y=y_fit)
class_to_w = dict(zip(classes, cw))
# `classes` is sorted, so searchsorted maps every label to the index of its weight.
sample_weight = cw[np.searchsorted(classes, np.asarray(y_fit))]

lgb_bin = LGBMClassifier(
    objective="binary",
    n_estimators=5000,        # upper bound; early stopping decides the actual count
    learning_rate=0.05,
    num_leaves=63,
    max_depth=-1,
    min_child_samples=50,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
    n_jobs=-1
)

# Stop once validation AUC has not improved for 150 consecutive rounds.
lgb_bin.fit(
    X_fit_t, y_fit,
    sample_weight=sample_weight,
    eval_set=[(X_val_t, y_val)],
    eval_metric="auc",
    callbacks=[lgb.early_stopping(150, verbose=False)]
)

from sklearn.metrics import classification_report, confusion_matrix, f1_score

# Hard class predictions for the test split.
y_pred = lgb_bin.predict(X_te_t)

cm = confusion_matrix(y_te, y_pred)
report = classification_report(
    y_te, y_pred, target_names=["Normal","Attack"], digits=4
)
macro_f1 = f1_score(y_te, y_pred, average="macro")

# Flatten the 2x2 matrix row by row: (true 0 -> pred 0, pred 1), (true 1 -> pred 0, pred 1).
tn, fp, fn, tp = cm.reshape(-1)
# The 1e-9 term keeps the ratios defined when a denominator would be zero.
fpr = fp / (fp + tn + 1e-9)
recall_attack = tp / (tp + fn + 1e-9)

print(report)
print("Confusion matrix:\n", cm)
print(f"Macro-F1={macro_f1:.4f} | FPR(Normal)={fpr:.4f} | Recall(Attack)={recall_attack:.4f}")

import json
from pathlib import Path

import joblib

# Output directory for the stage-1 artifacts (created if it does not exist yet).
art = Path("/content/artifacts")
art.mkdir(exist_ok=True)

# Persist the fitted preprocessor and the trained classifier.
joblib.dump(prep, art / "preprocessor_stage1.joblib")
joblib.dump(lgb_bin, art / "stage1_lgbm_binary.joblib")

# Record the feature columns of the training frame as a JSON list.
with open(art / "feature_list_stage1.json", "w") as f:
    f.write(json.dumps(list(X_tr.columns), indent=2))

print("Saved to:", art)

بعد التوحيد → رقمية: 30 فئوية: 0




[LightGBM] [Info] Number of positive: 83960, number of negative: 378982
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.046891 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2001
[LightGBM] [Info] Number of data points in the train set: 462942, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




              precision    recall  f1-score   support

      Normal     0.9996    0.9988    0.9992     94746
      Attack     0.9946    0.9983    0.9965     20990

    accuracy                         0.9987    115736
   macro avg     0.9971    0.9986    0.9978    115736
weighted avg     0.9987    0.9987    0.9987    115736

Confusion matrix:
 [[94633   113]
 [   35 20955]]
Macro-F1=0.9978 | FPR(Normal)=0.0012 | Recall(Attack)=0.9983
Saved to: /content/artifacts
