In [5]:
import pandas as pd

sample = "D:\Canada\Subjects\Semester -2\AIDI-2005-02 CAPSTONE TERM ll\Bot_detector\data\processed\Friday-16-02-2018_TrafficForML_CICFlowMeter.parquet"
df = pd.read_parquet(sample, engine="pyarrow")
print(f"Loaded {len(df):,} rows × {df.shape[1]} columns for {sample}")


Loaded 1,048,575 rows × 80 columns for D:\Canada\Subjects\Semester -2\AIDI-2005-02 CAPSTONE TERM ll\Bot_detector\data\processed\Friday-16-02-2018_TrafficForML_CICFlowMeter.parquet


Step 1: Exploratory Data Analysis (EDA)
Goals:

Understand how many classes are present in each file.

Identify which files contain both Benign and attack labels.

See if there’s class imbalance.



In [6]:
import pandas as pd
import os

data_dir = "D:/Canada/Subjects/Semester -2/AIDI-2005-02 CAPSTONE TERM ll/Bot_detector/data/processed"
parquet_files = [f for f in os.listdir(data_dir) if f.endswith(".parquet")]

for pf in parquet_files:
    df = pd.read_parquet(os.path.join(data_dir, pf))
    print(f"File: {pf}")
    print(df['Label'].value_counts())
    print("-" * 40)


File: Friday-02-03-2018_TrafficForML_CICFlowMeter.parquet
Label
Benign    762384
Attack    286191
Name: count, dtype: int64
----------------------------------------
File: Friday-16-02-2018_TrafficForML_CICFlowMeter.parquet
Label
Attack    601803
Benign    446772
Name: count, dtype: int64
----------------------------------------
File: Friday-23-02-2018_TrafficForML_CICFlowMeter.parquet
Label
Benign    1048009
Attack        566
Name: count, dtype: int64
----------------------------------------
File: Thuesday-20-02-2018_TrafficForML_CICFlowMeter.parquet
Label
Benign    7372557
Attack     576191
Name: count, dtype: int64
----------------------------------------
File: Thursday-01-03-2018_TrafficForML_CICFlowMeter.parquet
Label
Benign    238037
Attack     93088
Name: count, dtype: int64
----------------------------------------
File: Thursday-15-02-2018_TrafficForML_CICFlowMeter.parquet
Label
Benign    996077
Attack     52498
Name: count, dtype: int64
----------------------------------------


In [None]:
import os
import pandas as pd

# Path to Parquet files
parquet_dir = r"D:/Canada/Subjects/Semester -2/AIDI-2005-02 CAPSTONE TERM ll/Bot_detector/data/processed"
parquet_files = [f for f in os.listdir(parquet_dir) if f.endswith(".parquet")]

print(f"Found {len(parquet_files)} Parquet files.\n")

for pf in sorted(parquet_files):
    fpath = os.path.join(parquet_dir, pf)
    df = pd.read_parquet(fpath)
    print(f"File: {pf}")
    print(f"  Shape: {df.shape[0]:,} rows × {df.shape[1]} columns")
    print(f"  Label counts:\n{df['Label'].value_counts(dropna=False)}")
    missing = df.isnull().sum()
    n_missing = missing[missing > 0]
    if not n_missing.empty:
        print(f"  Columns with missing values:\n{n_missing}")
    else:
        print("  No missing values found.")
    print("-" * 60)


Found 10 Parquet files.

File: Friday-02-03-2018_TrafficForML_CICFlowMeter.parquet
  Shape: 1,048,575 rows × 80 columns
  Label counts:
Label
Benign    762384
Bot       286191
Name: count, dtype: int64
  Columns with missing values:
Flow Byts/s    2558
dtype: int64
------------------------------------------------------------
File: Friday-16-02-2018_TrafficForML_CICFlowMeter.parquet
  Shape: 1,048,575 rows × 80 columns
  Label counts:
Label
DoS attacks-Hulk            461912
Benign                      446772
DoS attacks-SlowHTTPTest    139890
Label                            1
Name: count, dtype: int64
  Columns with missing values:
Dst Port         1
Protocol         1
Flow Duration    1
Tot Fwd Pkts     1
Tot Bwd Pkts     1
                ..
Active Min       1
Idle Mean        1
Idle Std         1
Idle Max         1
Idle Min         1
Length: 78, dtype: int64
------------------------------------------------------------
File: Friday-23-02-2018_TrafficForML_CICFlowMeter.parquet
  Shap

In [None]:
import os
import pandas as pd
import numpy as np

# Paths
PARQUET_DIR = 'D:/Canada/Subjects/Semester -2/AIDI-2005-02 CAPSTONE TERM ll/Bot_detector/data/processed'
CLEANED_DIR = 'D:/Canada/Subjects/Semester -2/AIDI-2005-02 CAPSTONE TERM ll/Bot_detector/data/cleaned'
os.makedirs(CLEANED_DIR, exist_ok=True)

# Columns to always drop if present
drop_cols = ['Timestamp', 'Flow ID', 'Src IP', 'Dst IP', 'Src Port', 'Label']  # 'Label' dropped after relabeling

# Harmonized set of feature columns 
feature_cols = None

for fname in sorted(os.listdir(PARQUET_DIR)):
    if not fname.endswith('.parquet'):
        continue
    fpath = os.path.join(PARQUET_DIR, fname)
    print(f"Processing {fname}...")
    df = pd.read_parquet(fpath)
    
    # 1. Remove rows 
    df = df[df['Label'].notna()]
    df = df[df['Label'] != 'Label']
    
    # 2. Standardize labels: Benign=0, attack=1
    df['Label'] = df['Label'].apply(lambda x: 0 if str(x).strip().lower() == 'benign' else 1)
    
    # 3. Drop unneeded columns (keep 'Label')
    cols_to_drop = [col for col in drop_cols if col != 'Label' and col in df.columns]
    if cols_to_drop:
        df = df.drop(columns=cols_to_drop, errors='ignore')
    
    # 4. Set/verify harmonized columns
    if feature_cols is None:
        feature_cols = [col for col in df.columns if col != 'Label']
    else:
        missing = [c for c in feature_cols if c not in df.columns]
        extra = [c for c in df.columns if c not in feature_cols and c != 'Label']
        # Add missing with NaN, drop extras
        if missing:
            for m in missing:
                df[m] = np.nan
        if extra:
            df = df.drop(columns=extra, errors='ignore')
        df = df[feature_cols + ['Label']]
    
    # 5. Convert all feature columns to float
    for col in feature_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    
    # 6. Fill missing values
    df[feature_cols] = df[feature_cols].fillna(df[feature_cols].median())
    
    # 7. Save cleaned parquet
    out_path = os.path.join(CLEANED_DIR, fname)
    df.to_parquet(out_path, index=False)
    print(f"Saved cleaned file: {out_path} (Rows: {len(df)})")


print("All files cleaned and harmonized for binary classification!")


Processing Friday-02-03-2018_TrafficForML_CICFlowMeter.parquet...
Saved cleaned file: D:/Canada/Subjects/Semester -2/AIDI-2005-02 CAPSTONE TERM ll/Bot_detector/data/cleaned\Friday-02-03-2018_TrafficForML_CICFlowMeter.parquet (Rows: 1048575)
Processing Friday-16-02-2018_TrafficForML_CICFlowMeter.parquet...
Saved cleaned file: D:/Canada/Subjects/Semester -2/AIDI-2005-02 CAPSTONE TERM ll/Bot_detector/data/cleaned\Friday-16-02-2018_TrafficForML_CICFlowMeter.parquet (Rows: 1048574)
Processing Friday-23-02-2018_TrafficForML_CICFlowMeter.parquet...
Saved cleaned file: D:/Canada/Subjects/Semester -2/AIDI-2005-02 CAPSTONE TERM ll/Bot_detector/data/cleaned\Friday-23-02-2018_TrafficForML_CICFlowMeter.parquet (Rows: 1048575)
Processing Thuesday-20-02-2018_TrafficForML_CICFlowMeter.parquet...
Saved cleaned file: D:/Canada/Subjects/Semester -2/AIDI-2005-02 CAPSTONE TERM ll/Bot_detector/data/cleaned\Thuesday-20-02-2018_TrafficForML_CICFlowMeter.parquet (Rows: 7948748)
Processing Thursday-01-03-2018_T

In [24]:
import pandas as pd
import glob

cleaned_dir = "D:/Canada/Subjects/Semester -2/AIDI-2005-02 CAPSTONE TERM ll/Bot_detector/data/cleaned"
file_list = sorted(glob.glob(f"{cleaned_dir}/*.parquet"))

# Aggregate for overall class balance and missingness
total_counts = pd.Series(dtype=int)
total_missing = pd.Series(dtype=int)

for path in file_list:
    df = pd.read_parquet(path)
    print(f"{path.split('/')[-1]} - Label counts:\n{df['Label'].value_counts()}\n")
    total_counts = total_counts.add(df['Label'].value_counts(), fill_value=0)
    missing = df.isnull().sum()
    print(f"Missing values:\n{missing[missing>0]}\n")
    total_missing = total_missing.add(missing, fill_value=0)

print("=== OVERALL LABEL COUNTS ===")
print(total_counts)
print("\n=== OVERALL MISSING VALUES (any feature) ===")
print(total_missing[total_missing > 0])


cleaned\Friday-02-03-2018_TrafficForML_CICFlowMeter.parquet - Label counts:
Label
0    762384
1    286191
Name: count, dtype: int64

Missing values:
Series([], dtype: int64)

cleaned\Friday-16-02-2018_TrafficForML_CICFlowMeter.parquet - Label counts:
Label
1    601802
0    446772
Name: count, dtype: int64

Missing values:
Series([], dtype: int64)

cleaned\Friday-23-02-2018_TrafficForML_CICFlowMeter.parquet - Label counts:
Label
0    1048009
1        566
Name: count, dtype: int64

Missing values:
Series([], dtype: int64)

cleaned\Thuesday-20-02-2018_TrafficForML_CICFlowMeter.parquet - Label counts:
Label
0    7372557
1     576191
Name: count, dtype: int64

Missing values:
Series([], dtype: int64)

cleaned\Thursday-01-03-2018_TrafficForML_CICFlowMeter.parquet - Label counts:
Label
0    238037
1     93063
Name: count, dtype: int64

Missing values:
Series([], dtype: int64)

cleaned\Thursday-15-02-2018_TrafficForML_CICFlowMeter.parquet - Label counts:
Label
0    996077
1     52498
Name: cou

In [None]:
import pyarrow.parquet as pq
import lightgbm as lgb
import numpy as np
import os
from glob import glob
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# === CONFIGURATION ===
CLEANED_DIR = r"D:\Canada\Subjects\Semester -2\AIDI-2005-02 CAPSTONE TERM ll\Bot_detector\data\cleaned"
MODEL_PATH = "final_lightgbm.txt"
BATCH_SIZE = 100_000

# 1. Auto-detect all cleaned parquet files, split train/test (last one as test for demo)
file_list = sorted(glob(os.path.join(CLEANED_DIR, "*.parquet")))
train_files = file_list[:-1]
test_file = file_list[-1]

print("Train files:", train_files)
print("Test file:", test_file)

# === CALCULATE scale_pos_weight ===
print("\nCalculating scale_pos_weight...")
total_0, total_1 = 0, 0
for f in train_files:
    parquet = pq.ParquetFile(f)
    for batch in parquet.iter_batches(batch_size=BATCH_SIZE, columns=['Label']):
        arr = batch.to_pandas()['Label'].value_counts()
        total_0 += arr.get(0, 0)
        total_1 += arr.get(1, 0)

scale_pos_weight = total_0 / total_1
print(f"scale_pos_weight: {scale_pos_weight:.3f}")

# 2. Pick feature columns (from any file)
sample = pq.read_table(train_files[0], columns=None).to_pandas().iloc[:1]
feature_cols = [col for col in sample.columns if col != 'Label']

# 3. LightGBM parameters
params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'binary_logloss',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'verbose': -1,
    'scale_pos_weight': scale_pos_weight
}

# === TRAINING ===
booster = None
for file_path in train_files:
    print(f"\nProcessing: {file_path}")
    parquet_file = pq.ParquetFile(file_path)
    batch_idx = 1
    for batch in parquet_file.iter_batches(batch_size=BATCH_SIZE):
        df = batch.to_pandas()
        X = df[feature_cols].astype(np.float32)
        y = df['Label'].astype(np.int32)
        lgb_train = lgb.Dataset(X, y, free_raw_data=True)
        if booster is None:
            print("  Training new model …")
            booster = lgb.train(params, lgb_train, num_boost_round=100)
        else:
            print(f"  Continuing training … (batch {batch_idx})")
            booster = lgb.train(params, lgb_train, num_boost_round=100, init_model=booster)
        del df, X, y, lgb_train  # Free memory
        batch_idx += 1

booster.save_model(MODEL_PATH)
print("\nModel saved as:", MODEL_PATH)

# === EVALUATION ===
print(f"\nEvaluating on: {test_file}")
test_parquet = pq.ParquetFile(test_file)
y_true_all, y_pred_prob = [], []
for batch in test_parquet.iter_batches(batch_size=BATCH_SIZE):
    df = batch.to_pandas()
    X = df[feature_cols].astype(np.float32)
    y = df['Label'].astype(np.int32)
    y_prob = booster.predict(X)
    y_true_all.extend(y.tolist())
    y_pred_prob.extend(y_prob.tolist())
    del df, X, y, y_prob

# --- Threshold sweep
thresholds = [0.5, 0.4, 0.3, 0.2]
for threshold in thresholds:
    y_pred = (np.array(y_pred_prob) > threshold).astype(int)
    print(f"\n=== Results for threshold={threshold} ===")
    print("Accuracy:", accuracy_score(y_true_all, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_true_all, y_pred))
    print("Classification Report:\n", classification_report(y_true_all, y_pred, digits=4))

print("\nDone! The model and thresholded evaluation results are printed.")


Train files: ['D:\\Canada\\Subjects\\Semester -2\\AIDI-2005-02 CAPSTONE TERM ll\\Bot_detector\\data\\cleaned\\Friday-02-03-2018_TrafficForML_CICFlowMeter.parquet', 'D:\\Canada\\Subjects\\Semester -2\\AIDI-2005-02 CAPSTONE TERM ll\\Bot_detector\\data\\cleaned\\Friday-16-02-2018_TrafficForML_CICFlowMeter.parquet', 'D:\\Canada\\Subjects\\Semester -2\\AIDI-2005-02 CAPSTONE TERM ll\\Bot_detector\\data\\cleaned\\Friday-23-02-2018_TrafficForML_CICFlowMeter.parquet', 'D:\\Canada\\Subjects\\Semester -2\\AIDI-2005-02 CAPSTONE TERM ll\\Bot_detector\\data\\cleaned\\Thuesday-20-02-2018_TrafficForML_CICFlowMeter.parquet', 'D:\\Canada\\Subjects\\Semester -2\\AIDI-2005-02 CAPSTONE TERM ll\\Bot_detector\\data\\cleaned\\Thursday-01-03-2018_TrafficForML_CICFlowMeter.parquet', 'D:\\Canada\\Subjects\\Semester -2\\AIDI-2005-02 CAPSTONE TERM ll\\Bot_detector\\data\\cleaned\\Thursday-15-02-2018_TrafficForML_CICFlowMeter.parquet', 'D:\\Canada\\Subjects\\Semester -2\\AIDI-2005-02 CAPSTONE TERM ll\\Bot_detector\

### Updated Lightbgm

In [7]:
import pyarrow.parquet as pq
import lightgbm as lgb
import numpy as np
import os
from glob import glob
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# === CONFIGURATION ===
CLEANED_DIR = r"D:\Canada\Subjects\Semester -2\AIDI-2005-02 CAPSTONE TERM ll\Bot_detector\data\cleaned"
MODEL_PATH = "final_lightgbm.txt"
BATCH_SIZE = 100_000

file_list = sorted(glob(os.path.join(CLEANED_DIR, "*.parquet")))
if len(file_list) < 3:
    raise Exception("Need at least 3 files for train/val/test split!")
train_files = file_list[:-2]
val_file = file_list[-2]
test_file = file_list[-1]

print("Train files:", train_files)
print("Validation file:", val_file)
print("Test file:", test_file)

print("\nCalculating scale_pos_weight from train files...")
total_0, total_1 = 0, 0
for f in train_files:
    parquet = pq.ParquetFile(f)
    for batch in parquet.iter_batches(batch_size=BATCH_SIZE, columns=['Label']):
        arr = batch.to_pandas()['Label'].value_counts()
        total_0 += arr.get(0, 0)
        total_1 += arr.get(1, 0)

scale_pos_weight = total_0 / total_1
print(f"scale_pos_weight: {scale_pos_weight:.3f}")

sample = pq.read_table(train_files[0], columns=None).to_pandas().iloc[:1]
feature_cols = [col for col in sample.columns if col != 'Label']

params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'auc',
    'learning_rate': 0.02,
    'num_leaves': 64,
    'max_depth': 7,
    'min_child_samples': 40,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'scale_pos_weight': scale_pos_weight,
    'verbose': -1
}

print("\nStarting batch-wise training with early stopping...")
val_parquet = pq.ParquetFile(val_file)
val_batch = next(val_parquet.iter_batches(batch_size=BATCH_SIZE))
df_val = val_batch.to_pandas()
X_val = df_val[feature_cols].astype(np.float32)
y_val = df_val['Label'].astype(np.int32)
lgb_val = lgb.Dataset(X_val, y_val, free_raw_data=False)  # <--- Fix here

booster = None
for file_path in train_files:
    print(f"\nProcessing: {file_path}")
    parquet_file = pq.ParquetFile(file_path)
    batch_idx = 1
    for batch in parquet_file.iter_batches(batch_size=BATCH_SIZE):
        df = batch.to_pandas()
        X = df[feature_cols].astype(np.float32)
        y = df['Label'].astype(np.int32)
        lgb_train = lgb.Dataset(X, y, free_raw_data=False)  # <--- Fix here
        if booster is None:
            print("  Training new model with early stopping …")
            booster = lgb.train(
                params,
                lgb_train,
                num_boost_round=500,
                valid_sets=[lgb_val],
                valid_names=['validation'],
                callbacks=[lgb.early_stopping(25), lgb.log_evaluation(25)]
            )
        else:
            print(f"  Continuing training … (batch {batch_idx})")
            booster = lgb.train(
                params,
                lgb_train,
                num_boost_round=500,
                init_model=booster,
                valid_sets=[lgb_val],
                valid_names=['validation'],
                callbacks=[lgb.early_stopping(25), lgb.log_evaluation(25)]
            )
        del df, X, y, lgb_train
        batch_idx += 1

booster.save_model(MODEL_PATH)
print("\nModel saved as:", MODEL_PATH)

print(f"\nEvaluating on: {test_file}")
test_parquet = pq.ParquetFile(test_file)
y_true_all, y_pred_prob = [], []
for batch in test_parquet.iter_batches(batch_size=BATCH_SIZE):
    df = batch.to_pandas()
    X = df[feature_cols].astype(np.float32)
    y = df['Label'].astype(np.int32)
    y_prob = booster.predict(X)
    y_true_all.extend(y.tolist())
    y_pred_prob.extend(y_prob.tolist())
    del df, X, y, y_prob

thresholds = [0.5, 0.4, 0.3, 0.2, 0.1, 0.05, 0.01]
for threshold in thresholds:
    y_pred = (np.array(y_pred_prob) > threshold).astype(int)
    print(f"\n=== Results for threshold={threshold} ===")
    print("Accuracy:", accuracy_score(y_true_all, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_true_all, y_pred))
    print("Classification Report:\n", classification_report(y_true_all, y_pred, digits=4))

print("\nDone! The model and thresholded evaluation results are printed.")


Train files: ['D:\\Canada\\Subjects\\Semester -2\\AIDI-2005-02 CAPSTONE TERM ll\\Bot_detector\\data\\cleaned\\Friday-02-03-2018_TrafficForML_CICFlowMeter.parquet', 'D:\\Canada\\Subjects\\Semester -2\\AIDI-2005-02 CAPSTONE TERM ll\\Bot_detector\\data\\cleaned\\Friday-16-02-2018_TrafficForML_CICFlowMeter.parquet', 'D:\\Canada\\Subjects\\Semester -2\\AIDI-2005-02 CAPSTONE TERM ll\\Bot_detector\\data\\cleaned\\Friday-23-02-2018_TrafficForML_CICFlowMeter.parquet', 'D:\\Canada\\Subjects\\Semester -2\\AIDI-2005-02 CAPSTONE TERM ll\\Bot_detector\\data\\cleaned\\Thuesday-20-02-2018_TrafficForML_CICFlowMeter.parquet', 'D:\\Canada\\Subjects\\Semester -2\\AIDI-2005-02 CAPSTONE TERM ll\\Bot_detector\\data\\cleaned\\Thursday-01-03-2018_TrafficForML_CICFlowMeter.parquet', 'D:\\Canada\\Subjects\\Semester -2\\AIDI-2005-02 CAPSTONE TERM ll\\Bot_detector\\data\\cleaned\\Thursday-15-02-2018_TrafficForML_CICFlowMeter.parquet', 'D:\\Canada\\Subjects\\Semester -2\\AIDI-2005-02 CAPSTONE TERM ll\\Bot_detector\

### Using optuna (Tuning)

In [8]:
import optuna
import pyarrow.parquet as pq
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Path to one medium/large cleaned file
tune_file = r"D:\Canada\Subjects\Semester -2\AIDI-2005-02 CAPSTONE TERM ll\Bot_detector\data\cleaned\Friday-16-02-2018_TrafficForML_CICFlowMeter.parquet"
BATCH_SIZE = 100_000

# Load a batch
parquet = pq.ParquetFile(tune_file)
for batch in parquet.iter_batches(batch_size=BATCH_SIZE):
    df = batch.to_pandas()
    break  # Take first batch

feature_cols = [c for c in df.columns if c != "Label"]
X = df[feature_cols].astype(np.float32)
y = df["Label"].astype(np.int32)

# Split for tuning
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

def objective(trial):
    params = {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'metric': 'auc',
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
        'num_leaves': trial.suggest_int('num_leaves', 15, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 100),
        'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
        'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value
        'verbose': -1,
    }
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)
    booster = lgb.train(
        params, lgb_train,
        valid_sets=[lgb_val],
        valid_names=['validation'],
        num_boost_round=300,
        callbacks=[lgb.early_stopping(25), lgb.log_evaluation(0)]
    )
    # Use lower threshold to favor recall
    y_pred = (booster.predict(X_val) > 0.2).astype(int)
    return f1_score(y_val, y_pred, average='binary')

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=40)  # 40-50 for reasonable coverage

best_params = study.best_params
print("Best hyperparameters:", best_params)


  from .autonotebook import tqdm as notebook_tqdm
[I 2025-06-05 19:13:56,685] A new study created in memory with name: no-name-663e31c4-3b36-4269-b4c4-34b56448bee2
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value


Training until validation scores don't improve for 25 rounds


[I 2025-06-05 19:13:57,569] Trial 0 finished with value: 0.9999164508313142 and parameters: {'learning_rate': 0.04560353410164083, 'num_leaves': 101, 'max_depth': 20, 'min_child_samples': 95, 'subsample': 0.7884208766149358, 'colsample_bytree': 0.6499557736337581, 'scale_pos_weight': 4.303589271823835}. Best is trial 0 with value: 0.9999164508313142.


Early stopping, best iteration is:
[58]	validation's auc: 1
Training until validation scores don't improve for 25 rounds


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value
[I 2025-06-05 19:13:57,996] Trial 1 finished with value: 0.9999331595481585 and parameters: {'learning_rate': 0.03598716260032116, 'num_leaves': 88, 'max_depth': 10, 'min_child_samples': 90, 'subsample': 0.8579528415284592, 'colsample_bytree': 0.7069807607708449, 'scale_pos_weight': 5.7485223337623985}. Best is trial 1 with value: 0.9999331595481585.


Early stopping, best iteration is:
[83]	validation's auc: 1
Training until validation scores don't improve for 25 rounds


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value
[I 2025-06-05 19:13:58,295] Trial 2 finished with value: 0.9986648865153538 and parameters: {'learning_rate': 0.013645793457541272, 'num_leaves': 150, 'max_depth': 12, 'min_child_samples': 36, 'subsample': 0.88420026359968, 'colsample_bytree': 0.8609711963632429, 'scale_pos_weight': 4.0334154974828405}. Best is trial 1 with value: 0.9999331595481585.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value
[I 2025-06-

Early stopping, best iteration is:
[39]	validation's auc: 0.999999
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[18]	validation's auc: 1


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value


Training until validation scores don't improve for 25 rounds


[I 2025-06-05 19:13:58,826] Trial 4 finished with value: 0.9996992883156804 and parameters: {'learning_rate': 0.020227367431839005, 'num_leaves': 87, 'max_depth': 10, 'min_child_samples': 82, 'subsample': 0.8558035135929238, 'colsample_bytree': 0.7404733235392386, 'scale_pos_weight': 5.8966306491353535}. Best is trial 1 with value: 0.9999331595481585.


Early stopping, best iteration is:
[68]	validation's auc: 1
Training until validation scores don't improve for 25 rounds


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value
[I 2025-06-05 19:13:59,302] Trial 5 finished with value: 0.9986982209018993 and parameters: {'learning_rate': 0.015784044450559406, 'num_leaves': 66, 'max_depth': 20, 'min_child_samples': 65, 'subsample': 0.8540364155147294, 'colsample_bytree': 0.7271861574722229, 'scale_pos_weight': 6.870844133607379}. Best is trial 1 with value: 0.9999331595481585.


Early stopping, best iteration is:
[38]	validation's auc: 1
Training until validation scores don't improve for 25 rounds


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value
[I 2025-06-05 19:13:59,541] Trial 6 finished with value: 0.9997994719427826 and parameters: {'learning_rate': 0.13189462139129365, 'num_leaves': 126, 'max_depth': 20, 'min_child_samples': 38, 'subsample': 0.9444090562742982, 'colsample_bytree': 0.723209894832436, 'scale_pos_weight': 4.517621677991876}. Best is trial 1 with value: 0.9999331595481585.


Early stopping, best iteration is:
[8]	validation's auc: 0.999887
Training until validation scores don't improve for 25 rounds


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value
[I 2025-06-05 19:13:59,887] Trial 7 finished with value: 0.9999665764230088 and parameters: {'learning_rate': 0.18512850854627738, 'num_leaves': 15, 'max_depth': 17, 'min_child_samples': 93, 'subsample': 0.6704129072515996, 'colsample_bytree': 0.68396826250353, 'scale_pos_weight': 6.469073069531305}. Best is trial 7 with value: 0.9999665764230088.


Early stopping, best iteration is:
[89]	validation's auc: 0.999951
Training until validation scores don't improve for 25 rounds


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value
[I 2025-06-05 19:14:00,152] Trial 8 finished with value: 0.9986815534304645 and parameters: {'learning_rate': 0.013819218915470395, 'num_leaves': 104, 'max_depth': 7, 'min_child_samples': 79, 'subsample': 0.7533288836864305, 'colsample_bytree': 0.6935952592019199, 'scale_pos_weight': 5.00130406488552}. Best is trial 7 with value: 0.9999665764230088.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value
[I 2025-06-0

Early stopping, best iteration is:
[41]	validation's auc: 1
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[1]	validation's auc: 0.993362


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value
[I 2025-06-05 19:14:00,512] Trial 10 finished with value: 0.9999164508313142 and parameters: {'learning_rate': 0.194930350411836, 'num_leaves': 16, 'max_depth': 16, 'min_child_samples': 47, 'subsample': 0.6218558885532086, 'colsample_bytree': 0.6063192725282336, 'scale_pos_weight': 6.946399595398768}. Best is trial 7 with value: 0.9999665764230088.


Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[7]	validation's auc: 1
Training until validation scores don't improve for 25 rounds


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value
[I 2025-06-05 19:14:00,733] Trial 11 finished with value: 0.9998496215475614 and parameters: {'learning_rate': 0.03531696764642827, 'num_leaves': 52, 'max_depth': 3, 'min_child_samples': 97, 'subsample': 0.6786593646845375, 'colsample_bytree': 0.8090468987933609, 'scale_pos_weight': 6.1204371586768795}. Best is trial 7 with value: 0.9999665764230088.


Early stopping, best iteration is:
[48]	validation's auc: 1
Training until validation scores don't improve for 25 rounds


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value
[I 2025-06-05 19:14:01,049] Trial 12 finished with value: 0.9998830350727689 and parameters: {'learning_rate': 0.05302827772792345, 'num_leaves': 45, 'max_depth': 14, 'min_child_samples': 56, 'subsample': 0.7125470926660693, 'colsample_bytree': 0.9993948500473785, 'scale_pos_weight': 5.843996815691067}. Best is trial 7 with value: 0.9999665764230088.


Early stopping, best iteration is:
[29]	validation's auc: 1
Training until validation scores don't improve for 25 rounds


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value
[I 2025-06-05 19:14:01,296] Trial 13 finished with value: 0.9997326917936381 and parameters: {'learning_rate': 0.06469817836933513, 'num_leaves': 22, 'max_depth': 16, 'min_child_samples': 25, 'subsample': 0.9933868916389614, 'colsample_bytree': 0.7903702347898184, 'scale_pos_weight': 6.5738343181317}. Best is trial 7 with value: 0.9999665764230088.


Early stopping, best iteration is:
[8]	validation's auc: 1
Training until validation scores don't improve for 25 rounds


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value
[I 2025-06-05 19:14:01,683] Trial 14 finished with value: 0.9996992883156804 and parameters: {'learning_rate': 0.02868031726099514, 'num_leaves': 92, 'max_depth': 16, 'min_child_samples': 80, 'subsample': 0.6284515164881406, 'colsample_bytree': 0.6006923532788192, 'scale_pos_weight': 7.539105060147962}. Best is trial 7 with value: 0.9999665764230088.


Early stopping, best iteration is:
[44]	validation's auc: 1
Training until validation scores don't improve for 25 rounds


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value
[I 2025-06-05 19:14:02,130] Trial 15 finished with value: 0.9999498654723508 and parameters: {'learning_rate': 0.08401670545588726, 'num_leaves': 118, 'max_depth': 13, 'min_child_samples': 100, 'subsample': 0.6878434514082071, 'colsample_bytree': 0.6530972972657778, 'scale_pos_weight': 5.546439616483521}. Best is trial 7 with value: 0.9999665764230088.


Early stopping, best iteration is:
[61]	validation's auc: 0.999934
Training until validation scores don't improve for 25 rounds


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value
[I 2025-06-05 19:14:02,515] Trial 16 finished with value: 0.9999498637966475 and parameters: {'learning_rate': 0.08598471164808877, 'num_leaves': 123, 'max_depth': 13, 'min_child_samples': 100, 'subsample': 0.681010992050926, 'colsample_bytree': 0.6530794371262499, 'scale_pos_weight': 5.285789422483324}. Best is trial 7 with value: 0.9999665764230088.


Early stopping, best iteration is:
[49]	validation's auc: 0.999934
Training until validation scores don't improve for 25 rounds


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value
[I 2025-06-05 19:14:02,940] Trial 17 finished with value: 0.9999665764230088 and parameters: {'learning_rate': 0.1957056175784135, 'num_leaves': 121, 'max_depth': 18, 'min_child_samples': 71, 'subsample': 0.6703481987655344, 'colsample_bytree': 0.7810196874715913, 'scale_pos_weight': 6.3768172374029986}. Best is trial 7 with value: 0.9999665764230088.


Early stopping, best iteration is:
[69]	validation's auc: 0.999974
Training until validation scores don't improve for 25 rounds


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value
[I 2025-06-05 19:14:03,253] Trial 18 finished with value: 0.9999498688234213 and parameters: {'learning_rate': 0.1963032129064966, 'num_leaves': 138, 'max_depth': 18, 'min_child_samples': 69, 'subsample': 0.6061656874723715, 'colsample_bytree': 0.7915081203709389, 'scale_pos_weight': 6.310399620775555}. Best is trial 7 with value: 0.9999665764230088.


Early stopping, best iteration is:
[14]	validation's auc: 1
Training until validation scores don't improve for 25 rounds


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value
[I 2025-06-05 19:14:03,539] Trial 19 finished with value: 0.9996992883156804 and parameters: {'learning_rate': 0.13626307139994254, 'num_leaves': 63, 'max_depth': 17, 'min_child_samples': 61, 'subsample': 0.7272972894907207, 'colsample_bytree': 0.8367649838454011, 'scale_pos_weight': 7.133609423579998}. Best is trial 7 with value: 0.9999665764230088.


Early stopping, best iteration is:
[3]	validation's auc: 0.99995
Training until validation scores don't improve for 25 rounds


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value
[I 2025-06-05 19:14:03,850] Trial 20 finished with value: 0.9999164508313142 and parameters: {'learning_rate': 0.13646736624320266, 'num_leaves': 112, 'max_depth': 18, 'min_child_samples': 53, 'subsample': 0.6546215238551097, 'colsample_bytree': 0.7639856888054799, 'scale_pos_weight': 6.377531195783911}. Best is trial 7 with value: 0.9999665764230088.


Early stopping, best iteration is:
[12]	validation's auc: 1
Training until validation scores don't improve for 25 rounds


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value
[I 2025-06-05 19:14:04,291] Trial 21 finished with value: 0.9999665764230088 and parameters: {'learning_rate': 0.19447904818602701, 'num_leaves': 150, 'max_depth': 18, 'min_child_samples': 72, 'subsample': 0.6018453830745306, 'colsample_bytree': 0.8768160699394809, 'scale_pos_weight': 6.321212530484546}. Best is trial 7 with value: 0.9999665764230088.


Early stopping, best iteration is:
[69]	validation's auc: 0.999975
Training until validation scores don't improve for 25 rounds


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value
[I 2025-06-05 19:14:04,544] Trial 22 finished with value: 0.9996156866676692 and parameters: {'learning_rate': 0.17075053127649106, 'num_leaves': 150, 'max_depth': 18, 'min_child_samples': 75, 'subsample': 0.6481728030618982, 'colsample_bytree': 0.8907592871910307, 'scale_pos_weight': 6.666068821937084}. Best is trial 7 with value: 0.9999665764230088.


Early stopping, best iteration is:
[4]	validation's auc: 0.99975
Training until validation scores don't improve for 25 rounds


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value
[I 2025-06-05 19:14:04,817] Trial 23 finished with value: 0.9999331595481585 and parameters: {'learning_rate': 0.10973009867990309, 'num_leaves': 134, 'max_depth': 15, 'min_child_samples': 82, 'subsample': 0.6033285052166177, 'colsample_bytree': 0.931801080268536, 'scale_pos_weight': 7.284828176237712}. Best is trial 7 with value: 0.9999665764230088.


Early stopping, best iteration is:
[22]	validation's auc: 1
Training until validation scores don't improve for 25 rounds


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value
[I 2025-06-05 19:14:05,289] Trial 24 finished with value: 0.9999665764230088 and parameters: {'learning_rate': 0.15926555275156568, 'num_leaves': 142, 'max_depth': 19, 'min_child_samples': 89, 'subsample': 0.7505209010601501, 'colsample_bytree': 0.8488433544920642, 'scale_pos_weight': 6.185883483730522}. Best is trial 7 with value: 0.9999665764230088.


Early stopping, best iteration is:
[108]	validation's auc: 0.999953
Training until validation scores don't improve for 25 rounds


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value
[I 2025-06-05 19:14:05,592] Trial 25 finished with value: 0.9996825874137557 and parameters: {'learning_rate': 0.07106649249071058, 'num_leaves': 130, 'max_depth': 15, 'min_child_samples': 66, 'subsample': 0.6561971426361681, 'colsample_bytree': 0.9712683098876507, 'scale_pos_weight': 6.607001997348869}. Best is trial 7 with value: 0.9999665764230088.


Early stopping, best iteration is:
[10]	validation's auc: 1
Training until validation scores don't improve for 25 rounds


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value
[I 2025-06-05 19:14:05,892] Trial 26 finished with value: 0.9996491286695294 and parameters: {'learning_rate': 0.10503330706273284, 'num_leaves': 112, 'max_depth': 17, 'min_child_samples': 48, 'subsample': 0.7173501524293386, 'colsample_bytree': 0.885094154476296, 'scale_pos_weight': 5.4976859155765}. Best is trial 7 with value: 0.9999665764230088.


Early stopping, best iteration is:
[1]	validation's auc: 0.993576
Training until validation scores don't improve for 25 rounds


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value
[I 2025-06-05 19:14:06,136] Trial 27 finished with value: 0.99968255559454 and parameters: {'learning_rate': 0.1509056349893212, 'num_leaves': 78, 'max_depth': 19, 'min_child_samples': 74, 'subsample': 0.6398400736593701, 'colsample_bytree': 0.7592323806750807, 'scale_pos_weight': 7.43753171053948}. Best is trial 7 with value: 0.9999665764230088.


Early stopping, best iteration is:
[1]	validation's auc: 0.993456
Training until validation scores don't improve for 25 rounds


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value
[I 2025-06-05 19:14:06,467] Trial 28 finished with value: 0.9998997426728603 and parameters: {'learning_rate': 0.197332540632944, 'num_leaves': 99, 'max_depth': 17, 'min_child_samples': 61, 'subsample': 0.7769349718886936, 'colsample_bytree': 0.8214045053779897, 'scale_pos_weight': 7.763969549339474}. Best is trial 7 with value: 0.9999665764230088.


Early stopping, best iteration is:
[3]	validation's auc: 0.999937
Training until validation scores don't improve for 25 rounds


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value
[I 2025-06-05 19:14:07,210] Trial 29 finished with value: 0.9987982374148752 and parameters: {'learning_rate': 0.010511745854879226, 'num_leaves': 31, 'max_depth': 20, 'min_child_samples': 93, 'subsample': 0.8092883014353324, 'colsample_bytree': 0.6824945011005659, 'scale_pos_weight': 7.031970981215996}. Best is trial 7 with value: 0.9999665764230088.


Early stopping, best iteration is:
[114]	validation's auc: 1
Training until validation scores don't improve for 25 rounds


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value
[I 2025-06-05 19:14:07,778] Trial 30 finished with value: 0.9999498654723508 and parameters: {'learning_rate': 0.12769661047135278, 'num_leaves': 139, 'max_depth': 14, 'min_child_samples': 86, 'subsample': 0.686014004961469, 'colsample_bytree': 0.7687573431399413, 'scale_pos_weight': 6.365870953807143}. Best is trial 7 with value: 0.9999665764230088.


Early stopping, best iteration is:
[128]	validation's auc: 0.999999
Training until validation scores don't improve for 25 rounds


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value
[I 2025-06-05 19:14:08,054] Trial 31 finished with value: 0.9996324389347412 and parameters: {'learning_rate': 0.16721045404004417, 'num_leaves': 143, 'max_depth': 19, 'min_child_samples': 86, 'subsample': 0.7519852717606653, 'colsample_bytree': 0.8498021371013151, 'scale_pos_weight': 6.138654152669172}. Best is trial 7 with value: 0.9999665764230088.


Early stopping, best iteration is:
[2]	validation's auc: 0.993504
Training until validation scores don't improve for 25 rounds


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value
[I 2025-06-05 19:14:08,307] Trial 32 finished with value: 0.9998161733986196 and parameters: {'learning_rate': 0.16106549029124173, 'num_leaves': 144, 'max_depth': 19, 'min_child_samples': 94, 'subsample': 0.7467603049142348, 'colsample_bytree': 0.8856975275700878, 'scale_pos_weight': 6.74142976111939}. Best is trial 7 with value: 0.9999665764230088.


Early stopping, best iteration is:
[6]	validation's auc: 0.99989
Training until validation scores don't improve for 25 rounds


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value
[I 2025-06-05 19:14:08,519] Trial 33 finished with value: 0.9996491521176176 and parameters: {'learning_rate': 0.12178501231361882, 'num_leaves': 131, 'max_depth': 18, 'min_child_samples': 89, 'subsample': 0.7050642851596128, 'colsample_bytree': 0.8654243190181943, 'scale_pos_weight': 6.155585115704424}. Best is trial 7 with value: 0.9999665764230088.


Early stopping, best iteration is:
[2]	validation's auc: 0.99339
Training until validation scores don't improve for 25 rounds


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value
[I 2025-06-05 19:14:08,905] Trial 34 finished with value: 0.9999665786571305 and parameters: {'learning_rate': 0.1614458587407259, 'num_leaves': 150, 'max_depth': 19, 'min_child_samples': 74, 'subsample': 0.6628764303318666, 'colsample_bytree': 0.9327214397233908, 'scale_pos_weight': 5.72295277674116}. Best is trial 34 with value: 0.9999665786571305.


Early stopping, best iteration is:
[38]	validation's auc: 1
Training until validation scores don't improve for 25 rounds


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value
[I 2025-06-05 19:14:09,144] Trial 35 finished with value: 0.9996992682193337 and parameters: {'learning_rate': 0.09088644044946317, 'num_leaves': 150, 'max_depth': 15, 'min_child_samples': 76, 'subsample': 0.6655900596422136, 'colsample_bytree': 0.9344274335033163, 'scale_pos_weight': 5.621936817148931}. Best is trial 34 with value: 0.9999665786571305.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value
[I 2025-0

Early stopping, best iteration is:
[3]	validation's auc: 0.999865
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[2]	validation's auc: 1


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value
[I 2025-06-05 19:14:09,589] Trial 37 finished with value: 0.9997159897756319 and parameters: {'learning_rate': 0.02351311966529708, 'num_leaves': 94, 'max_depth': 20, 'min_child_samples': 66, 'subsample': 0.6072364686078291, 'colsample_bytree': 0.9548067335095933, 'scale_pos_weight': 5.932540106093068}. Best is trial 34 with value: 0.9999665786571305.


Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[14]	validation's auc: 1


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value
[I 2025-06-05 19:14:09,857] Trial 38 finished with value: 0.9998663190963171 and parameters: {'learning_rate': 0.14600207367005733, 'num_leaves': 125, 'max_depth': 17, 'min_child_samples': 72, 'subsample': 0.6356057960239543, 'colsample_bytree': 0.9688768188677415, 'scale_pos_weight': 6.526968850189363}. Best is trial 34 with value: 0.9999665786571305.


Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[11]	validation's auc: 0.999951


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 4.0, 8.0),  # Use around your computed value
[I 2025-06-05 19:14:10,128] Trial 39 finished with value: 0.9994986798569566 and parameters: {'learning_rate': 0.12061736687467982, 'num_leaves': 49, 'max_depth': 19, 'min_child_samples': 84, 'subsample': 0.7826367590538567, 'colsample_bytree': 0.6302860479454693, 'scale_pos_weight': 5.7293922298089885}. Best is trial 34 with value: 0.9999665786571305.


Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[4]	validation's auc: 0.999578
Best hyperparameters: {'learning_rate': 0.1614458587407259, 'num_leaves': 150, 'max_depth': 19, 'min_child_samples': 74, 'subsample': 0.6628764303318666, 'colsample_bytree': 0.9327214397233908, 'scale_pos_weight': 5.72295277674116}


In [1]:
# try

import pyarrow.parquet as pq
import lightgbm as lgb
import numpy as np
import os
from glob import glob
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# === CONFIGURATION ===
CLEANED_DIR = r"D:\Canada\Subjects\Semester -2\AIDI-2005-02 CAPSTONE TERM ll\Bot_detector\data\cleaned"
MODEL_PATH = "final_lightgbm_tuned.txt"
BATCH_SIZE = 100_000  # Tune for your RAM

# 1. Auto-detect all cleaned parquet files, split train/test (last one as test for demo)
file_list = sorted(glob(os.path.join(CLEANED_DIR, "*.parquet")))
train_files = file_list[:-1]
test_file = file_list[-1]

print("Train files:", train_files)
print("Test file:", test_file)

# 2. Pick feature columns (from any file)
sample = pq.read_table(train_files[0], columns=None).to_pandas().head(1)
feature_cols = [col for col in sample.columns if col != 'Label']

# 3. Compute global scale_pos_weight (class 0 / class 1)
total_0 = 0
total_1 = 0
for f in train_files:
    df = pq.read_table(f, columns=['Label']).to_pandas()
    total_0 += (df['Label'] == 0).sum()
    total_1 += (df['Label'] == 1).sum()
scale_pos_weight = round(total_0 / total_1, 2)
print(f"scale_pos_weight: {scale_pos_weight}")

# 4. LightGBM parameters (tuned)
params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'binary_logloss',
    'learning_rate': 0.03,      # Slower learning for better results
    'num_leaves': 40,           # Try between 32-64 for large data
    'max_depth': 10,            # Max tree depth
    'scale_pos_weight': scale_pos_weight,  # Handle imbalance
    'feature_fraction': 0.8,    # Randomly select features at each tree
    'bagging_fraction': 0.8,    # Randomly select data at each iteration
    'bagging_freq': 2,          # Perform bagging every 2 iterations
    'min_child_samples': 30,    # Minimum samples per leaf
    'verbose': -1,
    'n_jobs': -1
}

# === TRAINING ===
booster = None
for file_idx, file_path in enumerate(train_files):
    print(f"\nProcessing: {file_path}")
    parquet_file = pq.ParquetFile(file_path)
    batch_idx = 1
    for batch in parquet_file.iter_batches(batch_size=BATCH_SIZE):
        df = batch.to_pandas()
        X = df[feature_cols].astype(np.float32)
        y = df['Label'].astype(np.int32)
        lgb_train = lgb.Dataset(X, y, free_raw_data=True)
        if booster is None:
            print("  Training new model …")
            booster = lgb.train(
                params,
                lgb_train,
                num_boost_round=200,  # More boosting rounds
                valid_sets=[lgb_train],
                callbacks=[lgb.log_evaluation(period=25)],
                keep_training_booster=True
            )
        else:
            print(f"  Continuing training … (batch {batch_idx})")
            booster = lgb.train(
                params,
                lgb_train,
                num_boost_round=200,
                valid_sets=[lgb_train],
                init_model=booster,
                callbacks=[lgb.log_evaluation(period=25)],
                keep_training_booster=True
            )
        del df, X, y, lgb_train
        batch_idx += 1

# Save model
booster.save_model(MODEL_PATH)
print("\nModel saved as:", MODEL_PATH)

# === EVALUATION ===
print(f"\nEvaluating on: {test_file}")
test_parquet = pq.ParquetFile(test_file)
y_true_all, y_pred_all = [], []
for batch in test_parquet.iter_batches(batch_size=BATCH_SIZE):
    df = batch.to_pandas()
    X = df[feature_cols].astype(np.float32)
    y = df['Label'].astype(np.int32)
    y_pred_proba = booster.predict(X)
    y_pred = (y_pred_proba > 0.4).astype(int)  # Lower threshold to catch more attacks (try 0.4, 0.3, etc.)
    y_true_all.extend(y.tolist())
    y_pred_all.extend(y_pred.tolist())
    del df, X, y, y_pred

# Results
acc = accuracy_score(y_true_all, y_pred_all)
cm = confusion_matrix(y_true_all, y_pred_all)
cr = classification_report(y_true_all, y_pred_all, digits=4)

print("\n=== Test Results (threshold=0.4) ===")
print("Accuracy:", acc)
print("Confusion Matrix:\n", cm)
print("Classification Report:\n", cr)

with open("classification_report_lgbm_tuned.txt", "w") as f:
    f.write("Accuracy: {:.4f}\n".format(acc))
    f.write("Confusion Matrix:\n{}\n".format(cm))
    f.write("Classification Report:\n{}\n".format(cr))

print("\nDone! The tuned model and report are saved.")


Train files: ['D:\\Canada\\Subjects\\Semester -2\\AIDI-2005-02 CAPSTONE TERM ll\\Bot_detector\\data\\cleaned\\Friday-02-03-2018_TrafficForML_CICFlowMeter.parquet', 'D:\\Canada\\Subjects\\Semester -2\\AIDI-2005-02 CAPSTONE TERM ll\\Bot_detector\\data\\cleaned\\Friday-16-02-2018_TrafficForML_CICFlowMeter.parquet', 'D:\\Canada\\Subjects\\Semester -2\\AIDI-2005-02 CAPSTONE TERM ll\\Bot_detector\\data\\cleaned\\Friday-23-02-2018_TrafficForML_CICFlowMeter.parquet', 'D:\\Canada\\Subjects\\Semester -2\\AIDI-2005-02 CAPSTONE TERM ll\\Bot_detector\\data\\cleaned\\Thuesday-20-02-2018_TrafficForML_CICFlowMeter.parquet', 'D:\\Canada\\Subjects\\Semester -2\\AIDI-2005-02 CAPSTONE TERM ll\\Bot_detector\\data\\cleaned\\Thursday-01-03-2018_TrafficForML_CICFlowMeter.parquet', 'D:\\Canada\\Subjects\\Semester -2\\AIDI-2005-02 CAPSTONE TERM ll\\Bot_detector\\data\\cleaned\\Thursday-15-02-2018_TrafficForML_CICFlowMeter.parquet', 'D:\\Canada\\Subjects\\Semester -2\\AIDI-2005-02 CAPSTONE TERM ll\\Bot_detector\

In [None]:
# Sample try

In [8]:
import pandas as pd
import numpy as np
import glob
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import optuna

# File paths
data_path = r'D:\Canada\Subjects\Semester -2\AIDI-2005-02 CAPSTONE TERM ll\Bot_detector\data\cleaned\*.parquet'
all_files = glob.glob(data_path)

# Sample data for prototyping/tuning (10,000 rows from each file)
df_list = []
for file in all_files:
    df_temp = pd.read_parquet(file)
    if len(df_temp) > 10000:
        df_temp = df_temp.sample(n=10000, random_state=42)
    df_list.append(df_temp)
df = pd.concat(df_list, ignore_index=True)
print("Combined sampled shape:", df.shape)

# Convert labels to binary
df['Label'] = df['Label'].replace({'Benign': 0, 'Attack': 1}).astype(int)

# Select numeric columns for features
X = df.drop(columns=['Label']).select_dtypes(include=np.number)
y = df['Label']


Combined sampled shape: (100000, 79)


In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [11]:
def objective(trial):
    params = {
        "objective": "binary",
        "metric": "auc",
        "boosting_type": "gbdt",
        "verbosity": -1,
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "num_leaves": trial.suggest_int("num_leaves", 16, 128),
        "max_depth": trial.suggest_int("max_depth", 4, 16),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20, 100),
        "lambda_l1": trial.suggest_float("lambda_l1", 0.0, 5.0),
        "lambda_l2": trial.suggest_float("lambda_l2", 0.0, 5.0),
        "scale_pos_weight": float(np.sum(y_train == 0)) / np.sum(y_train == 1),  # handle imbalance
    }
    dtrain = lgb.Dataset(X_train, label=y_train)
    dval = lgb.Dataset(X_test, label=y_test)
    booster = lgb.train(
        params, dtrain, valid_sets=[dval],
        num_boost_round=200,
        callbacks=[lgb.early_stopping(stopping_rounds=20), lgb.log_evaluation(0)]  # disables logging, use 100 for every 100 rounds
    )

    preds = booster.predict(X_test)
    auc = roc_auc_score(y_test, preds)
    return auc

# Run Optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)
print("Best params:", study.best_params)


[I 2025-06-16 13:38:56,701] A new study created in memory with name: no-name-68c9f722-33ac-4d96-921c-b7c270b381fb


Training until validation scores don't improve for 20 rounds


[I 2025-06-16 13:38:59,079] Trial 0 finished with value: 0.9808502718898113 and parameters: {'learning_rate': 0.18139066159802691, 'num_leaves': 117, 'max_depth': 11, 'feature_fraction': 0.888821100941702, 'bagging_fraction': 0.5528380922327147, 'bagging_freq': 1, 'min_data_in_leaf': 75, 'lambda_l1': 3.911840621886989, 'lambda_l2': 0.20239210830115262}. Best is trial 0 with value: 0.9808502718898113.


Early stopping, best iteration is:
[49]	valid_0's auc: 0.98085
Training until validation scores don't improve for 20 rounds


[I 2025-06-16 13:39:00,545] Trial 1 finished with value: 0.9818915813012655 and parameters: {'learning_rate': 0.1254389560847368, 'num_leaves': 78, 'max_depth': 16, 'feature_fraction': 0.6147783077776022, 'bagging_fraction': 0.9960620687306792, 'bagging_freq': 9, 'min_data_in_leaf': 45, 'lambda_l1': 1.0974257818029893, 'lambda_l2': 2.726243314051949}. Best is trial 1 with value: 0.9818915813012655.


Early stopping, best iteration is:
[52]	valid_0's auc: 0.981892
Training until validation scores don't improve for 20 rounds


[I 2025-06-16 13:39:03,541] Trial 2 finished with value: 0.980452426154311 and parameters: {'learning_rate': 0.05466076915383957, 'num_leaves': 18, 'max_depth': 12, 'feature_fraction': 0.846676028403428, 'bagging_fraction': 0.9015152466567351, 'bagging_freq': 7, 'min_data_in_leaf': 86, 'lambda_l1': 1.3696256733101726, 'lambda_l2': 3.0034629514711986}. Best is trial 1 with value: 0.9818915813012655.


Did not meet early stopping. Best iteration is:
[198]	valid_0's auc: 0.980452
Training until validation scores don't improve for 20 rounds


[I 2025-06-16 13:39:09,263] Trial 3 finished with value: 0.9814388892420678 and parameters: {'learning_rate': 0.12668107092644357, 'num_leaves': 59, 'max_depth': 5, 'feature_fraction': 0.9870233597019524, 'bagging_fraction': 0.7548974698893183, 'bagging_freq': 10, 'min_data_in_leaf': 32, 'lambda_l1': 3.1359638460104855, 'lambda_l2': 0.3205208710185481}. Best is trial 1 with value: 0.9818915813012655.


Did not meet early stopping. Best iteration is:
[189]	valid_0's auc: 0.981439
Training until validation scores don't improve for 20 rounds


[I 2025-06-16 13:39:14,197] Trial 4 finished with value: 0.9818694087975073 and parameters: {'learning_rate': 0.1078538405843952, 'num_leaves': 21, 'max_depth': 5, 'feature_fraction': 0.9940261550880798, 'bagging_fraction': 0.9913178669289349, 'bagging_freq': 7, 'min_data_in_leaf': 38, 'lambda_l1': 1.631114735664284, 'lambda_l2': 1.6329226194145197}. Best is trial 1 with value: 0.9818915813012655.


Did not meet early stopping. Best iteration is:
[195]	valid_0's auc: 0.981869
Training until validation scores don't improve for 20 rounds


[I 2025-06-16 13:39:16,065] Trial 5 finished with value: 0.9809459123172579 and parameters: {'learning_rate': 0.16003308428037108, 'num_leaves': 36, 'max_depth': 6, 'feature_fraction': 0.5094051389903242, 'bagging_fraction': 0.5143950968826672, 'bagging_freq': 3, 'min_data_in_leaf': 25, 'lambda_l1': 4.639828000819839, 'lambda_l2': 4.768410781061782}. Best is trial 1 with value: 0.9818915813012655.


Early stopping, best iteration is:
[114]	valid_0's auc: 0.980946
Training until validation scores don't improve for 20 rounds


[I 2025-06-16 13:39:18,886] Trial 6 finished with value: 0.9819046838609043 and parameters: {'learning_rate': 0.08751227127022969, 'num_leaves': 55, 'max_depth': 7, 'feature_fraction': 0.8603448670727547, 'bagging_fraction': 0.8097910918458999, 'bagging_freq': 7, 'min_data_in_leaf': 68, 'lambda_l1': 2.0689076521749348, 'lambda_l2': 3.6283066508439275}. Best is trial 6 with value: 0.9819046838609043.


Early stopping, best iteration is:
[172]	valid_0's auc: 0.981905
Training until validation scores don't improve for 20 rounds


[I 2025-06-16 13:39:21,638] Trial 7 finished with value: 0.9807949094463798 and parameters: {'learning_rate': 0.054974295728494986, 'num_leaves': 19, 'max_depth': 13, 'feature_fraction': 0.591544542789908, 'bagging_fraction': 0.9238747807187657, 'bagging_freq': 7, 'min_data_in_leaf': 25, 'lambda_l1': 4.110391783839177, 'lambda_l2': 0.20515033896329138}. Best is trial 6 with value: 0.9819046838609043.


Did not meet early stopping. Best iteration is:
[200]	valid_0's auc: 0.980795
Training until validation scores don't improve for 20 rounds


[I 2025-06-16 13:39:24,337] Trial 8 finished with value: 0.9814138402309938 and parameters: {'learning_rate': 0.08087063771328176, 'num_leaves': 25, 'max_depth': 7, 'feature_fraction': 0.9033554719908522, 'bagging_fraction': 0.5446654421821717, 'bagging_freq': 1, 'min_data_in_leaf': 63, 'lambda_l1': 0.5109435388406186, 'lambda_l2': 0.5362095134170819}. Best is trial 6 with value: 0.9819046838609043.


Did not meet early stopping. Best iteration is:
[198]	valid_0's auc: 0.981414
Training until validation scores don't improve for 20 rounds


[I 2025-06-16 13:39:26,354] Trial 9 finished with value: 0.9818822292116915 and parameters: {'learning_rate': 0.09068827614757424, 'num_leaves': 81, 'max_depth': 8, 'feature_fraction': 0.9935886052673013, 'bagging_fraction': 0.7781696225467769, 'bagging_freq': 1, 'min_data_in_leaf': 50, 'lambda_l1': 0.5497677115841698, 'lambda_l2': 0.023711671984743}. Best is trial 6 with value: 0.9819046838609043.


Early stopping, best iteration is:
[104]	valid_0's auc: 0.981882
Training until validation scores don't improve for 20 rounds


[I 2025-06-16 13:39:29,445] Trial 10 finished with value: 0.9814143976403061 and parameters: {'learning_rate': 0.04016974756186335, 'num_leaves': 53, 'max_depth': 9, 'feature_fraction': 0.745867005510173, 'bagging_fraction': 0.6653527345969691, 'bagging_freq': 4, 'min_data_in_leaf': 100, 'lambda_l1': 2.317069532485327, 'lambda_l2': 4.166689598789}. Best is trial 6 with value: 0.9819046838609043.


Did not meet early stopping. Best iteration is:
[200]	valid_0's auc: 0.981414
Training until validation scores don't improve for 20 rounds


[I 2025-06-16 13:39:30,385] Trial 11 finished with value: 0.9812583780856168 and parameters: {'learning_rate': 0.13524362692589378, 'num_leaves': 91, 'max_depth': 16, 'feature_fraction': 0.6989966658279754, 'bagging_fraction': 0.8453047489183971, 'bagging_freq': 10, 'min_data_in_leaf': 54, 'lambda_l1': 1.7801938860113524, 'lambda_l2': 3.028358879137993}. Best is trial 6 with value: 0.9819046838609043.


Early stopping, best iteration is:
[21]	valid_0's auc: 0.981258
Training until validation scores don't improve for 20 rounds


[I 2025-06-16 13:39:34,422] Trial 12 finished with value: 0.9815831619116189 and parameters: {'learning_rate': 0.01314133070863198, 'num_leaves': 103, 'max_depth': 16, 'feature_fraction': 0.6480809795038489, 'bagging_fraction': 0.9985490025082345, 'bagging_freq': 8, 'min_data_in_leaf': 67, 'lambda_l1': 2.821643537277791, 'lambda_l2': 3.7851851940844465}. Best is trial 6 with value: 0.9819046838609043.


Did not meet early stopping. Best iteration is:
[200]	valid_0's auc: 0.981583
Training until validation scores don't improve for 20 rounds


[I 2025-06-16 13:39:35,893] Trial 13 finished with value: 0.9814948847923723 and parameters: {'learning_rate': 0.14028223243614393, 'num_leaves': 65, 'max_depth': 14, 'feature_fraction': 0.8059392830237418, 'bagging_fraction': 0.6595867831233237, 'bagging_freq': 9, 'min_data_in_leaf': 43, 'lambda_l1': 0.05090283176281618, 'lambda_l2': 2.0886602769087586}. Best is trial 6 with value: 0.9819046838609043.


Early stopping, best iteration is:
[61]	valid_0's auc: 0.981495
Training until validation scores don't improve for 20 rounds


[I 2025-06-16 13:39:37,689] Trial 14 finished with value: 0.9818583019008387 and parameters: {'learning_rate': 0.10180027374468578, 'num_leaves': 43, 'max_depth': 10, 'feature_fraction': 0.5746500912412393, 'bagging_fraction': 0.8362518558549568, 'bagging_freq': 5, 'min_data_in_leaf': 75, 'lambda_l1': 1.1302891059837463, 'lambda_l2': 3.3718893900268916}. Best is trial 6 with value: 0.9819046838609043.


Early stopping, best iteration is:
[94]	valid_0's auc: 0.981858
Training until validation scores don't improve for 20 rounds


[I 2025-06-16 13:39:39,770] Trial 15 finished with value: 0.9808819616414584 and parameters: {'learning_rate': 0.19534877448122945, 'num_leaves': 83, 'max_depth': 4, 'feature_fraction': 0.7673787868465876, 'bagging_fraction': 0.6939004297014277, 'bagging_freq': 8, 'min_data_in_leaf': 50, 'lambda_l1': 2.1992242946068887, 'lambda_l2': 2.3499983055169436}. Best is trial 6 with value: 0.9819046838609043.


Early stopping, best iteration is:
[148]	valid_0's auc: 0.980882
Training until validation scores don't improve for 20 rounds


[I 2025-06-16 13:39:42,099] Trial 16 finished with value: 0.9819280468808482 and parameters: {'learning_rate': 0.07516228803155299, 'num_leaves': 73, 'max_depth': 14, 'feature_fraction': 0.6970798704970366, 'bagging_fraction': 0.9320033396410377, 'bagging_freq': 6, 'min_data_in_leaf': 72, 'lambda_l1': 3.308784229473275, 'lambda_l2': 1.3698182046203986}. Best is trial 16 with value: 0.9819280468808482.


Early stopping, best iteration is:
[105]	valid_0's auc: 0.981928
Training until validation scores don't improve for 20 rounds


[I 2025-06-16 13:39:44,845] Trial 17 finished with value: 0.9821886460567716 and parameters: {'learning_rate': 0.07301812357461686, 'num_leaves': 48, 'max_depth': 14, 'feature_fraction': 0.7155135955928442, 'bagging_fraction': 0.9086445904209469, 'bagging_freq': 5, 'min_data_in_leaf': 89, 'lambda_l1': 3.4949973876244522, 'lambda_l2': 1.208701967098282}. Best is trial 17 with value: 0.9821886460567716.


Early stopping, best iteration is:
[150]	valid_0's auc: 0.982189
Training until validation scores don't improve for 20 rounds


[I 2025-06-16 13:39:47,781] Trial 18 finished with value: 0.9821236631415047 and parameters: {'learning_rate': 0.07120926658823105, 'num_leaves': 128, 'max_depth': 14, 'feature_fraction': 0.6847552831015986, 'bagging_fraction': 0.9096579870016636, 'bagging_freq': 5, 'min_data_in_leaf': 89, 'lambda_l1': 3.553461754063702, 'lambda_l2': 1.1898074906131517}. Best is trial 17 with value: 0.9821886460567716.


Early stopping, best iteration is:
[121]	valid_0's auc: 0.982124
Training until validation scores don't improve for 20 rounds


[I 2025-06-16 13:39:52,109] Trial 19 finished with value: 0.9816365837448516 and parameters: {'learning_rate': 0.0180536659820629, 'num_leaves': 128, 'max_depth': 14, 'feature_fraction': 0.6825316405094388, 'bagging_fraction': 0.880346398452855, 'bagging_freq': 3, 'min_data_in_leaf': 100, 'lambda_l1': 4.995757157326409, 'lambda_l2': 0.867643759262466}. Best is trial 17 with value: 0.9821886460567716.


Did not meet early stopping. Best iteration is:
[194]	valid_0's auc: 0.981637
Training until validation scores don't improve for 20 rounds


[I 2025-06-16 13:39:55,230] Trial 20 finished with value: 0.981971655557545 and parameters: {'learning_rate': 0.06142619972849399, 'num_leaves': 104, 'max_depth': 11, 'feature_fraction': 0.7527198812349678, 'bagging_fraction': 0.9459183178103429, 'bagging_freq': 5, 'min_data_in_leaf': 89, 'lambda_l1': 3.7740883863086663, 'lambda_l2': 1.153374369782301}. Best is trial 17 with value: 0.9821886460567716.


Early stopping, best iteration is:
[137]	valid_0's auc: 0.981972
Training until validation scores don't improve for 20 rounds


[I 2025-06-16 13:39:58,271] Trial 21 finished with value: 0.9821931053312705 and parameters: {'learning_rate': 0.06036389701688897, 'num_leaves': 108, 'max_depth': 12, 'feature_fraction': 0.7315057258291888, 'bagging_fraction': 0.9452917185973672, 'bagging_freq': 5, 'min_data_in_leaf': 86, 'lambda_l1': 3.9343696523899667, 'lambda_l2': 1.1977201179904}. Best is trial 21 with value: 0.9821931053312705.


Early stopping, best iteration is:
[130]	valid_0's auc: 0.982193
Training until validation scores don't improve for 20 rounds


[I 2025-06-16 13:40:02,644] Trial 22 finished with value: 0.9819165202067963 and parameters: {'learning_rate': 0.03643383833661827, 'num_leaves': 128, 'max_depth': 13, 'feature_fraction': 0.6531814884699494, 'bagging_fraction': 0.8716271979954714, 'bagging_freq': 4, 'min_data_in_leaf': 84, 'lambda_l1': 3.4062835874945825, 'lambda_l2': 1.7874702913827083}. Best is trial 21 with value: 0.9821931053312705.


Early stopping, best iteration is:
[175]	valid_0's auc: 0.981917
Training until validation scores don't improve for 20 rounds


[I 2025-06-16 13:40:05,193] Trial 23 finished with value: 0.9821404748816293 and parameters: {'learning_rate': 0.07071548249223611, 'num_leaves': 115, 'max_depth': 15, 'feature_fraction': 0.7252262801385408, 'bagging_fraction': 0.9422448745980904, 'bagging_freq': 4, 'min_data_in_leaf': 92, 'lambda_l1': 4.472783038988172, 'lambda_l2': 0.9227322752497279}. Best is trial 21 with value: 0.9821931053312705.


Early stopping, best iteration is:
[91]	valid_0's auc: 0.98214
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[167]	valid_0's auc: 0.982151


[I 2025-06-16 13:40:18,672] Trial 24 finished with value: 0.982150618354795 and parameters: {'learning_rate': 0.03799223973194453, 'num_leaves': 105, 'max_depth': 15, 'feature_fraction': 0.8030930799039216, 'bagging_fraction': 0.9447810157081973, 'bagging_freq': 3, 'min_data_in_leaf': 93, 'lambda_l1': 4.326332010876492, 'lambda_l2': 0.6396657034399956}. Best is trial 21 with value: 0.9821931053312705.


Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[185]	valid_0's auc: 0.982053


[I 2025-06-16 13:40:31,093] Trial 25 finished with value: 0.982052982264378 and parameters: {'learning_rate': 0.03496855995301919, 'num_leaves': 96, 'max_depth': 12, 'feature_fraction': 0.7965652868477087, 'bagging_fraction': 0.9617098769348824, 'bagging_freq': 3, 'min_data_in_leaf': 83, 'lambda_l1': 4.270230926544752, 'lambda_l2': 0.6858962674651341}. Best is trial 21 with value: 0.9821931053312705.


Training until validation scores don't improve for 20 rounds


[I 2025-06-16 13:40:40,615] Trial 26 finished with value: 0.9820149545624014 and parameters: {'learning_rate': 0.043125728856241205, 'num_leaves': 113, 'max_depth': 15, 'feature_fraction': 0.7888969219552971, 'bagging_fraction': 0.8199694075436176, 'bagging_freq': 2, 'min_data_in_leaf': 95, 'lambda_l1': 2.7607106996778215, 'lambda_l2': 1.9079997928032837}. Best is trial 21 with value: 0.9821931053312705.


Early stopping, best iteration is:
[97]	valid_0's auc: 0.982015
Training until validation scores don't improve for 20 rounds


[I 2025-06-16 13:40:50,174] Trial 27 finished with value: 0.9808949128059752 and parameters: {'learning_rate': 0.023938042123110605, 'num_leaves': 43, 'max_depth': 12, 'feature_fraction': 0.826589014889469, 'bagging_fraction': 0.8795328082874658, 'bagging_freq': 6, 'min_data_in_leaf': 81, 'lambda_l1': 4.906562978629591, 'lambda_l2': 1.4654134904414526}. Best is trial 21 with value: 0.9821931053312705.


Did not meet early stopping. Best iteration is:
[200]	valid_0's auc: 0.980895
Training until validation scores don't improve for 20 rounds


[I 2025-06-16 13:40:58,043] Trial 28 finished with value: 0.9820798136089323 and parameters: {'learning_rate': 0.10065188835482072, 'num_leaves': 91, 'max_depth': 13, 'feature_fraction': 0.9321457434267331, 'bagging_fraction': 0.9690578237315238, 'bagging_freq': 2, 'min_data_in_leaf': 78, 'lambda_l1': 3.74392153379715, 'lambda_l2': 2.255676162023393}. Best is trial 21 with value: 0.9821931053312705.


Early stopping, best iteration is:
[103]	valid_0's auc: 0.98208
Training until validation scores don't improve for 20 rounds


[I 2025-06-16 13:41:05,759] Trial 29 finished with value: 0.9812666291197589 and parameters: {'learning_rate': 0.05208865322098913, 'num_leaves': 106, 'max_depth': 11, 'feature_fraction': 0.7193254471863219, 'bagging_fraction': 0.6116015187728692, 'bagging_freq': 4, 'min_data_in_leaf': 95, 'lambda_l1': 4.098335408378678, 'lambda_l2': 0.4990906467340883}. Best is trial 21 with value: 0.9821931053312705.


Early stopping, best iteration is:
[99]	valid_0's auc: 0.981267
Best params: {'learning_rate': 0.06036389701688897, 'num_leaves': 108, 'max_depth': 12, 'feature_fraction': 0.7315057258291888, 'bagging_fraction': 0.9452917185973672, 'bagging_freq': 5, 'min_data_in_leaf': 86, 'lambda_l1': 3.9343696523899667, 'lambda_l2': 1.1977201179904}


In [13]:
# Use best params and train final model
best_params = study.best_params
best_params["objective"] = "binary"
best_params["metric"] = "auc"
best_params["verbosity"] = -1

dtrain = lgb.Dataset(X_train, label=y_train)
dval = lgb.Dataset(X_test, label=y_test)
booster = lgb.train(
    best_params,
    dtrain,
    valid_sets=[dval],
    num_boost_round=200,
    callbacks=[
        lgb.early_stopping(stopping_rounds=20),
        lgb.log_evaluation(20)
    ]
)


# Save the model
booster.save_model('final_lightgbm_tuned.txt')


Training until validation scores don't improve for 20 rounds
[20]	valid_0's auc: 0.97785
[40]	valid_0's auc: 0.979886
[60]	valid_0's auc: 0.981022
[80]	valid_0's auc: 0.981625
[100]	valid_0's auc: 0.981836
[120]	valid_0's auc: 0.982084
[140]	valid_0's auc: 0.982041
Early stopping, best iteration is:
[122]	valid_0's auc: 0.982093


<lightgbm.basic.Booster at 0x2052e62ca10>

In [14]:
pred_probs = booster.predict(X_test)
threshold = 0.5
preds = (pred_probs > threshold).astype(int)

print("AUC:", roc_auc_score(y_test, pred_probs))
print("Confusion Matrix:\n", confusion_matrix(y_test, preds))
print("Classification Report:\n", classification_report(y_test, preds))


AUC: 0.982093115734868
Confusion Matrix:
 [[15148    81]
 [  655  4116]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.98     15229
           1       0.98      0.86      0.92      4771

    accuracy                           0.96     20000
   macro avg       0.97      0.93      0.95     20000
weighted avg       0.96      0.96      0.96     20000



In [15]:
booster.save_model('final_lightgbm_tuned.txt')


<lightgbm.basic.Booster at 0x2052e62ca10>

In [16]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report

# 1. Load the test dataset
df = pd.read_parquet('D:/Canada/Subjects/Semester -2/AIDI-2005-02 CAPSTONE TERM ll/Bot_detector/data/cleaned/Friday-02-03-2018_TrafficForML_CICFlowMeter.parquet')  # Change path as needed

# 2. Encode the 'Label' column as 0 (Benign) and 1 (Attack)
df['Label'] = df['Label'].replace({'Benign': 0, 'Attack': 1}).astype(int)

# 3. Separate features and label
X_test = df.drop(columns=['Label'])
y_test = df['Label']

# 4. Optional: Keep only numeric columns
X_test = X_test.select_dtypes(include=np.number)

# 5. Load the pretrained LightGBM model
booster = lgb.Booster(model_file='final_lightgbm_tuned.txt')  # Change filename if different

# 6. Predict probabilities and classes
y_pred_probs = booster.predict(X_test)
y_pred = (y_pred_probs >= 0.5).astype(int)  # You can adjust threshold if needed

# 7. Evaluate the results
auc = roc_auc_score(y_test, y_pred_probs)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"AUC: {auc}")
print("Confusion Matrix:\n", cm)
print("Classification Report:\n", report)

# 8. (Optional) Save results to CSV for further analysis
results = X_test.copy()
results['Actual'] = y_test
results['Predicted'] = y_pred
results['Pred_Prob'] = y_pred_probs
results.to_csv('test_results.csv', index=False)


AUC: 0.9999134772008106
Confusion Matrix:
 [[762237    147]
 [   322 285869]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    762384
           1       1.00      1.00      1.00    286191

    accuracy                           1.00   1048575
   macro avg       1.00      1.00      1.00   1048575
weighted avg       1.00      1.00      1.00   1048575



In [20]:
import numpy as np
import pandas as pd
import lightgbm as lgb

# 1. Feature order (from your list)
feature_names = [
    'Dst Port', 'Protocol', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts',
    'Fwd Pkt Len Max', 'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Bwd Pkt Len Max', 'Bwd Pkt Len Min',
    'Bwd Pkt Len Mean', 'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean', 'Flow IAT Std',
    'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
    'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags',
    'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s', 'Bwd Pkts/s', 'Pkt Len Min',
    'Pkt Len Max', 'Pkt Len Mean', 'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt', 'RST Flag Cnt',
    'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt', 'CWE Flag Count', 'ECE Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg',
    'Fwd Seg Size Avg', 'Bwd Seg Size Avg', 'Fwd Byts/b Avg', 'Fwd Pkts/b Avg', 'Fwd Blk Rate Avg', 'Bwd Byts/b Avg',
    'Bwd Pkts/b Avg', 'Bwd Blk Rate Avg', 'Subflow Fwd Pkts', 'Subflow Fwd Byts', 'Subflow Bwd Pkts', 'Subflow Bwd Byts',
    'Init Fwd Win Byts', 'Init Bwd Win Byts', 'Fwd Act Data Pkts', 'Fwd Seg Size Min', 'Active Mean', 'Active Std',
    'Active Max', 'Active Min', 'Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min'
]

# 2. Example attack values (based on common attack patterns, fill the rest as zeros if unsure)
attack_values = [
    80,    # Dst Port (common HTTP port)
    6,     # Protocol (TCP)
    200000, # Flow Duration (long connection)
    500,   # Tot Fwd Pkts
    1200,  # Tot Bwd Pkts
    100000, # TotLen Fwd Pkts
    300000, # TotLen Bwd Pkts
    1500,  # Fwd Pkt Len Max
    0, 0, 0,  # ... Min, Mean, Std (defaults)
    2000, 0, 0, 0,  # Bwd Pkt
    1e6, 200, 5000, 500, 9000, 1000, 50000, 1000, 12000, 100, 250000, 5000, 9000, 100, 500, 0, 0, 0, 0, 40, 40,
    10, 2000, 1200, 300, 80000, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 100, 500, 300, 200, 0, 0, 0, 0, 0, 0, 500, 50000, 200, 80000, 1024, 2048, 0, 40, 100, 0, 0, 0, 0, 0, 0
]
# Fill missing values with zeros if <78
if len(attack_values) < 78:
    attack_values += [0] * (78 - len(attack_values))

# 3. Convert to DataFrame
attack_df = pd.DataFrame([attack_values], columns=feature_names)

# 4. Load your pre-trained model
booster = lgb.Booster(model_file='final_lightgbm_tuned.txt')  # Change filename if needed

# 5. Predict
prob = booster.predict(attack_df)[0]
pred = int(prob >= 0.5)

print(f"Predicted probability of attack: {prob:.4f}")
print(f"Predicted class: {'Attack' if pred == 1 else 'Benign'}")


Predicted probability of attack: 0.1843
Predicted class: Benign


In [21]:
import pandas as pd
import numpy as np

# Load your data (already encoded with 0/1 in 'Label')
df = pd.read_parquet('D:/Canada/Subjects/Semester -2/AIDI-2005-02 CAPSTONE TERM ll/Bot_detector/data/cleaned/Friday-02-03-2018_TrafficForML_CICFlowMeter.parquet')

# Select feature columns (remove 'Label')
feature_cols = [col for col in df.columns if col != 'Label']

# 1. Sample a Benign flow
benign_sample = df[df['Label'] == 0].sample(n=1, random_state=42)[feature_cols]

# 2. Sample an Attack flow
attack_sample = df[df['Label'] == 1].sample(n=1, random_state=24)[feature_cols]


In [22]:
# Predict probability and class for Benign
benign_prob = booster.predict(benign_sample)[0]
benign_class = int(benign_prob >= 0.5)
print(f"Benign sample: Predicted probability of attack = {benign_prob:.4f} | Predicted class: {'Attack' if benign_class else 'Benign'}")

# Predict probability and class for Attack
attack_prob = booster.predict(attack_sample)[0]
attack_class = int(attack_prob >= 0.5)
print(f"Attack sample: Predicted probability of attack = {attack_prob:.4f} | Predicted class: {'Attack' if attack_class else 'Benign'}")


Benign sample: Predicted probability of attack = 0.0088 | Predicted class: Benign
Attack sample: Predicted probability of attack = 0.9967 | Predicted class: Attack


In [26]:
import pandas as pd
import numpy as np
import lightgbm as lgb

# -------- 1. Load Pretrained Model --------
booster = lgb.Booster(model_file='final_lightgbm_tuned.txt')

# -------- 2. Load Data --------
# Use the same file you trained on, or another file in same format
df = pd.read_parquet('D:/Canada/Subjects/Semester -2/AIDI-2005-02 CAPSTONE TERM ll/Bot_detector/data/cleaned/Friday-02-03-2018_TrafficForML_CICFlowMeter.parquet')

# Make sure Label is encoded (for testing with real samples)
df['Label'] = df['Label'].replace({'Benign': 0, 'Attack': 1}).astype(int)

# Feature names as used for training
feature_names = [
    'Dst Port', 'Protocol', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts',
    'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
    'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean', 'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s',
    'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot', 'Fwd IAT Mean', 'Fwd IAT Std',
    'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min',
    'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len',
    'Fwd Pkts/s', 'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean', 'Pkt Len Std', 'Pkt Len Var',
    'FIN Flag Cnt', 'SYN Flag Cnt', 'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt',
    'CWE Flag Count', 'ECE Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg', 'Fwd Seg Size Avg', 'Bwd Seg Size Avg',
    'Fwd Byts/b Avg', 'Fwd Pkts/b Avg', 'Fwd Blk Rate Avg', 'Bwd Byts/b Avg', 'Bwd Pkts/b Avg', 'Bwd Blk Rate Avg',
    'Subflow Fwd Pkts', 'Subflow Fwd Byts', 'Subflow Bwd Pkts', 'Subflow Bwd Byts', 'Init Fwd Win Byts',
    'Init Bwd Win Byts', 'Fwd Act Data Pkts', 'Fwd Seg Size Min', 'Active Mean', 'Active Std', 'Active Max',
    'Active Min', 'Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min'
]

# -------- 3. Sample Real Benign & Attack --------

# Grab a real benign row
benign_row = df[df['Label'] == 0].iloc[0][feature_names].to_frame().T

# Grab a real attack row
attack_row = df[df['Label'] == 1].iloc[0][feature_names].to_frame().T

# -------- 4. Create Synthetic "Extreme Attack" --------

extreme_attack = pd.DataFrame([[
    8080,   # Dst Port (unusual)
    17,     # Protocol (UDP)
    800000, # Flow Duration
    5000,   # Tot Fwd Pkts
    9000,   # Tot Bwd Pkts
    2_000_000, # TotLen Fwd Pkts
    2_200_000, # TotLen Bwd Pkts
    *([1000] * (78 - 7))  # Make other features large (or adjust as you wish)
]], columns=feature_names)

# -------- 5. Predict --------

for name, sample in [('Benign (real)', benign_row),
                     ('Attack (real)', attack_row),
                     ('Attack (synthetic extreme)', extreme_attack)]:
    prob = booster.predict(sample)[0]
    pred = 'Attack' if prob >= 0.5 else 'Benign'
    print(f"{name} sample: Prob = {prob:.4f} | Predicted: {pred}")



Benign (real) sample: Prob = 0.1488 | Predicted: Benign
Attack (real) sample: Prob = 0.9980 | Predicted: Attack
Attack (synthetic extreme) sample: Prob = 0.0381 | Predicted: Benign


In [2]:
#!/usr/bin/env python3
"""
scripts/make_classifier_pkl.py

Loads your trained LightGBM model (final_lightgbm_tuned.txt),
wraps it in an sklearn‐compatible interface, and saves it
to app/models/classifier.pkl so your FastAPI and Kafka‐scorer
can load it via joblib.load().
"""

import os
import joblib
import numpy as np
import lightgbm as lgb

# 1) Load your LightGBM booster from disk
MODEL_FILE = "final_lightgbm_tuned.txt"
booster = lgb.Booster(model_file=MODEL_FILE)

# 2) Define the exact feature order you trained on:
feature_names = [
    'Dst Port', 'Protocol', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts',
    'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min',
    'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Bwd Pkt Len Max', 'Bwd Pkt Len Min',
    'Bwd Pkt Len Mean', 'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s',
    'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
    'Fwd IAT Tot', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
    'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min',
    'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags',
    'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s', 'Bwd Pkts/s',
    'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean', 'Pkt Len Std', 'Pkt Len Var',
    'FIN Flag Cnt', 'SYN Flag Cnt', 'RST Flag Cnt', 'PSH Flag Cnt',
    'ACK Flag Cnt', 'URG Flag Cnt', 'CWE Flag Count', 'ECE Flag Cnt',
    'Down/Up Ratio', 'Pkt Size Avg', 'Fwd Seg Size Avg', 'Bwd Seg Size Avg',
    'Fwd Byts/b Avg', 'Fwd Pkts/b Avg', 'Fwd Blk Rate Avg', 'Bwd Byts/b Avg',
    'Bwd Pkts/b Avg', 'Bwd Blk Rate Avg', 'Subflow Fwd Pkts', 'Subflow Fwd Byts',
    'Subflow Bwd Pkts', 'Subflow Bwd Byts', 'Init Fwd Win Byts', 'Init Bwd Win Byts',
    'Fwd Act Data Pkts', 'Fwd Seg Size Min', 'Active Mean', 'Active Std',
    'Active Max', 'Active Min', 'Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min'
]

# 3) Wrap in an sklearn‐compatible interface
class LGBMWrapper:
    def __init__(self, booster, feature_names):
        self.booster = booster
        self.feature_names = feature_names

    def predict_proba(self, X):
        """
        X: pandas.DataFrame or 2D numpy array.
        Returns an (n_samples, 2) array: [[P(class0), P(class1)], ...].
        """
        # If it's a DataFrame, reorder columns
        try:
            Xmat = X[self.feature_names].values.astype(np.float32)
        except Exception:
            # assume it's already a numpy array
            Xmat = np.asarray(X, dtype=np.float32)
        p1 = self.booster.predict(Xmat)
        p0 = 1.0 - p1
        return np.vstack([p0, p1]).T

    def predict(self, X, threshold=0.5):
        """
        Returns binary predictions (0 or 1) for class1 at given threshold.
        """
        proba = self.predict_proba(X)[:, 1]
        return (proba >= threshold).astype(int)

# 4) Instantiate and dump to classifier.pkl
clf = LGBMWrapper(booster, feature_names)

# Ensure output directory exists
os.makedirs("app/models", exist_ok=True)

# Save with joblib so FastAPI can load via joblib.load()
joblib.dump(clf, "app/models/classifier.pkl")
print("✅ Wrote app/models/classifier.pkl")


✅ Wrote app/models/classifier.pkl


In [2]:
import pandas as pd
import numpy as np

N = 2000  # number of synthetic logins

np.random.seed(42)
data = {
    "time_to_submit": np.concatenate([
        np.random.uniform(1.5, 12, N//2),     # humans: slower
        np.random.uniform(0.02, 0.6, N//2)    # bots: very fast
    ]),
    "user_agent": np.random.choice([
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/119.0.0.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) Firefox/118.0",
        "curl/8.5.0", "python-requests/2.28.1", "go-http-client/1.1"
    ], N),
    "login_hour": np.random.choice(range(24), N),
    "client_ip": np.random.choice([
        "203.0.113.1", "203.0.113.2", "45.77.1.3", "51.15.39.9", "192.0.2.1"
    ], N),
    "password_length": np.random.choice(range(6, 16), N),
    "failed_login_count_last_10min": np.concatenate([
        np.random.poisson(0.4, N//2),    # humans: low fails
        np.random.poisson(3.5, N//2)     # bots: many fails
    ]),
    "is_username_email": np.random.choice([0, 1], N),
    "label": np.concatenate([
        np.zeros(N//2),  # human
        np.ones(N//2)    # attack
    ])
}

df = pd.DataFrame(data)

# Optionally, shuffle the rows
df = df.sample(frac=1).reset_index(drop=True)

print(df.head())
df.to_csv("synthetic_login_data.csv", index=False)
print("\n✅ Saved to synthetic_login_data.csv")


   time_to_submit                                         user_agent  \
0        8.089360                                         curl/8.5.0   
1        0.220738                                         curl/8.5.0   
2        9.482393  Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chro...   
3        0.507608  Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chro...   
4        8.934762                                 go-http-client/1.1   

   login_hour    client_ip  password_length  failed_login_count_last_10min  \
0          19   51.15.39.9                9                              0   
1           8    45.77.1.3               14                              2   
2          16   51.15.39.9                6                              0   
3          12  203.0.113.2               14                              2   
4           5   51.15.39.9                7                              0   

   is_username_email  label  
0                  1    0.0  
1                  1    1.0  
2       