In [1]:
#!/usr/bin/env python3
"""
Script to generate a balanced synthetic dataset for the login-behavior model.
Creates ~400,000 rows (200,000 benign, 200,000 attack) with 7 behavioral features
and a Label column (0=Benign, 1=Attack). Saves to balanced_login_data.csv.
"""

import pandas as pd
import numpy as np

# Seed NumPy's global RNG so every run regenerates the same dataset
np.random.seed(42)

# Configuration
N_PER_CLASS = 200_000  # rows per class: 200k benign + 200k attack
OUTPUT_PATH = "balanced_login_data.csv"

# Feature distributions
# User-agent pool. Order matters: the per-class probability vectors handed to
# np.random.choice are positional (3 browser entries first, then 5 tool entries).
HUMAN_USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:118.0) Gecko/20100101 Firefox/118.0",
]
BOT_USER_AGENTS = [
    "curl/8.5.0",
    "python-requests/2.28.1",
    "go-http-client/1.1",
    "Wget/1.20.3",
    "Scrapy/2.7.0",
]
USER_AGENTS = HUMAN_USER_AGENTS + BOT_USER_AGENTS

# Client IP pool; sampled uniformly for both classes
IP_POOL = [
    "203.0.113.1",
    "203.0.113.2",
    "45.77.1.3",
    "51.15.39.9",
    "192.0.2.1",
    # Additional IPs for diversity
    "192.168.1.100",
    "10.0.0.50",
    "172.16.0.10",
]

# Generate data
# NOTE: every draw below consumes the global NumPy RNG, so the statements are kept
# in a fixed order; reordering them would change the generated values.
n_total = 2 * N_PER_CLASS

# time_to_submit (seconds): uniform base plus Gaussian jitter.
# Benign is human-paced (1.5-12s); attack is bot-paced (0.02-0.6s).
benign_submit_times = np.random.uniform(1.5, 12, N_PER_CLASS) + np.random.normal(0, 0.2, N_PER_CLASS)
attack_submit_times = np.random.uniform(0.02, 0.6, N_PER_CLASS) + np.random.normal(0, 0.05, N_PER_CLASS)

# user_agent: weights are positional over USER_AGENTS (3 browsers, then 5 tools)
benign_agent_weights = [0.3, 0.3, 0.2, 0.05, 0.05, 0.05, 0.025, 0.025]  # 80% browser / 20% tool
attack_agent_weights = [0.05, 0.05, 0.1, 0.3, 0.2, 0.2, 0.05, 0.05]  # 20% browser / 80% tool
benign_agents = np.random.choice(USER_AGENTS, N_PER_CLASS, p=benign_agent_weights)
attack_agents = np.random.choice(USER_AGENTS, N_PER_CLASS, p=attack_agent_weights)

# Class-independent features: drawn uniformly for all rows at once
login_hours = np.random.choice(range(24), n_total)
client_ips = np.random.choice(IP_POOL, n_total)
password_lengths = np.random.choice(range(6, 16), n_total)  # 6-15 inclusive

# failed_login_count_last_10min: Poisson with mean 0.4 (benign) vs 3.5 (attack)
benign_fail_counts = np.random.poisson(0.4, N_PER_CLASS)
attack_fail_counts = np.random.poisson(3.5, N_PER_CLASS)

username_is_email = np.random.choice([0, 1], n_total)

# Class-dependent columns place the benign half first, matching the label column
data = {
    "time_to_submit": np.concatenate([benign_submit_times, attack_submit_times]),
    "user_agent": np.concatenate([benign_agents, attack_agents]),
    "login_hour": login_hours,
    "client_ip": client_ips,
    "password_length": password_lengths,
    "failed_login_count_last_10min": np.concatenate([benign_fail_counts, attack_fail_counts]),
    "is_username_email": username_is_email,
    "label": np.concatenate([
        np.zeros(N_PER_CLASS, dtype=int),  # 0 = benign
        np.ones(N_PER_CLASS, dtype=int),  # 1 = attack
    ]),
}

# Build the frame, floor submit times at 0.01s (the Gaussian jitter can push
# the fastest attack rows below zero), then shuffle with a fixed seed.
df = (
    pd.DataFrame(data)
    .assign(time_to_submit=lambda frame: frame["time_to_submit"].clip(lower=0.01))
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Validate
label_counts = df["label"].value_counts()
feature_summary = df.describe(include="all")
missing_per_column = df.isnull().sum()
print("Dataset Shape:", df.shape)
print("\nLabel Counts:\n", label_counts)
print("\nFeature Summary:\n", feature_summary)
print("\nMissing Values:\n", missing_per_column)

# Persist without the row index
df.to_csv(OUTPUT_PATH, index=False)
print(f"\n✅ Saved to {OUTPUT_PATH}")

Dataset Shape: (400000, 8)

Label Counts:
 label
0    200000
1    200000
Name: count, dtype: int64

Feature Summary:
         time_to_submit                                         user_agent  \
count    400000.000000                                             400000   
unique             NaN                                                  8   
top                NaN  Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...   
freq               NaN                                              70339   
mean          3.532361                                                NaN   
std           3.872805                                                NaN   
min           0.010000                                                NaN   
25%           0.309767                                                NaN   
50%           0.838234                                                NaN   
75%           6.761922                                                NaN   
max          12.683748             

In [2]:
#!/usr/bin/env python3
"""
Script to train a LightGBM model on balanced_login_data.csv for login-behavior classification.
Encodes categorical features, splits data (90% train, 10% validation), tunes hyperparameters
with Optuna, and saves the model as app/models/login_classifier.pkl.
"""

import pandas as pd
import numpy as np
import lightgbm as lgb
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
import optuna

# Configuration
DATA_PATH = "balanced_login_data.csv"
MODEL_PATH = "app/models/login_classifier.pkl"
VALIDATION_SIZE = 0.1  # fraction of rows held out for validation
N_TRIALS = 30  # Optuna trial budget for hyperparameter tuning
RANDOM_SEED = 42

# Seed NumPy's global RNG
np.random.seed(RANDOM_SEED)

# 1. Load and preprocess data
df = pd.read_csv(DATA_PATH)

# Integer-encode the two string columns. The fitted encoders are kept as
# module-level names because the saved wrapper needs them at inference time.
le_user_agent, le_client_ip = LabelEncoder(), LabelEncoder()
for column_name, encoder in (("user_agent", le_user_agent), ("client_ip", le_client_ip)):
    df[column_name] = encoder.fit_transform(df[column_name])

# Model inputs, in the column order the booster is trained on
feature_cols = [
    "time_to_submit",
    "user_agent",
    "login_hour",
    "client_ip",
    "password_length",
    "failed_login_count_last_10min",
    "is_username_email",
]
X, y = df[feature_cols], df["label"]

# 90% train / 10% validation, stratified on the label
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=VALIDATION_SIZE,
    random_state=RANDOM_SEED,
    stratify=y,
)

# 2. Define Optuna objective for hyperparameter tuning
def objective(trial):
    """Optuna objective: train one LightGBM booster and score it on the validation split.

    Hyperparameters are sampled from ``trial``; training runs for at most 200
    rounds with early stopping (patience 20) monitored on the validation set.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        ROC AUC of the trained booster's predictions on ``X_val``.

    Note:
        The same validation split drives early stopping, trial selection and the
        metrics reported later in this cell, so those metrics are optimistic.
    """
    # Sampled values; the suggest calls are kept in a fixed order
    searched = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "num_leaves": trial.suggest_int("num_leaves", 16, 128),
        "max_depth": trial.suggest_int("max_depth", 4, 16),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20, 100),
        "lambda_l1": trial.suggest_float("lambda_l1", 0.0, 5.0),
        "lambda_l2": trial.suggest_float("lambda_l2", 0.0, 5.0),
    }
    params = {
        "objective": "binary",
        "metric": "auc",
        "boosting_type": "gbdt",
        "verbosity": -1,
        **searched,
        "scale_pos_weight": 1.0,  # classes are balanced, so no reweighting
    }

    train_set = lgb.Dataset(X_train, label=y_train)
    val_set = lgb.Dataset(X_val, label=y_val)
    stop_early = lgb.early_stopping(stopping_rounds=20)
    silence_eval_log = lgb.log_evaluation(0)
    trial_booster = lgb.train(
        params,
        train_set,
        valid_sets=[val_set],
        num_boost_round=200,
        callbacks=[stop_early, silence_eval_log],
    )

    return roc_auc_score(y_val, trial_booster.predict(X_val))

# 3. Run Optuna optimization
# Seed the sampler explicitly: Optuna's default sampler draws from its own RNG,
# which np.random.seed() does not control, so an unseeded study proposes
# different hyperparameters on every run.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED),
)
study.optimize(objective, n_trials=N_TRIALS)
print("Best params:", study.best_params)

# 4. Train final model with best parameters
# Start from the tuned values and re-attach the fixed settings, which Optuna
# does not record because they were never sampled.
best_params = {
    **study.best_params,
    "objective": "binary",
    "metric": "auc",
    "verbosity": -1,
}
dtrain = lgb.Dataset(X_train, label=y_train)
dval = lgb.Dataset(X_val, label=y_val)
final_callbacks = [
    lgb.early_stopping(stopping_rounds=20),  # stop after 20 rounds without validation gain
    lgb.log_evaluation(20),  # log the validation metric every 20 rounds
]
booster = lgb.train(
    best_params,
    dtrain,
    valid_sets=[dval],
    num_boost_round=200,
    callbacks=final_callbacks,
)

# 5. Evaluate on validation set with multiple thresholds
pred_probs = booster.predict(X_val)
thresholds = [0.5, 0.4, 0.3]
# AUC is computed from the raw scores, so it is the same for every threshold
val_auc = roc_auc_score(y_val, pred_probs)
for threshold in thresholds:
    preds = (pred_probs >= threshold).astype(int)
    cm = confusion_matrix(y_val, preds)
    report = classification_report(y_val, preds)
    print(f"\n=== Results for threshold={threshold} ===")
    print("AUC:", val_auc)
    print("Confusion Matrix:\n", cm)
    print("Classification Report:\n", report)

# 6. Feature importance
# Gain-based importance per input column, largest first
gain_by_feature = booster.feature_importance(importance_type="gain")
importance = pd.DataFrame(
    {"Feature": feature_cols, "Importance": gain_by_feature}
).sort_values("Importance", ascending=False)
print("\nFeature Importance:\n", importance)

# 7. Wrap model in sklearn-compatible interface
class LGBMWrapper:
    """Scikit-learn-style facade around a trained LightGBM booster.

    Bundles the booster with the label encoders fitted at training time so that
    callers can pass raw ``user_agent`` / ``client_ip`` strings in a DataFrame.

    Attributes are set in ``__init__``:
        booster: Object exposing ``predict(X)`` that returns the positive-class
            probability for each row.
        feature_names: Column names, in the order the booster expects them.
        le_user_agent: Fitted encoder (exposes ``classes_``) for ``user_agent``.
        le_client_ip: Fitted encoder (exposes ``classes_``) for ``client_ip``.
    """

    # Code substituted for a category that is absent from an encoder's classes_.
    # The booster never saw this value during training, so scores for such rows
    # are a best-effort fallback rather than a calibrated prediction.
    UNKNOWN_CATEGORY_CODE = -1

    def __init__(self, booster, feature_names, le_user_agent, le_client_ip):
        self.booster = booster
        self.feature_names = feature_names
        self.le_user_agent = le_user_agent
        self.le_client_ip = le_client_ip

    def _encode_column(self, column, encoder):
        """Map a categorical column to the integer codes used at training time.

        Unlike ``encoder.transform``, this does not raise on categories that were
        not present when the encoder was fitted (for example a new browser
        version); they are mapped to ``UNKNOWN_CATEGORY_CODE`` instead. A column
        that is already numeric is assumed to be pre-encoded and returned as-is.

        Args:
            column: pandas Series holding the raw (or pre-encoded) values.
            encoder: Fitted encoder whose ``classes_`` lists the known labels in
                code order.

        Returns:
            pandas Series of numeric codes aligned with ``column``'s index.
        """
        if pd.api.types.is_numeric_dtype(column):
            return column
        known_labels = np.asarray(encoder.classes_).tolist()
        code_by_label = {label: code for code, label in enumerate(known_labels)}
        # Series.map yields NaN for labels missing from the dict
        return column.map(code_by_label).fillna(self.UNKNOWN_CATEGORY_CODE)

    def predict_proba(self, X):
        """Return class probabilities as an ``(n_rows, 2)`` array.

        Args:
            X: Either a DataFrame containing every name in ``feature_names``
                (categorical columns may hold raw strings), or an array-like of
                already-encoded numeric features in ``feature_names`` order.

        Returns:
            Array whose column 0 is P(label=0) and column 1 is P(label=1).
        """
        if isinstance(X, pd.DataFrame):
            X = X.copy()  # never mutate the caller's frame
            categorical = (
                ("user_agent", self.le_user_agent),
                ("client_ip", self.le_client_ip),
            )
            for name, encoder in categorical:
                if name in X.columns:
                    X[name] = self._encode_column(X[name], encoder)
            X = X[self.feature_names].values.astype(np.float32)
        else:
            X = np.asarray(X, dtype=np.float32)
        p1 = self.booster.predict(X)
        p0 = 1.0 - p1
        return np.vstack([p0, p1]).T

    def predict(self, X, threshold=0.5):
        """Return hard 0/1 labels: 1 where P(label=1) >= ``threshold``."""
        proba = self.predict_proba(X)[:, 1]
        return (proba >= threshold).astype(int)

# 8. Save model
clf = LGBMWrapper(booster, feature_cols, le_user_agent, le_client_ip)
# Derive the output directory from MODEL_PATH so the two cannot drift apart
model_dir = os.path.dirname(MODEL_PATH)
if model_dir:  # empty when MODEL_PATH is a bare file name
    os.makedirs(model_dir, exist_ok=True)
# NOTE: pickle stores LGBMWrapper by reference to its defining module, so the
# class must be defined or importable under the same module path wherever this
# file is loaded.
joblib.dump(clf, MODEL_PATH)
print(f"\n✅ Saved model to {MODEL_PATH}")

  from .autonotebook import tqdm as notebook_tqdm
[I 2025-07-17 11:16:46,810] A new study created in memory with name: no-name-3bfd8343-d857-458f-b2c9-00548d2d89b3


Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[38]	valid_0's auc: 1


[I 2025-07-17 11:16:49,712] Trial 0 finished with value: 0.9999999075 and parameters: {'learning_rate': 0.16755889233749574, 'num_leaves': 16, 'max_depth': 13, 'feature_fraction': 0.6972529424542786, 'bagging_fraction': 0.8018172797108292, 'bagging_freq': 4, 'min_data_in_leaf': 33, 'lambda_l1': 2.480643012181809, 'lambda_l2': 0.8682096696093994}. Best is trial 0 with value: 0.9999999075.


Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[96]	valid_0's auc: 1


[I 2025-07-17 11:16:54,237] Trial 1 finished with value: 0.999999905 and parameters: {'learning_rate': 0.08197881344662493, 'num_leaves': 40, 'max_depth': 7, 'feature_fraction': 0.6130251396579516, 'bagging_fraction': 0.5480146493991442, 'bagging_freq': 9, 'min_data_in_leaf': 89, 'lambda_l1': 0.16368394435161027, 'lambda_l2': 3.7726843362704776}. Best is trial 0 with value: 0.9999999075.


Training until validation scores don't improve for 20 rounds


[I 2025-07-17 11:16:58,022] Trial 2 finished with value: 0.99999990625 and parameters: {'learning_rate': 0.09066279879422103, 'num_leaves': 33, 'max_depth': 9, 'feature_fraction': 0.7262019355907539, 'bagging_fraction': 0.9708346908610962, 'bagging_freq': 3, 'min_data_in_leaf': 41, 'lambda_l1': 1.8037298305126714, 'lambda_l2': 3.392453239781948}. Best is trial 0 with value: 0.9999999075.


Early stopping, best iteration is:
[54]	valid_0's auc: 1
Training until validation scores don't improve for 20 rounds


[I 2025-07-17 11:16:58,393] Trial 3 finished with value: 0.9999999150000001 and parameters: {'learning_rate': 0.15263270352030653, 'num_leaves': 74, 'max_depth': 16, 'feature_fraction': 0.9018258180235299, 'bagging_fraction': 0.7175954528853183, 'bagging_freq': 2, 'min_data_in_leaf': 36, 'lambda_l1': 0.6563915189783964, 'lambda_l2': 1.8745909300963608}. Best is trial 3 with value: 0.9999999150000001.


Early stopping, best iteration is:
[33]	valid_0's auc: 1
Training until validation scores don't improve for 20 rounds


[I 2025-07-17 11:16:58,613] Trial 4 finished with value: 0.99999932625 and parameters: {'learning_rate': 0.04958469001009245, 'num_leaves': 128, 'max_depth': 4, 'feature_fraction': 0.8550814857721271, 'bagging_fraction': 0.7511257639832516, 'bagging_freq': 10, 'min_data_in_leaf': 44, 'lambda_l1': 1.1003250067417754, 'lambda_l2': 1.5423139993417834}. Best is trial 3 with value: 0.9999999150000001.


Early stopping, best iteration is:
[1]	valid_0's auc: 0.999999
Training until validation scores don't improve for 20 rounds


[I 2025-07-17 11:16:59,172] Trial 5 finished with value: 0.9999999099999999 and parameters: {'learning_rate': 0.10543667453741175, 'num_leaves': 121, 'max_depth': 9, 'feature_fraction': 0.5229651974833216, 'bagging_fraction': 0.5278479412700054, 'bagging_freq': 3, 'min_data_in_leaf': 89, 'lambda_l1': 0.8625513423973569, 'lambda_l2': 4.257377760581925}. Best is trial 3 with value: 0.9999999150000001.


Early stopping, best iteration is:
[70]	valid_0's auc: 1
Training until validation scores don't improve for 20 rounds


[I 2025-07-17 11:16:59,549] Trial 6 finished with value: 0.99999991375 and parameters: {'learning_rate': 0.14445712017752, 'num_leaves': 68, 'max_depth': 14, 'feature_fraction': 0.7360245903358317, 'bagging_fraction': 0.9301576806584424, 'bagging_freq': 9, 'min_data_in_leaf': 65, 'lambda_l1': 1.6559404564169662, 'lambda_l2': 1.723235100560558}. Best is trial 3 with value: 0.9999999150000001.
[I 2025-07-17 11:16:59,711] Trial 7 finished with value: 0.99999932625 and parameters: {'learning_rate': 0.013224869005018763, 'num_leaves': 32, 'max_depth': 9, 'feature_fraction': 0.9801073871641772, 'bagging_fraction': 0.7268806751454806, 'bagging_freq': 7, 'min_data_in_leaf': 44, 'lambda_l1': 2.0356859978780046, 'lambda_l2': 2.4221343946487393}. Best is trial 3 with value: 0.9999999150000001.


Early stopping, best iteration is:
[42]	valid_0's auc: 1
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.999999


[I 2025-07-17 11:16:59,991] Trial 8 finished with value: 0.99999727 and parameters: {'learning_rate': 0.015430008066442754, 'num_leaves': 119, 'max_depth': 5, 'feature_fraction': 0.5797174463093162, 'bagging_fraction': 0.6006899416906202, 'bagging_freq': 3, 'min_data_in_leaf': 56, 'lambda_l1': 3.343341247877305, 'lambda_l2': 2.0586725429188006}. Best is trial 3 with value: 0.9999999150000001.


Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[22]	valid_0's auc: 0.999997
Training until validation scores don't improve for 20 rounds


[I 2025-07-17 11:17:00,364] Trial 9 finished with value: 0.9999998925 and parameters: {'learning_rate': 0.07673401052256222, 'num_leaves': 91, 'max_depth': 4, 'feature_fraction': 0.9660461339951667, 'bagging_fraction': 0.9162145087953975, 'bagging_freq': 7, 'min_data_in_leaf': 57, 'lambda_l1': 0.3531576041708029, 'lambda_l2': 1.816671988543856}. Best is trial 3 with value: 0.9999999150000001.


Early stopping, best iteration is:
[60]	valid_0's auc: 1
Training until validation scores don't improve for 20 rounds


[I 2025-07-17 11:17:00,685] Trial 10 finished with value: 0.9999999075 and parameters: {'learning_rate': 0.19910771680586864, 'num_leaves': 75, 'max_depth': 16, 'feature_fraction': 0.861254511492092, 'bagging_fraction': 0.6611656136058659, 'bagging_freq': 1, 'min_data_in_leaf': 22, 'lambda_l1': 4.9585937701926, 'lambda_l2': 0.022150941694726534}. Best is trial 3 with value: 0.9999999150000001.


Early stopping, best iteration is:
[37]	valid_0's auc: 1
Training until validation scores don't improve for 20 rounds


[I 2025-07-17 11:17:01,038] Trial 11 finished with value: 0.99999991375 and parameters: {'learning_rate': 0.14023652777608098, 'num_leaves': 63, 'max_depth': 16, 'feature_fraction': 0.829205573063196, 'bagging_fraction': 0.846885641478154, 'bagging_freq': 6, 'min_data_in_leaf': 72, 'lambda_l1': 1.357168581650706, 'lambda_l2': 3.0023235135848862}. Best is trial 3 with value: 0.9999999150000001.


Early stopping, best iteration is:
[31]	valid_0's auc: 1
Training until validation scores don't improve for 20 rounds


[I 2025-07-17 11:17:01,390] Trial 12 finished with value: 0.99999990625 and parameters: {'learning_rate': 0.13723884418934845, 'num_leaves': 88, 'max_depth': 13, 'feature_fraction': 0.7780850803478094, 'bagging_fraction': 0.9030591244815062, 'bagging_freq': 1, 'min_data_in_leaf': 70, 'lambda_l1': 3.339668217471817, 'lambda_l2': 0.9238290171730787}. Best is trial 3 with value: 0.9999999150000001.


Early stopping, best iteration is:
[42]	valid_0's auc: 1
Training until validation scores don't improve for 20 rounds


[I 2025-07-17 11:17:01,781] Trial 13 finished with value: 0.9999999125000001 and parameters: {'learning_rate': 0.1410644279451928, 'num_leaves': 59, 'max_depth': 13, 'feature_fraction': 0.9052026842543608, 'bagging_fraction': 0.9990064703920745, 'bagging_freq': 8, 'min_data_in_leaf': 73, 'lambda_l1': 3.1294197511617083, 'lambda_l2': 1.3338311515412768}. Best is trial 3 with value: 0.9999999150000001.


Early stopping, best iteration is:
[46]	valid_0's auc: 1
Training until validation scores don't improve for 20 rounds


[I 2025-07-17 11:17:02,153] Trial 14 finished with value: 0.9999999075 and parameters: {'learning_rate': 0.176944914912552, 'num_leaves': 99, 'max_depth': 14, 'feature_fraction': 0.6570930475399449, 'bagging_fraction': 0.6820257944434062, 'bagging_freq': 5, 'min_data_in_leaf': 20, 'lambda_l1': 0.575905654921562, 'lambda_l2': 2.600223817578467}. Best is trial 3 with value: 0.9999999150000001.


Early stopping, best iteration is:
[31]	valid_0's auc: 1
Training until validation scores don't improve for 20 rounds


[I 2025-07-17 11:17:02,494] Trial 15 finished with value: 0.9999999125000001 and parameters: {'learning_rate': 0.11878502363819958, 'num_leaves': 55, 'max_depth': 11, 'feature_fraction': 0.7980412239546281, 'bagging_fraction': 0.8075661327847049, 'bagging_freq': 10, 'min_data_in_leaf': 59, 'lambda_l1': 1.4863760226868634, 'lambda_l2': 0.23936161739109707}. Best is trial 3 with value: 0.9999999150000001.


Early stopping, best iteration is:
[35]	valid_0's auc: 1
Training until validation scores don't improve for 20 rounds


[I 2025-07-17 11:17:02,933] Trial 16 finished with value: 0.9999999199999999 and parameters: {'learning_rate': 0.16296242250101053, 'num_leaves': 75, 'max_depth': 15, 'feature_fraction': 0.9199843192078083, 'bagging_fraction': 0.6271157784617101, 'bagging_freq': 5, 'min_data_in_leaf': 34, 'lambda_l1': 0.0009722459428442853, 'lambda_l2': 4.9156352459893995}. Best is trial 16 with value: 0.9999999199999999.


Early stopping, best iteration is:
[40]	valid_0's auc: 1
Training until validation scores don't improve for 20 rounds


[I 2025-07-17 11:17:03,350] Trial 17 finished with value: 0.99999991625 and parameters: {'learning_rate': 0.17113699148596795, 'num_leaves': 78, 'max_depth': 16, 'feature_fraction': 0.9173037319605992, 'bagging_fraction': 0.6192764826962867, 'bagging_freq': 2, 'min_data_in_leaf': 31, 'lambda_l1': 0.16309550287967747, 'lambda_l2': 4.912349372326578}. Best is trial 16 with value: 0.9999999199999999.


Early stopping, best iteration is:
[44]	valid_0's auc: 1
Training until validation scores don't improve for 20 rounds


[I 2025-07-17 11:17:03,779] Trial 18 finished with value: 0.9999999150000001 and parameters: {'learning_rate': 0.19892712226105108, 'num_leaves': 105, 'max_depth': 15, 'feature_fraction': 0.9064371285593952, 'bagging_fraction': 0.6117001416192767, 'bagging_freq': 5, 'min_data_in_leaf': 28, 'lambda_l1': 0.12660964238765882, 'lambda_l2': 4.775969748560408}. Best is trial 16 with value: 0.9999999199999999.


Early stopping, best iteration is:
[40]	valid_0's auc: 1
Training until validation scores don't improve for 20 rounds


[I 2025-07-17 11:17:04,091] Trial 19 finished with value: 0.99999991 and parameters: {'learning_rate': 0.17470887006116242, 'num_leaves': 83, 'max_depth': 11, 'feature_fraction': 0.992457888518927, 'bagging_fraction': 0.5971885342846321, 'bagging_freq': 4, 'min_data_in_leaf': 50, 'lambda_l1': 4.404039051987892, 'lambda_l2': 4.775629855516517}. Best is trial 16 with value: 0.9999999199999999.


Early stopping, best iteration is:
[32]	valid_0's auc: 1
Training until validation scores don't improve for 20 rounds


[I 2025-07-17 11:17:04,404] Trial 20 finished with value: 0.99999989125 and parameters: {'learning_rate': 0.12032987718826513, 'num_leaves': 47, 'max_depth': 11, 'feature_fraction': 0.9559282917518686, 'bagging_fraction': 0.50655646801848, 'bagging_freq': 2, 'min_data_in_leaf': 100, 'lambda_l1': 0.026686299884650606, 'lambda_l2': 4.015999400410618}. Best is trial 16 with value: 0.9999999199999999.


Early stopping, best iteration is:
[32]	valid_0's auc: 1
Training until validation scores don't improve for 20 rounds


[I 2025-07-17 11:17:04,770] Trial 21 finished with value: 0.9999999125 and parameters: {'learning_rate': 0.15950675312563262, 'num_leaves': 76, 'max_depth': 16, 'feature_fraction': 0.9100440991458832, 'bagging_fraction': 0.6749893852573191, 'bagging_freq': 2, 'min_data_in_leaf': 34, 'lambda_l1': 0.73192594892487, 'lambda_l2': 4.8937533564501505}. Best is trial 16 with value: 0.9999999199999999.


Early stopping, best iteration is:
[31]	valid_0's auc: 1
Training until validation scores don't improve for 20 rounds


[I 2025-07-17 11:17:05,102] Trial 22 finished with value: 0.99999990625 and parameters: {'learning_rate': 0.18493841530790703, 'num_leaves': 79, 'max_depth': 15, 'feature_fraction': 0.9164304704704612, 'bagging_fraction': 0.7184038064008764, 'bagging_freq': 2, 'min_data_in_leaf': 32, 'lambda_l1': 0.98895944673971, 'lambda_l2': 4.356933535314379}. Best is trial 16 with value: 0.9999999199999999.


Early stopping, best iteration is:
[24]	valid_0's auc: 1
Training until validation scores don't improve for 20 rounds


[I 2025-07-17 11:17:05,490] Trial 23 finished with value: 0.99999991875 and parameters: {'learning_rate': 0.15593815447923215, 'num_leaves': 101, 'max_depth': 15, 'feature_fraction': 0.8613776250214502, 'bagging_fraction': 0.6451788562292836, 'bagging_freq': 4, 'min_data_in_leaf': 39, 'lambda_l1': 0.5320485604380685, 'lambda_l2': 3.339353630922549}. Best is trial 16 with value: 0.9999999199999999.


Early stopping, best iteration is:
[45]	valid_0's auc: 1
Training until validation scores don't improve for 20 rounds


[I 2025-07-17 11:17:05,798] Trial 24 finished with value: 0.99999991 and parameters: {'learning_rate': 0.12530987674674804, 'num_leaves': 105, 'max_depth': 14, 'feature_fraction': 0.8353415872573752, 'bagging_fraction': 0.6303867084899133, 'bagging_freq': 4, 'min_data_in_leaf': 28, 'lambda_l1': 0.3915306523357245, 'lambda_l2': 3.311696238447647}. Best is trial 16 with value: 0.9999999199999999.


Early stopping, best iteration is:
[30]	valid_0's auc: 1
Training until validation scores don't improve for 20 rounds


[I 2025-07-17 11:17:06,144] Trial 25 finished with value: 0.9999999175000001 and parameters: {'learning_rate': 0.16023793957116927, 'num_leaves': 99, 'max_depth': 12, 'feature_fraction': 0.9440470880800094, 'bagging_fraction': 0.5689443451593756, 'bagging_freq': 6, 'min_data_in_leaf': 41, 'lambda_l1': 2.236507965874025, 'lambda_l2': 4.352664725552126}. Best is trial 16 with value: 0.9999999199999999.


Early stopping, best iteration is:
[47]	valid_0's auc: 1
Training until validation scores don't improve for 20 rounds


[I 2025-07-17 11:17:06,490] Trial 26 finished with value: 0.99999991625 and parameters: {'learning_rate': 0.15636699907341706, 'num_leaves': 96, 'max_depth': 12, 'feature_fraction': 0.8725270989345837, 'bagging_fraction': 0.5583009645067586, 'bagging_freq': 6, 'min_data_in_leaf': 48, 'lambda_l1': 2.520541126048929, 'lambda_l2': 3.647427238080839}. Best is trial 16 with value: 0.9999999199999999.


Early stopping, best iteration is:
[45]	valid_0's auc: 1
Training until validation scores don't improve for 20 rounds


[I 2025-07-17 11:17:06,799] Trial 27 finished with value: 0.99999991125 and parameters: {'learning_rate': 0.18581370536920896, 'num_leaves': 107, 'max_depth': 12, 'feature_fraction': 0.945422112972932, 'bagging_fraction': 0.5694345593500283, 'bagging_freq': 5, 'min_data_in_leaf': 51, 'lambda_l1': 3.949250025848094, 'lambda_l2': 4.405006327228879}. Best is trial 16 with value: 0.9999999199999999.


Early stopping, best iteration is:
[42]	valid_0's auc: 1
Training until validation scores don't improve for 20 rounds


[I 2025-07-17 11:17:07,023] Trial 28 finished with value: 0.99999932625 and parameters: {'learning_rate': 0.10459763514163894, 'num_leaves': 115, 'max_depth': 15, 'feature_fraction': 0.7954140887680629, 'bagging_fraction': 0.6472013778639256, 'bagging_freq': 7, 'min_data_in_leaf': 38, 'lambda_l1': 2.301260470527981, 'lambda_l2': 3.0059507146883675}. Best is trial 16 with value: 0.9999999199999999.


Early stopping, best iteration is:
[1]	valid_0's auc: 0.999999
Training until validation scores don't improve for 20 rounds


[I 2025-07-17 11:17:07,293] Trial 29 finished with value: 0.99999991 and parameters: {'learning_rate': 0.16398881909552104, 'num_leaves': 89, 'max_depth': 13, 'feature_fraction': 0.9971661139039512, 'bagging_fraction': 0.7768452992905411, 'bagging_freq': 6, 'min_data_in_leaf': 27, 'lambda_l1': 2.4126469095911363, 'lambda_l2': 4.028260777047949}. Best is trial 16 with value: 0.9999999199999999.


Early stopping, best iteration is:
[26]	valid_0's auc: 1
Best params: {'learning_rate': 0.16296242250101053, 'num_leaves': 75, 'max_depth': 15, 'feature_fraction': 0.9199843192078083, 'bagging_fraction': 0.6271157784617101, 'bagging_freq': 5, 'min_data_in_leaf': 34, 'lambda_l1': 0.0009722459428442853, 'lambda_l2': 4.9156352459893995}
Training until validation scores don't improve for 20 rounds
[20]	valid_0's auc: 1
[40]	valid_0's auc: 1
[60]	valid_0's auc: 1
Early stopping, best iteration is:
[40]	valid_0's auc: 1

=== Results for threshold=0.5 ===
AUC: 0.9999999199999999
Confusion Matrix:
 [[19996     4]
 [    5 19995]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     20000
           1       1.00      1.00      1.00     20000

    accuracy                           1.00     40000
   macro avg       1.00      1.00      1.00     40000
weighted avg       1.00      1.00      1.00     40000


=== Results for thr

In [1]:
#!/usr/bin/env python3
"""
Script to train a slim LightGBM model on balanced_login_data.csv using the top 3 features
(time_to_submit, failed_login_count_last_10min, user_agent). Saves the model as
app/models/slim_login_classifier.pkl.
"""

import pandas as pd
import numpy as np
import lightgbm as lgb
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

# Configuration
DATA_PATH = "balanced_login_data.csv"
MODEL_PATH = "app/models/slim_login_classifier.pkl"
VALIDATION_SIZE = 0.1  # fraction of rows held out for validation
RANDOM_SEED = 42
# Hyperparameters are pinned here rather than re-tuned in this cell
BEST_PARAMS = {
    # tuned values
    "learning_rate": 0.16296242250101053,
    "num_leaves": 75,
    "max_depth": 15,
    "feature_fraction": 0.9199843192078083,
    "bagging_fraction": 0.6271157784617101,
    "bagging_freq": 5,
    "min_data_in_leaf": 34,
    "lambda_l1": 0.0009722459428442853,
    "lambda_l2": 4.9156352459893995,
    # fixed settings
    "objective": "binary",
    "metric": "auc",
    "verbosity": -1,
}

# Seed NumPy's global RNG
np.random.seed(RANDOM_SEED)

# 1. Load and preprocess data
df = pd.read_csv(DATA_PATH)

# Integer-encode user_agent; the fitted encoder is kept for the saved wrapper
le_user_agent = LabelEncoder()
encoded_user_agent = le_user_agent.fit_transform(df["user_agent"])
df["user_agent"] = encoded_user_agent

# The slim model uses only these three inputs, in this column order
feature_cols = ["time_to_submit", "failed_login_count_last_10min", "user_agent"]
X, y = df[feature_cols], df["label"]

# 90% train / 10% validation, stratified on the label
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=VALIDATION_SIZE,
    random_state=RANDOM_SEED,
    stratify=y,
)

# 2. Train model with best parameters
dtrain = lgb.Dataset(X_train, label=y_train)
dval = lgb.Dataset(X_val, label=y_val)
training_callbacks = [
    lgb.early_stopping(stopping_rounds=20),  # stop after 20 rounds without validation gain
    lgb.log_evaluation(20),  # log the validation metric every 20 rounds
]
booster = lgb.train(
    BEST_PARAMS,
    dtrain,
    valid_sets=[dval],
    num_boost_round=200,
    callbacks=training_callbacks,
)

# 3. Evaluate on validation set with multiple thresholds
pred_probs = booster.predict(X_val)
thresholds = [0.5, 0.4, 0.3]
# AUC is computed from the raw scores, so it is the same for every threshold
val_auc = roc_auc_score(y_val, pred_probs)
for threshold in thresholds:
    preds = (pred_probs >= threshold).astype(int)
    cm = confusion_matrix(y_val, preds)
    report = classification_report(y_val, preds)
    print(f"\n=== Results for threshold={threshold} ===")
    print("AUC:", val_auc)
    print("Confusion Matrix:\n", cm)
    print("Classification Report:\n", report)

# 4. Feature importance
# Gain-based importance per input column, largest first
gain_by_feature = booster.feature_importance(importance_type="gain")
importance = pd.DataFrame(
    {"Feature": feature_cols, "Importance": gain_by_feature}
).sort_values("Importance", ascending=False)
print("\nFeature Importance:\n", importance)

# 5. Wrap model in sklearn-compatible interface
class LGBMWrapper:
    """Scikit-learn-style facade around the slim LightGBM login booster.

    Bundles the booster with the ``user_agent`` label encoder fitted at training
    time so that callers can pass raw user-agent strings in a DataFrame.

    Attributes are set in ``__init__``:
        booster: Object exposing ``predict(X)`` that returns the positive-class
            probability for each row.
        feature_names: Column names, in the order the booster expects them.
        le_user_agent: Fitted encoder (exposes ``classes_``) for ``user_agent``.
    """

    # Code substituted for a category that is absent from the encoder's classes_.
    # The booster never saw this value during training, so scores for such rows
    # are a best-effort fallback rather than a calibrated prediction.
    UNKNOWN_CATEGORY_CODE = -1

    def __init__(self, booster, feature_names, le_user_agent):
        self.booster = booster
        self.feature_names = feature_names
        self.le_user_agent = le_user_agent

    def _encode_column(self, column, encoder):
        """Map a categorical column to the integer codes used at training time.

        Unlike ``encoder.transform``, this does not raise on categories that were
        not present when the encoder was fitted (for example a new browser
        version); they are mapped to ``UNKNOWN_CATEGORY_CODE`` instead. A column
        that is already numeric is assumed to be pre-encoded and returned as-is.

        Args:
            column: pandas Series holding the raw (or pre-encoded) values.
            encoder: Fitted encoder whose ``classes_`` lists the known labels in
                code order.

        Returns:
            pandas Series of numeric codes aligned with ``column``'s index.
        """
        if pd.api.types.is_numeric_dtype(column):
            return column
        known_labels = np.asarray(encoder.classes_).tolist()
        code_by_label = {label: code for code, label in enumerate(known_labels)}
        # Series.map yields NaN for labels missing from the dict
        return column.map(code_by_label).fillna(self.UNKNOWN_CATEGORY_CODE)

    def predict_proba(self, X):
        """Return class probabilities as an ``(n_rows, 2)`` array.

        Args:
            X: Either a DataFrame containing every name in ``feature_names``
                (``user_agent`` may hold raw strings), or an array-like of
                already-encoded numeric features in ``feature_names`` order.

        Returns:
            Array whose column 0 is P(label=0) and column 1 is P(label=1).
        """
        if isinstance(X, pd.DataFrame):
            X = X.copy()  # never mutate the caller's frame
            if "user_agent" in X.columns:
                X["user_agent"] = self._encode_column(X["user_agent"], self.le_user_agent)
            X = X[self.feature_names].values.astype(np.float32)
        else:
            X = np.asarray(X, dtype=np.float32)
        p1 = self.booster.predict(X)
        p0 = 1.0 - p1
        return np.vstack([p0, p1]).T

    def predict(self, X, threshold=0.5):
        """Return hard 0/1 labels: 1 where P(label=1) >= ``threshold``."""
        proba = self.predict_proba(X)[:, 1]
        return (proba >= threshold).astype(int)

# 6. Save model
clf = LGBMWrapper(booster, feature_cols, le_user_agent)
# Derive the output directory from MODEL_PATH so the two cannot drift apart
model_dir = os.path.dirname(MODEL_PATH)
if model_dir:  # empty when MODEL_PATH is a bare file name
    os.makedirs(model_dir, exist_ok=True)
# NOTE: pickle stores LGBMWrapper by reference to its defining module, so the
# class must be defined or importable under the same module path wherever this
# file is loaded.
joblib.dump(clf, MODEL_PATH)
print(f"\n✅ Saved model to {MODEL_PATH}")

Training until validation scores don't improve for 20 rounds
[20]	valid_0's auc: 1
[40]	valid_0's auc: 1
[60]	valid_0's auc: 1
Early stopping, best iteration is:
[43]	valid_0's auc: 1

=== Results for threshold=0.5 ===
AUC: 0.9999999175000001
Confusion Matrix:
 [[19996     4]
 [    5 19995]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     20000
           1       1.00      1.00      1.00     20000

    accuracy                           1.00     40000
   macro avg       1.00      1.00      1.00     40000
weighted avg       1.00      1.00      1.00     40000


=== Results for threshold=0.4 ===
AUC: 0.9999999175000001
Confusion Matrix:
 [[19996     4]
 [    1 19999]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     20000
           1       1.00      1.00      1.00     20000

    accuracy                           1.00     40000
   ma

In [8]:
#!/usr/bin/env python3
"""
Script to create an ensemble classifier combining the slim login-behavior model
and network-flow model. Generates a synthetic test dataset, evaluates the ensemble
with adjustable threshold and logic, and saves it as app/models/ensemble_classifier.pkl.
"""

import pandas as pd
import numpy as np
import lightgbm as lgb
import joblib
import os
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# Configuration
LOGIN_MODEL_PATH = "app/models/slim_login_classifier.pkl"
# The default is an absolute path on the original author's machine. Set the
# NETWORK_MODEL_PATH environment variable to point at the model file elsewhere
# without editing the notebook.
NETWORK_MODEL_PATH = os.environ.get(
    "NETWORK_MODEL_PATH",
    "D:\\Canada\\Subjects\\Semester -2\\AIDI-2005-02 CAPSTONE TERM ll\\Bot_detector\\notebooks\\final_lightgbm_tuned.txt",
)
ENSEMBLE_MODEL_PATH = "app/models/ensemble_classifier.pkl"
TEST_SIZE = 10000  # rows in the synthetic evaluation set
RANDOM_SEED = 42
THRESHOLD = 0.5  # Increased to 0.5 for better balance
USE_MAX_PROB = False  # Set to True for max probability rule instead of averaging

# Set random seed
np.random.seed(RANDOM_SEED)

# 1. Generate synthetic test dataset
def generate_synthetic_test_data(n_samples=TEST_SIZE):
    """Build a synthetic, class-balanced evaluation frame for the ensemble.

    The first half of the rows (``n_samples // 2``) is labelled benign (0.0)
    and the remainder attack (1.0). Rows are not shuffled.

    Args:
        n_samples: Total number of rows to generate.

    Returns:
        A ``pandas.DataFrame`` with the three login-behavior columns
        (``time_to_submit``, ``failed_login_count_last_10min``, ``user_agent``),
        78 placeholder columns ``flow_feature_0`` .. ``flow_feature_77`` and a
        float ``label`` column.
    """
    benign_count = n_samples // 2
    attack_count = n_samples - benign_count

    # Random draws are made in column order so that, for a fixed global seed,
    # the generated values are stable.
    # Login-behavior features (ranges stated to follow balanced_login_data.csv).
    submit_times = np.concatenate([
        np.random.uniform(1.5, 12.0, benign_count),   # benign: 1.5-12 s
        np.random.uniform(0.02, 0.6, attack_count),   # attack: 0.02-0.6 s
    ])
    failed_logins = np.concatenate([
        np.random.randint(0, 3, benign_count),        # benign: 0-2 failures
        np.random.randint(3, 16, attack_count),       # attack: 3-15 failures
    ])

    browser_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
        "Chrome/90.0",
    ]
    bot_agents = ["curl/8.5.0", "python-requests/2.28.1", "BotAgent"]
    agents = np.concatenate([
        np.random.choice(browser_agents, benign_count, p=[0.4, 0.3, 0.3]),
        np.random.choice(bot_agents, attack_count, p=[0.5, 0.3, 0.2]),
    ])

    columns = {
        "time_to_submit": submit_times,
        "failed_login_count_last_10min": failed_logins,
        "user_agent": agents,
    }

    # Placeholder network-flow features: 78 columns of uniform noise that is
    # identical in distribution for both classes.
    for idx in range(78):
        columns[f"flow_feature_{idx}"] = np.random.uniform(0, 100, n_samples)

    columns["label"] = np.concatenate([np.zeros(benign_count), np.ones(attack_count)])
    return pd.DataFrame(columns)

# 2. Load models
# Fail fast with a readable message when an artifact is missing, rather than
# surfacing a library-specific error from inside the loaders.
for _model_path in (LOGIN_MODEL_PATH, NETWORK_MODEL_PATH):
    if not os.path.isfile(_model_path):
        raise FileNotFoundError(f"Model file not found: {_model_path}")

# SECURITY: joblib.load unpickles the file, which can execute arbitrary code.
# Only load artifacts that were produced by a trusted source.
login_model = joblib.load(LOGIN_MODEL_PATH)
# The network-flow model is loaded from LightGBM's native model file.
network_model = lgb.Booster(model_file=NETWORK_MODEL_PATH)

# 3. Ensemble classifier
class EnsembleClassifier:
    """Binary classifier that combines a login-behavior model and a network-flow model.

    The attack probability of each base model is computed separately and the
    two are combined either by averaging (default) or by taking the maximum.

    Args:
        login_model: Object exposing ``predict_proba(X)`` that returns an
            ``(n, 2)`` array; column 1 is used as the attack probability.
        network_model: Object exposing ``predict(X)`` whose output is used
            directly as the attack probability (e.g. a ``lightgbm.Booster``).
        login_features: Ordered column names fed to ``login_model``.
        network_features: Ordered column names fed to ``network_model``.
        threshold: Probability at or above which ``predict`` returns 1.
        use_max_prob: ``True`` to combine with ``max``, ``False`` to average.
            When ``None`` (default) the module-level ``USE_MAX_PROB`` setting is
            read once at construction time, falling back to ``False`` if it is
            not defined.
    """

    def __init__(self, login_model, network_model, login_features, network_features,
                 threshold=0.3, use_max_prob=None):
        self.login_model = login_model
        self.network_model = network_model
        self.login_features = login_features
        self.network_features = network_features
        self.threshold = threshold
        # Capture the combination rule on the instance. Reading a module-level
        # global at prediction time would make a pickled ensemble fail with
        # NameError when loaded outside the defining module.
        if use_max_prob is None:
            use_max_prob = globals().get("USE_MAX_PROB", False)
        self.use_max_prob = bool(use_max_prob)
        self.le_user_agent = LabelEncoder()  # Must be fitted via fit_encoders() before use

    def fit_encoders(self, X):
        """Fit the ``user_agent`` label encoder on ``X["user_agent"]``.

        NOTE(review): the integer codes come from whatever data is passed here;
        they only match the codes the login model saw during training if the
        same set of user-agent strings is used. Confirm against the training
        pipeline.
        """
        self.le_user_agent.fit(X["user_agent"])

    def predict_proba(self, X):
        """Return an ``(n, 2)`` array of ``[P(benign), P(attack)]`` per row.

        Args:
            X: Either a ``pandas.DataFrame`` containing every column named in
                ``login_features`` and ``network_features`` (a ``user_agent``
                column, if present, is label-encoded on a copy), or a numeric
                2-D array-like whose first ``len(login_features)`` columns are
                the login features and whose remaining columns are the network
                features. The array path performs no user-agent encoding.
        """
        if isinstance(X, pd.DataFrame):
            X = X.copy()  # Never mutate the caller's frame
            if "user_agent" in X.columns:
                # Raises if the encoder is unfitted or sees an unknown value.
                X["user_agent"] = self.le_user_agent.transform(X["user_agent"])
            X_login = X[self.login_features].values.astype(np.float32)
            X_network = X[self.network_features].values.astype(np.float32)
        else:
            X = np.asarray(X)  # Accept lists / array-likes as well as ndarrays
            n_login = len(self.login_features)
            X_login = X[:, :n_login].astype(np.float32)
            X_network = X[:, n_login:].astype(np.float32)

        login_probs = self.login_model.predict_proba(X_login)[:, 1]
        network_probs = self.network_model.predict(X_network)

        # getattr keeps instances pickled before ``use_max_prob`` existed usable.
        if getattr(self, "use_max_prob", False):
            ensemble_probs = np.maximum(login_probs, network_probs)
        else:
            ensemble_probs = (login_probs + network_probs) / 2
        return np.column_stack([1 - ensemble_probs, ensemble_probs])

    def predict(self, X):
        """Return 0/1 labels: 1 where the ensemble attack probability >= ``threshold``."""
        proba = self.predict_proba(X)[:, 1]
        return (proba >= self.threshold).astype(int)

# 4. Generate and preprocess test data
test_df = generate_synthetic_test_data()
login_features = ["time_to_submit", "failed_login_count_last_10min", "user_agent"]
network_features = [f"flow_feature_{i}" for i in range(78)]
X_test = test_df[login_features + network_features]
y_test = test_df["label"]

# 5. Create and evaluate ensemble
ensemble = EnsembleClassifier(login_model, network_model, login_features, network_features, THRESHOLD)
# NOTE(review): the LabelEncoder is fitted on the evaluation data itself, so
# its integer codes are only meaningful to the login model if they match the
# encoding used at training time - confirm against the training notebook.
ensemble.fit_encoders(X_test)
pred_probs = ensemble.predict_proba(X_test)[:, 1]
# Threshold the probabilities already computed instead of calling predict(),
# which would run both base models a second time for the same result.
preds = (pred_probs >= ensemble.threshold).astype(int)

# 6. Print evaluation metrics
# NOTE(review): the synthetic classes are separated by construction (disjoint
# time_to_submit and failed-login ranges), so these scores are a smoke test of
# the pipeline rather than an estimate of real-world performance.
print("\n=== Ensemble Results ===")
print("AUC:", roc_auc_score(y_test, pred_probs))
print("Confusion Matrix:\n", confusion_matrix(y_test, preds))
print("Classification Report:\n", classification_report(y_test, preds))

# 7. Save ensemble model
# Derive the directory from the configured path so changing
# ENSEMBLE_MODEL_PATH cannot leave the target directory uncreated.
model_dir = os.path.dirname(ENSEMBLE_MODEL_PATH)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)
# NOTE(review): the pickle stores a reference to the EnsembleClassifier class,
# not its code; whatever loads this file must be able to import/define the
# same class under the same module name.
joblib.dump(ensemble, ENSEMBLE_MODEL_PATH)
print(f"\n✅ Saved ensemble model to {ENSEMBLE_MODEL_PATH}")


=== Ensemble Results ===
AUC: 1.0
Confusion Matrix:
 [[5000    0]
 [   0 5000]]
Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      5000
         1.0       1.00      1.00      1.00      5000

    accuracy                           1.00     10000
   macro avg       1.00      1.00      1.00     10000
weighted avg       1.00      1.00      1.00     10000


✅ Saved ensemble model to app/models/ensemble_classifier.pkl
