# Phase 2: Model Training and Evaluation (Notebook)

This notebook mirrors the training script in `src/train_models.py`. If you would rather work in a notebook than run the script, execute the cells below in order, from top to bottom.

In [None]:
# Imports
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier


In [None]:
# Relative path to the cleaned dataset CSV (adjust path as needed)
CSV_PATH = '../data/cleaned_phishing_dataset.csv'

# Read the CSV into a DataFrame and render a preview of its first five rows
data = pd.read_csv(CSV_PATH)
display(data.head(5))


In [None]:
# Separate the 'label' target column from the remaining feature columns
X = data.drop(columns='label')
y = data['label']

# Hold out 20% of the rows for evaluation; stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardise the features: the scaler is fitted on the training split only and
# then applied to both splits, so no test-set statistics leak into training
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
# Candidate classifiers, keyed by display name. They are fitted and reported in
# the insertion order of this dict.
# NOTE(review): use_label_encoder appears to be deprecated in newer xgboost
# releases — confirm against the installed version before removing it.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(
        n_estimators=300,
        random_state=42,
        n_jobs=-1,
    ),
    "SVM": SVC(kernel='rbf', probability=True, random_state=42),
    "XGBoost": XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        n_estimators=300,
        learning_rate=0.1,
        max_depth=6,
        subsample=0.9,
        colsample_bytree=0.9,
        n_jobs=-1,
    ),
}

# Visual divider printed after each model's report
separator = "\n" + "-"*60 + "\n"

# Running winner; -1 is below any attainable F1, so the first model always replaces it
best_name = None
best_model = None
best_f1 = -1

# NOTE(review): the winner is chosen on the same X_test/y_test split whose
# metrics are printed, so the best model's reported score is optimistically
# biased — consider selecting on a separate validation split.
for name, estimator in models.items():
    # Fit on the training split, then predict the held-out split
    predictions = estimator.fit(X_train, y_train).predict(X_test)
    weighted_f1 = f1_score(y_test, predictions, average='weighted')

    print(f"=== {name} ===")
    print("Accuracy:", accuracy_score(y_test, predictions))
    print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))
    print("Classification Report:\n", classification_report(y_test, predictions))
    print(separator)

    # Strict '>' means that on a tie the earlier model in the dict is kept
    if weighted_f1 > best_f1:
        best_name, best_model, best_f1 = name, estimator, weighted_f1

print("Best model:", best_name, "with weighted F1:", best_f1)


In [None]:
# Save best model and scaler
# Both artifacts are written to the same directory; the scaler is saved next to
# the model because the model was trained on features transformed by it.
MODEL_DIR = '../models'

# Fail loudly rather than pickling None if model selection picked nothing
if best_model is None:
    raise RuntimeError('No model was selected during training; nothing to save.')

# exist_ok=True makes this cell safe to re-run
os.makedirs(MODEL_DIR, exist_ok=True)
joblib.dump(best_model, f'{MODEL_DIR}/phishing_detector_model.pkl')
joblib.dump(scaler, f'{MODEL_DIR}/scaler.pkl')
print(f'Model and scaler saved to {MODEL_DIR}')