<a href="https://colab.research.google.com/github/akivig1601/Website-Phishing-Detection/blob/main/phishing_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. Imports and Environment Setup

In [None]:
import importlib
import subprocess
import sys

# Map each pip distribution name to the module name used at import time.
# They differ for scikit-learn (imported as `sklearn`); probing the pip name
# directly would always raise ImportError and trigger a reinstall on every run.
REQUIRED_PACKAGES = {
    'xgboost': 'xgboost',
    'optuna': 'optuna',
    'shap': 'shap',
    'scikit-learn': 'sklearn',
    'seaborn': 'seaborn',
    'matplotlib': 'matplotlib',
    'pandas': 'pandas',
    'numpy': 'numpy',
}

# Install only the packages that cannot be imported in the current kernel.
for package, module_name in REQUIRED_PACKAGES.items():
    try:
        importlib.import_module(module_name)
        print(f"{package}: OK")
    except ImportError:
        # Use the kernel's own interpreter so the package lands in the right environment.
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        # Make the freshly installed package visible to later imports in this session.
        importlib.invalidate_caches()
        print(f"{package}: installed")


xgboost: OK
shap: OK


In [None]:
# Imports and global configuration used by every later cell.
# (This cell previously repeated the package-installation loop from the cell
# above, while none of the names below were imported or defined anywhere,
# so the notebook failed with NameError on a fresh kernel.)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import optuna
import shap
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Single seed shared by the train/test split, the models and the Optuna study.
# NOTE(review): 42 is a placeholder; confirm the value originally intended.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)


📥 2. Load Dataset

In [None]:
from scipy.io import arff

# Read the ARFF file; `meta` holds the attribute metadata returned by scipy.
data, meta = arff.loadarff('Training Dataset.arff')
raw_df = pd.DataFrame(data)

# Decode any bytes values to text, column by column. Series.map is used
# instead of DataFrame.applymap, which is deprecated in recent pandas releases.
decoded_df = raw_df.apply(
    lambda col: col.map(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
)

# Convert every column to numeric; values that cannot be parsed become NaN
# and the rows containing them are removed.
df = decoded_df.apply(pd.to_numeric, errors='coerce').dropna()

# Surface row loss instead of discarding data silently.
n_dropped = len(raw_df) - len(df)
if n_dropped:
    print(f"Dropped {n_dropped} rows containing non-numeric or missing values")

print(df.head())

3. EDA and Feature Engineering

EDA

In [None]:
# Basic overview: dataset dimensions and how many rows fall in each class.
print("Dataset shape:", df.shape)
print(df["Result"].value_counts())

# Class balance, with the numeric labels replaced by readable names.
class_labels = df["Result"].map({-1: 'Phishing', 1: 'Legitimate'})
fig, ax = plt.subplots()
sns.countplot(x=class_labels, ax=ax)
ax.set_title("Class Distribution")
plt.show()

# Pairwise correlation between all input features (target column excluded).
feature_corr = df.drop('Result', axis=1).corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(feature_corr, cmap='coolwarm', ax=ax)
ax.set_title("Feature Correlation")
plt.show()

Feature Engineering: N-gram features

Synthesize URL-like strings from features:

In [None]:
def create_url_features(row):
    """Build a synthetic, URL-like token string from one row of features.

    Tokens are appended in a fixed order (sub-domain, URL length, '@' symbol,
    double-slash redirect) and joined with '.'. Columns missing from the row
    are skipped. When no token applies, 'standard.url.com' is returned.

    NOTE(review): the 75 / 54 thresholds presume 'URL_Length' holds a raw
    character count, and the `== 1` checks presume 1 marks the suspicious
    case. If the dataset stores categorical codes instead, these conditions
    may never fire or may be inverted; confirm against the dataset docs.
    """
    def is_flagged(column):
        # True only when the column is present and its value equals 1.
        return column in row.index and row[column] == 1

    tokens = []

    if is_flagged('having_Sub_Domain'):
        tokens.append('subdomain.suspicious')

    if 'URL_Length' in row.index:
        url_length = row['URL_Length']
        if url_length > 75:
            tokens.append('verylongurl')
        elif url_length > 54:
            tokens.append('longurl')

    if is_flagged('having_At_Symbol'):
        tokens.append('at@symbol')

    if is_flagged('double_slash_redirecting'):
        tokens.append('redirect//')

    if not tokens:
        return 'standard.url.com'
    return '.'.join(tokens)

# Build one synthetic URL string per row from the input features only
# (the 'Result' target column is excluded before the row-wise apply).
feature_rows = df.drop('Result', axis=1)
df['synthetic_url'] = feature_rows.apply(create_url_features, axis=1)


Extract n-gram features:

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def extract_ngram_features(urls, ngram_range=(2, 4), max_features=40):
    """Compute character-level TF-IDF n-gram features for a collection of strings.

    Args:
        urls: iterable of strings to vectorize; the vectorizer is fit on it.
        ngram_range: (min_n, max_n) character n-gram sizes to extract.
        max_features: upper bound on the number of n-gram features kept.

    Returns:
        A tuple ``(matrix, names)`` where ``matrix`` is the dense TF-IDF array
        and ``names`` lists the feature names, each prefixed with 'ngram_'.
    """
    tfidf = TfidfVectorizer(
        analyzer='char',
        ngram_range=ngram_range,
        max_features=max_features,
        lowercase=True,
    )
    dense_matrix = tfidf.fit_transform(urls).toarray()
    prefixed_names = [f'ngram_{term}' for term in tfidf.get_feature_names_out()]
    return dense_matrix, prefixed_names

# Vectorize the synthetic URLs and wrap the result in a DataFrame that shares
# df's index, so it lines up row-for-row with the original features.
# NOTE(review): the vectorizer is fit on every row of df, i.e. before the
# train/test split performed later in the notebook.
ngram_features, ngram_names = extract_ngram_features(df['synthetic_url'])
ngram_df = pd.DataFrame(data=ngram_features, index=df.index, columns=ngram_names)


4. Preprocessing

In [None]:
# Feature matrices: the original columns alone, and the original columns
# extended with the n-gram features (aligned on df's index).
X_original = df.drop(columns=["Result", "synthetic_url"])
X_enhanced = pd.concat([X_original, ngram_df], axis=1)

# Target: remap {-1, 1} to {0, 1} for XGBoost & ML compatibility.
y = df["Result"]
y_binary = y.map({-1: 0, 1: 1})

# Series.map yields NaN for any label outside the mapping; fail loudly here
# rather than letting NaN targets reach the split and the models.
unmapped = y_binary.isna()
if unmapped.any():
    raise ValueError(
        f"Unexpected labels in 'Result': {sorted(y[unmapped].unique())}"
    )
# Integer labels guarantee the '0' / '1' keys used when reading classification reports.
y_binary = y_binary.astype(int)

# Stratified 80/20 split so both partitions keep the class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X_enhanced, y_binary, stratify=y_binary, test_size=0.2, random_state=RANDOM_STATE
)

5. Model Definitions and Training

In [None]:
# Baseline classifiers, each wrapped in a Pipeline whose final step is named
# 'clf'. Scale-sensitive models (logistic regression, SVM) get a StandardScaler
# first. Every estimator that accepts a seed uses RANDOM_STATE so that repeated
# runs produce the same scores.
models = {
    "Logistic Regression": Pipeline([
        ('scale', StandardScaler()),
        ('clf', LogisticRegression(random_state=RANDOM_STATE))
    ]),
    "Decision Tree": Pipeline([
        ('clf', DecisionTreeClassifier(random_state=RANDOM_STATE))
    ]),
    "Random Forest": Pipeline([
        ('clf', RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE))
    ]),
    "SVM (RBF)": Pipeline([
        ('scale', StandardScaler()),
        # probability=True enables predict_proba, which the ROC AUC computation needs.
        ('clf', SVC(kernel='rbf', probability=True, random_state=RANDOM_STATE))
    ]),
    "Gradient Boosting": Pipeline([
        ('clf', GradientBoostingClassifier(random_state=RANDOM_STATE))
    ])
}

Train and Evaluate

In [None]:
# Fit every model on the training split and score it on the held-out test split.
results = {}
for name, pipe in models.items():
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    # Check the model object itself rather than pipe.named_steps['clf'], so the
    # cell still works on re-run after non-Pipeline estimators have been added
    # to `models` by later cells.
    if hasattr(pipe, 'predict_proba'):
        y_proba = pipe.predict_proba(X_test)[:, 1]
    else:
        y_proba = None

    # Build the report once and reuse it (it was previously computed twice per model).
    report = classification_report(y_test, y_pred, output_dict=True)

    results[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        # F1 of the positive class (label 1); 0 when that label is absent from the report.
        "F1": report["1"]["f1-score"] if "1" in report else 0,
        "ROC AUC": roc_auc_score(y_test, y_proba) if y_proba is not None else None
    }

# One row per model, best accuracy first.
results_df = pd.DataFrame(results).T.sort_values("Accuracy", ascending=False)
print(results_df.round(4))


6. XGBoost With Optuna Hyperparameter Optimization

In [None]:
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Settings that are not tuned. They are shared by every trial AND by the final
# refit: study.best_params only contains the values suggested through `trial`,
# so these must be re-applied explicitly or the final model would silently
# differ from the configuration that was cross-validated.
XGB_FIXED_PARAMS = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'random_state': RANDOM_STATE,
    'verbosity': 0,
}


def objective(trial):
    """Optuna objective: mean 3-fold CV accuracy of an XGBoost classifier.

    Args:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        Mean cross-validated accuracy on (X_train, y_train) for the sampled
        configuration; the study maximizes this value.
    """
    params = {
        **XGB_FIXED_PARAMS,
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 200),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 3),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 3),
    }
    model = xgb.XGBClassifier(**params)
    scores = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy')
    return scores.mean()


# Seed the sampler so the sequence of suggested hyperparameters is repeatable.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
study.optimize(objective, n_trials=10, show_progress_bar=True)

# Refit the best configuration on the full training split, including the fixed settings.
optimized_xgb = xgb.XGBClassifier(**XGB_FIXED_PARAMS, **study.best_params)
optimized_xgb.fit(X_train, y_train)
# Wrap in a Pipeline with a 'clf' step to match the other entries of `models`.
models["Optimized XGBoost"] = Pipeline([('clf', optimized_xgb)])

7. SHAP Explainability

In [None]:
# Explain the tuned XGBoost model on the first 30 rows of the test split.
feature_names = X_test.columns.tolist()
X_test_sample = X_test.iloc[:30].values
explainer = shap.TreeExplainer(optimized_xgb)
shap_values = explainer.shap_values(X_test_sample)

# Global importance of a feature = mean absolute SHAP value over the sampled rows.
mean_abs_shap = np.abs(shap_values).mean(axis=0)
feature_importance_shap = (
    pd.DataFrame({'feature': feature_names, 'importance': mean_abs_shap})
    .sort_values('importance', ascending=False)
)
print(feature_importance_shap.head(10))

# Horizontal bar chart of the ten most influential features, largest on top.
top_features = feature_importance_shap.head(10)
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_features['feature'], top_features['importance'])
ax.set_xlabel('Mean |SHAP Value|')
ax.set_title('Top 10 SHAP Features')
ax.invert_yaxis()
plt.show()

8. Stacking Ensemble

In [None]:
from sklearn.ensemble import StackingClassifier

# Base learners for the stack. Every estimator that accepts a seed uses
# RANDOM_STATE, matching the random forest and the final estimator, so the
# ensemble's scores are repeatable across runs.
base_learners = [
    ('lr', Pipeline([
        ('scale', StandardScaler()),
        ('clf', LogisticRegression(random_state=RANDOM_STATE)),
    ])),
    ('rf', Pipeline([
        ('clf', RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE)),
    ])),
    ('xgb', Pipeline([('clf', optimized_xgb)])),
    ('svm', Pipeline([
        ('scale', StandardScaler()),
        # probability=True is required because the stack consumes predict_proba outputs.
        ('clf', SVC(kernel="rbf", probability=True, random_state=RANDOM_STATE)),
    ])),
]

# The meta-learner is trained on the base learners' predicted probabilities,
# obtained through 5-fold cross-validation on the training split.
stacking_clf = StackingClassifier(
    estimators=base_learners,
    final_estimator=LogisticRegression(random_state=RANDOM_STATE),
    cv=5,
    stack_method='predict_proba',
    n_jobs=-1
)
stacking_clf.fit(X_train, y_train)
y_pred_stack = stacking_clf.predict(X_test)
print('Stacking Ensemble Accuracy:', accuracy_score(y_test, y_pred_stack))

# Register the ensemble so it appears in the final comparison.
models["Stacking Ensemble"] = stacking_clf

9. Performance Visualization

In [None]:
import time
# Score every already-fitted model in `models` on the test split.
results_enhanced = {}
for name, model in models.items():
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else None

    # Build the report once and reuse it (it was previously computed twice per model).
    report = classification_report(y_test, y_pred, output_dict=True)

    results_enhanced[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        # F1 of the positive class (label 1); 0 when that label is absent from the report.
        "F1": report['1']["f1-score"] if "1" in report else 0,
        "ROC AUC": roc_auc_score(y_test, y_proba) if y_proba is not None else None,
    }

# Cast to float so a missing ROC AUC (None) becomes NaN instead of leaving a
# non-numeric column that cannot be rounded or plotted.
results_df = pd.DataFrame(results_enhanced).T.astype(float).round(4)

# Grouped bar chart: one group per model, one bar per metric.
ax = results_df.plot(kind='bar', figsize=(15, 6))
ax.set_title("Model Performance Comparison")
ax.set_ylabel("Score")
plt.tight_layout()
plt.show()