<a href="https://colab.research.google.com/github/stiwari-ds/DL-Simplified-SWOC-S3/blob/phishing-website-detection/Phishing-Website-Detection/Model/01_ml_baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [18]:
#Utils
import gc
import os
import warnings

gc.enable()
warnings.filterwarnings(action='ignore')

#Data analysis
import numpy as np
import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 4)

#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
%config InlineBackend.figure_format = 'png'
mpl.rcParams["axes.spines.right"] = False
mpl.rcParams["axes.spines.top"] = False

#Reproducibility
SEED = 2311
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)

In [19]:
#modeling
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score

# Data

**Cleaned data and feature sets (from [EDA notebook](https://github.com/stiwari-ds/DL-Simplified-SWOC-S3/blob/phishing-website-detection/Phishing-Website-Detection/Model/00_eda_and_preprocessing.ipynb))**

In [20]:
#data: cleaned version
DATA_URL = 'https://raw.githubusercontent.com/stiwari-ds/DL-Simplified-SWOC-S3/phishing-website-detection/Phishing-Website-Detection/Dataset/processed/phishing_website_dataset_cleaned.csv'
data = pd.read_csv(DATA_URL)

In [21]:
TARGET = 'result'

all_features = [f for f in data.columns if f != TARGET]

reduced_features = [
    f for f in all_features if f not in 
    ('favicon', 'submitting_to_email', 'redirect', 'right_click',
    'popup_window', 'iframe', 'links_pointing_to_page')
]

top_features = [
    'ssl_final_state', 'url_of_anchor', 'prefix_suffix', 
    'web_traffic', 'having_subdomain', 'links_in_tags', 
    'request_url', 'sfh', 'domain_registration_length'
]

# Modeling

In [22]:
model1 = LogisticRegression(C=0.1, max_iter=1000, random_state=SEED)

model2 = Pipeline([
    ('pca', PCA(n_components=0.95)),
    ('lr', LogisticRegression(C=0.1, max_iter=1000, random_state=SEED))
])

model3 = RandomForestClassifier(
    n_estimators=150, 
    max_depth=7, 
    n_jobs=-1,
    random_state=SEED
)

In [23]:
def evaluate(model, feature_set, seed):
    X_train, X_val, y_train, y_val = train_test_split(
        data[feature_set], data[TARGET], 
        test_size=0.2, 
        shuffle=True, 
        stratify=data[TARGET],
        random_state=seed
    )
    
    model.fit(X_train, y_train)
    val_probs = model.predict_proba(X_val)[:, 1]
    val_preds = model.predict(X_val)

    auc = roc_auc_score(y_val, val_probs)
    accuracy = accuracy_score(y_val, val_preds)
    return auc, accuracy

In [24]:
def run_experiment(model, feature_set, verbose=False, runs=100):
    training_seeds = np.random.randint(low=1, high=SEED, size=runs)
    
    all_auc = []
    all_accuracy = []
    for i, seed in enumerate(training_seeds):
        auc, accuracy = evaluate(model, feature_set, seed)
        all_auc.append(auc)
        all_accuracy.append(accuracy)
        if verbose:
            print(f'Run #{i+1:<3} [seed = {seed:<4}]: ' \
                  f'AUC = {auc:.4f}, Accuracy = {accuracy:.4f}')
        
    print(f'\nAverage over {runs} runs: ' \
          f'AUC = {np.mean(all_auc):.4f} +/- {np.std(all_auc):.4f}, ' \
          f'Accuracy = {np.mean(all_accuracy):.4f} +/- {np.std(all_accuracy):.4f}\n')

### 1. Model 1 + All features

In [25]:
run_experiment(model1, all_features, verbose=True, runs=5)

Run #1   [seed = 31  ]: AUC = 0.9754, Accuracy = 0.9303
Run #2   [seed = 1135]: AUC = 0.9752, Accuracy = 0.9231
Run #3   [seed = 1176]: AUC = 0.9777, Accuracy = 0.9240
Run #4   [seed = 1803]: AUC = 0.9764, Accuracy = 0.9249
Run #5   [seed = 1469]: AUC = 0.9807, Accuracy = 0.9344

Average over 5 runs: AUC = 0.9771 +/- 0.0020, Accuracy = 0.9274 +/- 0.0043



In [26]:
%%time
run_experiment(model1, all_features)


Average over 100 runs: AUC = 0.9783 +/- 0.0020, Accuracy = 0.9271 +/- 0.0043

CPU times: user 14.6 s, sys: 11.9 s, total: 26.5 s
Wall time: 22.7 s


### 2. Model 1 + Reduced features

In [27]:
%%time
run_experiment(model1, reduced_features)


Average over 100 runs: AUC = 0.9767 +/- 0.0021, Accuracy = 0.9248 +/- 0.0051

CPU times: user 9.56 s, sys: 7.01 s, total: 16.6 s
Wall time: 9.4 s


### 3. Model 1 + Top features

In [28]:
%%time
run_experiment(model1, top_features)


Average over 100 runs: AUC = 0.9740 +/- 0.0024, Accuracy = 0.9200 +/- 0.0052

CPU times: user 5.22 s, sys: 4.16 s, total: 9.38 s
Wall time: 4.92 s


### 4. Model 2 + All features

In [29]:
%%time
run_experiment(model2, all_features)


Average over 100 runs: AUC = 0.9771 +/- 0.0020, Accuracy = 0.9238 +/- 0.0052

CPU times: user 10.3 s, sys: 12.6 s, total: 22.9 s
Wall time: 12.1 s


### 5. Model 2 + Reduced features

In [30]:
%%time
run_experiment(model2, reduced_features)


Average over 100 runs: AUC = 0.9759 +/- 0.0024, Accuracy = 0.9223 +/- 0.0058

CPU times: user 8.87 s, sys: 12.1 s, total: 21 s
Wall time: 11.3 s


### 6. Model 2 + Top features

In [31]:
%%time
run_experiment(model2, top_features)


Average over 100 runs: AUC = 0.9736 +/- 0.0026, Accuracy = 0.9210 +/- 0.0051

CPU times: user 6.16 s, sys: 8.46 s, total: 14.6 s
Wall time: 7.61 s


### 7. Model 3 + All features

In [32]:
%%time
run_experiment(model3, all_features)


Average over 100 runs: AUC = 0.9871 +/- 0.0016, Accuracy = 0.9383 +/- 0.0058

CPU times: user 1min 48s, sys: 6.23 s, total: 1min 54s
Wall time: 1min 15s


### 8. Model 3 + Reduced features

In [33]:
%%time
run_experiment(model3, reduced_features)


Average over 100 runs: AUC = 0.9868 +/- 0.0015, Accuracy = 0.9383 +/- 0.0047

CPU times: user 1min 42s, sys: 5.66 s, total: 1min 48s
Wall time: 1min 19s


### 9. Model 3 + Top features

In [34]:
%%time
run_experiment(model3, top_features)


Average over 100 runs: AUC = 0.9860 +/- 0.0016, Accuracy = 0.9364 +/- 0.0047

CPU times: user 1min 34s, sys: 5.57 s, total: 1min 40s
Wall time: 1min 14s


# Summary

* We tried three ML models for binary classification: 
    1. Logistic Regression (LR)
    2. Principal Component Analysis + Logistic Regression pipeline (PCA-LR)
    3. RandomForestClassifier (RF)

* All three models show very small drop in accuracy (maximum of 0.7% for LR) even when feature set is reduced by 70%, while training time gets shorter. 
* PCA-LR shows consistent performance since components have variance in decreasing order so most of the information is captured by the first few components. RF also shows no significant drop in performance since it is able to utilize the informative features better, compared to a linear model like LR.
* Thus, we can prioritize working with a smaller feature set first and see if it is enough to outperform these models. The reduction in training time might be more valuable for fitting a complex model like a neural network as compared to these simple models.
* **When modeling with neural networks, we should be aiming to beat this baseline performance: AUC = ~0.987 and Accuracy = ~94%**