# EXPLORATORY DATA ANALYSIS
* Performing EDA on dataset_phising.csv with the help of ydata_profiling library

In [4]:
import pandas as pd
from ydata_profiling import ProfileReport
df= pd.read_csv('dataset_phishing.csv')
profile= ProfileReport(df, title="Phising Report", minimal=True)
profile.to_notebook_iframe()
profile.to_file("analysis_report.html")


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 89/89 [00:00<00:00, 181.70it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

# Clean and prepare the code for machine learning models
* A majority of the fields are numeric, which is perfect for this use case. However there are two that are not, "url" and "status".

* url is not needed for modeling purposes so we can go ahead and drop that field.

* The status field is text, but it only contains one of two values "legitimate" or "phishing". To solve this issue we will encode these values numerically.

* legitimate will equal 0.

* phishing will equal 1

In [5]:
#import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

#df= pd.read_csv('dataset_phishing.csv')
df = df.drop("url", axis=1)
print("'url' column dropped")

label_encoder = LabelEncoder()
df["status_encoding"] = label_encoder.fit_transform(df["status"])
status_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(f"Status column encoded. Mapping {status_mapping}")

df = df.drop("status", axis=1)
print("Original 'status' column dropped.")

print("\nHandling -1 values in 'domain_age' and 'domain_registration_length'...")

median_domain_age = df[df['domain_age'] != -1]['domain_age'].median()
median_domain_registration_length = df[df['domain_registration_length'] != -1]['domain_registration_length'].median()

df['domain_age'] = df['domain_age'].replace(-1, median_domain_age)
df['domain_registration_length'] = df['domain_registration_length'].replace(-1, median_domain_registration_length)

print(f"Replaced -1 in 'domain_age' with median: {median_domain_age}")
print(f"Replaced -1 in 'domain_registration_length' with median: {median_domain_registration_length}")

X = df.drop('status_encoding', axis=1)
y = df['status_encoding']

print(f"\nFeatures (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"\nTraining set features (X_train) shape: {X_train.shape}")
print(f"Testing set features (X_test) shape: {X_test.shape}")
print(f"Training set target (y_train) shape: {y_train.shape}")
print(f"Testing set target (y_test) shape: {y_test.shape}")

'url' column dropped
Status column encoded. Mapping {'legitimate': 0, 'phishing': 1}
Original 'status' column dropped.

Handling -1 values in 'domain_age' and 'domain_registration_length'...
Replaced -1 in 'domain_age' with median: 5056.0
Replaced -1 in 'domain_registration_length' with median: 245.0

Features (X) shape: (11430, 87)
Target (y) shape: (11430,)

Training set features (X_train) shape: (9144, 87)
Testing set features (X_test) shape: (2286, 87)
Training set target (y_train) shape: (9144,)
Testing set target (y_test) shape: (2286,)


In [7]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC()
}

for name, model in models.items():
    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("classifier", model)
    ])

    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)    

    print(f"\n Model: {name}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))



 Model: Logistic Regression
Accuracy: 0.9352580927384077
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.94      0.94      1143
           1       0.94      0.94      0.94      1143

    accuracy                           0.94      2286
   macro avg       0.94      0.94      0.94      2286
weighted avg       0.94      0.94      0.94      2286

Confusion Matrix:
 [[1069   74]
 [  74 1069]]

 Model: Random Forest
Accuracy: 0.9593175853018373
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.95      0.96      1143
           1       0.95      0.96      0.96      1143

    accuracy                           0.96      2286
   macro avg       0.96      0.96      0.96      2286
weighted avg       0.96      0.96      0.96      2286

Confusion Matrix:
 [[1091   52]
 [  41 1102]]

 Model: SVM
Accuracy: 0.9505686789151356
Classification Report:
               precision    rec

# Refining the model
* The following cell will be used to use hyperparameter tuning and GridSearchCV to find the best model that will provide the most accurate results.

In [9]:
from sklearn.model_selection import GridSearchCV

print("Starting Hyperparameter Tuning with GridSearchCV...")

param_grid = {
    "n_estimators": [100, 200, 300],
    "max_features": ["sqrt", "log2"],
    "max_depth": [10, 20, 30, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring="f1",
    n_jobs=1,
    verbose=2)

print("Running GridSearchCV, this might take some time...")
grid_search.fit(X_train, y_train)
print("GridSearchCV complete.")

print("\n--- Best Parameters and Score from GridSearchCV ---")
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best F1-Score found on training data (cross-validated): {grid_search.best_score_:.4f}")

best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)

accuracy_best = accuracy_score(y_test, y_pred_best)
classification_best = classification_report(y_test, y_pred_best)

conf_matrix_best = confusion_matrix(y_test, y_pred_best)

print(f"Accuracy: {accuracy_best:.4f}")
#print(f"Precision: {classification_best:.4f}")

print("\nDetailed Classification Report:")
print(classification_best) 

print("\nConfusion Matrix:")
print(conf_matrix_best)

print("\n--- Confusion Matrix Breakdown ---")
TN, FP, FN, TP = conf_matrix_best.ravel()
print(f"True Negatives (Legitimate Correctly Classified): {TN}")
print(f"False Positives (Legitimate Classified as Phishing): {FP}")
print(f"False Negatives (Phishing Classified as Legitimate): {FN}")
print(f"True Positives (Phishing Correctly Classified): {TP}")

print("\n--- Hyperparameter Tuning Complete ---")

Starting Hyperparameter Tuning with GridSearchCV...
Running GridSearchCV, this might take some time...
Fitting 5 folds for each of 216 candidates, totalling 1080 fits
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.8s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.7s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.7s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.7s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.7s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.5s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 

In [11]:

import joblib

print("--- Saving the Best Trained scikit-learn Model for Future Python Use ---")

scikit_learn_model_filename = 'phishing_detection.joblib'

joblib.dump(best_model, scikit_learn_model_filename)

print(f"Your optimized scikit-learn model saved successfully as '{scikit_learn_model_filename}'")


--- Saving the Best Trained scikit-learn Model for Future Python Use ---
Your optimized scikit-learn model saved successfully as 'phishing_detection.joblib'
