### Data
Dataset URL: https://www.kaggle.com/datasets/ethancratchley/email-phishing-dataset?resource=download
### Libraries

In [104]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import joblib
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

### Read Dataset

In [105]:
# Location of the dataset CSV, relative to the notebook's working directory.
DATA_CSV = '../data/email_phishing_data.csv'

# Load the whole file into memory as a DataFrame.
df = pd.read_csv(DATA_CSV)

### Analysis

In [106]:
# Sanity check: preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,num_words,num_unique_words,num_stopwords,num_links,num_unique_domains,num_email_addresses,num_spelling_errors,num_urgent_keywords,label
0,140,94,52,0,0,0,0,0,0
1,5,5,1,0,0,0,0,0,0
2,34,32,15,0,0,0,0,0,0
3,6,6,2,0,0,0,0,0,0
4,9,9,2,0,0,0,0,0,0


In [107]:
# Report (rows, columns) of the loaded frame.
frame_shape = df.shape
print("Data shape:", frame_shape)

Data shape: (524846, 9)


In [108]:
# Report the dtype pandas inferred for every column.
column_dtypes = df.dtypes
print("Data types:", column_dtypes)

Data types: num_words              int64
num_unique_words       int64
num_stopwords          int64
num_links              int64
num_unique_domains     int64
num_email_addresses    int64
num_spelling_errors    int64
num_urgent_keywords    int64
label                  int64
dtype: object


In [109]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
summary_stats = df.describe()
print("Estadistics:", summary_stats)

Estadistics:           num_words  num_unique_words  num_stopwords      num_links  \
count  5.248460e+05     524846.000000  524846.000000  524846.000000   
mean   2.762280e+02        123.012167      80.045465       0.895588   
std    3.335457e+03        201.626478    1023.330380       5.603001   
min    0.000000e+00          0.000000       0.000000       0.000000   
25%    4.700000e+01         38.000000      12.000000       0.000000   
50%    1.200000e+02         79.000000      34.000000       0.000000   
75%    2.690000e+02        145.000000      79.000000       0.000000   
max    2.339682e+06      51251.000000  720411.000000     824.000000   

       num_unique_domains  num_email_addresses  num_spelling_errors  \
count       524846.000000        524846.000000        524846.000000   
mean             0.347767             2.114897            24.694731   
std              1.774209            13.592682           311.312358   
min              0.000000             0.000000             0.00

### Preprocessing

In [110]:
# Work on a reproducible 50% sample of the data for faster processing.
# The sample is bound to its own name instead of overwriting `df`, so this
# cell is idempotent: re-running it does not halve the data a second time.
df_sampled = df.sample(frac=0.5, random_state=42)
print("Actual num of rows:", df_sampled.shape[0])

# Separate the target column from the feature columns.
y = df_sampled["label"]
X = df_sampled.drop(columns=["label"])

# Hold out 20% for evaluation. `stratify=y` keeps the class proportions of
# `label` the same in the train and test partitions, so the held-out metrics
# are not skewed by an unlucky split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

Actual num of rows: 262423


### Pipeline

Random Forest is a tree-based model and hence does not require feature scaling.

In [93]:
# Single-step pipeline: the step is named 'classifier' so that grid-search
# parameters can address it with the 'classifier__' prefix.
rf_classifier = RandomForestClassifier(random_state=42)
pipeline = Pipeline(steps=[('classifier', rf_classifier)])

### Hyperparameter Optimization

In [94]:
# Hyperparameter search space for the 'classifier' pipeline step.
# Keys use the '<step>__<parameter>' form; 2 * 3 * 2 * 2 = 24 combinations.
param_grid = {}
param_grid['classifier__n_estimators'] = [100, 200]        # number of trees
param_grid['classifier__max_depth'] = [None, 15, 30]       # None = no depth limit
param_grid['classifier__min_samples_split'] = [2, 10]      # min samples to split a node
param_grid['classifier__max_features'] = ['sqrt', None]    # None = consider all features

### Training

In [95]:
# Exhaustive 5-fold cross-validated search over `param_grid`, using all cores.
#
# Candidates are ranked by ROC AUC instead of the default accuracy, so the
# model is selected on the same metric this notebook reports on the test set.
# NOTE(review): accuracy is also a weak selection criterion if the phishing
# class is rare -- confirm the class balance of `label`.
grid_search_rf = GridSearchCV(
    pipeline,
    param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
grid_search_rf.fit(X_train, y_train)

### Best Model Selection 🏆

Main features

In [96]:
# Winning hyperparameters and the pipeline refitted with them.
best_params = grid_search_rf.best_params_
best_model = grid_search_rf.best_estimator_

# Importances of the fitted forest, one value per training column.
# (These are feature importances, not linear-model coefficients.)
feature_importances = best_model.named_steps['classifier'].feature_importances_
feature_names = X_train.columns

# Rank features from most to least important.
top_features = np.argsort(feature_importances)[::-1]
importances = feature_importances[top_features]
names = np.array(feature_names)[top_features]

# Horizontal bar chart, with bar colour scaled by relative importance.
fig, ax = plt.subplots(figsize=(9, 6))
ax.barh(names, importances, color=plt.cm.Blues(importances / importances.max()))
# barh draws the first entry at the bottom; flip the axis so the most
# important feature is at the top of the chart.
ax.invert_yaxis()
ax.set_xlabel('Importance')
ax.set_title('Features by Importance')
fig.tight_layout()

# Export the image, then close the figure to release it.
fig.savefig("feature_importance.png", dpi=120)
plt.close(fig)

ROC Curve

In [97]:
# Probability of the positive class for each held-out sample
# (second column of predict_proba; assumes binary 0/1 labels).
y_proba = best_model.predict_proba(X_test)[:, 1]

# ROC points across all thresholds, plus the area under the curve.
fpr, tpr, _ = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='blue', lw=2, label=f'ROC (AUC = {auc:.3f})')
ax.plot([0, 1], [0, 1], 'k--', lw=1)  # diagonal reference line
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('Tasa de Falsos Positivos')
ax.set_ylabel('Tasa de Verdaderos Positivos')
ax.set_title('Curva ROC')
ax.legend(loc='lower right')
ax.grid(alpha=0.3)

# Export the image, then close the figure.
fig.savefig("roc_curve.png", dpi=120)
plt.close(fig)

Confusion Matrix

In [98]:
# Hard class predictions on the held-out set and the resulting count matrix.
y_pred = best_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# Display labels are given in class order.
# NOTE(review): assumes label 0 = not phishing and 1 = phishing -- confirm.
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=['No Phishing', 'Phishing'],
)

fig, ax = plt.subplots(figsize=(8, 6))
disp.plot(
    ax=ax,
    cmap=plt.cm.Blues,
    colorbar=False,
    values_format='d',
    xticks_rotation=0,
)

# Replace the default title and axis labels set by disp.plot.
ax.set_title('Confusion Matrix', fontsize=14)
ax.set_xlabel('Prediction', fontsize=12)
ax.set_ylabel('Real Value', fontsize=12)

# Export the image, then close the figure.
fig.tight_layout()
fig.savefig("confusion_matrix.png", dpi=120)
plt.close(fig)

In [None]:
# Persist the fitted GridSearchCV object to disk. Note that this stores the
# whole search object, not only its best_estimator_.
model_path = "classification_model.joblib"
joblib.dump(value=grid_search_rf, filename=model_path)

# Confirm where the file was written.
print(f"Modelo guardado en: {model_path}")
