In [1]:
import pandas as pd

In [2]:
df_emails = pd.read_csv('Phishing_Email.csv')

In [3]:
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.max_rows', None)  # Show all rows

In [4]:
print(df_emails.shape)
df_emails.head(10)

(18650, 3)


Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,1,the other side of * galicismos * * galicismo *...,Safe Email
2,2,re : equistar deal tickets are you still avail...,Safe Email
3,3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email
4,4,software at incredibly low prices ( 86 % lower...,Phishing Email
5,5,global risk management operations sally congra...,Safe Email
6,6,"On Sun, Aug 11, 2002 at 11:17:47AM +0100, wint...",Safe Email
7,7,"entourage , stockmogul newsletter ralph velez ...",Phishing Email
8,8,"we owe you lots of money dear applicant , afte...",Phishing Email
9,9,re : coastal deal - with exxon participation u...,Safe Email


In [5]:
df_emails['Email Type'].value_counts()

Email Type
Safe Email        11322
Phishing Email     7328
Name: count, dtype: int64

In [8]:
df_emails['Email Text'].isna().sum()

16

# Data cleaning

In [9]:
# Drop the index column
df_emails = df_emails.drop(columns=['Unnamed: 0'])

# Drop rows with missing 'Email Text'
df_emails = df_emails.dropna(subset=['Email Text'])

# Standardize column names
df_emails.columns = ['text', 'label']

# Check updated shape and label distribution
df_emails.shape, df_emails['label'].value_counts()

((18634, 2),
 label
 Safe Email        11322
 Phishing Email     7312
 Name: count, dtype: int64)

In [11]:
df_emails.head(3)


Unnamed: 0,text,label
0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,the other side of * galicismos * * galicismo *...,Safe Email
2,re : equistar deal tickets are you still avail...,Safe Email


# EDA

In [14]:
# Add word count column
df_emails['word_count'] = df_emails['text'].apply(lambda x: len(x.split()))

# Average word count
print(df_emails.groupby('label')['word_count'].mean().round(2))


label
Phishing Email    301.85
Safe Email        685.87
Name: word_count, dtype: float64


- Safe Emails tend to be twice as long as Phishing.

In [15]:
from collections import Counter
import re
import nltk
from nltk.corpus import stopwords

# Run this once to download stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

# Tokenization function
def tokenize_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())  # Lowercase, remove punctuation
    words = text.split()
    return [word for word in words if word not in stop_words]

# Apply to both phishing and safe emails
phishing_words = df_emails[df_emails['label'] == 'Phishing Email']['text'].apply(tokenize_text)
safe_words = df_emails[df_emails['label'] == 'Safe Email']['text'].apply(tokenize_text)

# Flatten lists
phishing_flat = [word for sublist in phishing_words for word in sublist]
safe_flat = [word for sublist in safe_words for word in sublist]

# Get most common 20 words
phishing_common = Counter(phishing_flat).most_common(20)
safe_common = Counter(safe_flat).most_common(20)

print("Phishing Common Words:\n", phishing_common)
print("\nSafe Email Common Words:\n", safe_common)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/riadanas/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Phishing Common Words:
 [('email', 6618), ('free', 4329), ('com', 3976), ('get', 3871), ('company', 3863), ('information', 3846), ('please', 3827), ('money', 3794), ('business', 3575), ('one', 3572), ('us', 3544), ('time', 3194), ('e', 3154), ('report', 3122), ('http', 3120), ('new', 3032), ('click', 2938), ('make', 2739), ('order', 2687), ('mail', 2588)]

Safe Email Common Words:
 [('enron', 19271), ('university', 17181), ('language', 16843), ('one', 11386), ('subject', 11042), ('ect', 11005), ('would', 10834), ('information', 10817), ('email', 10671), ('new', 10338), ('please', 9925), ('linguistics', 8908), ('e', 8708), ('also', 8440), ('conference', 8019), ('may', 7704), ('de', 7468), ('com', 7063), ('papers', 6891), ('like', 6878)]


- We can spot here a few interesting keywords for Phishing:
- free, money, click, order

# Text processing

In [16]:
def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    return ' '.join([word for word in text.split() if word not in stop_words])

df_emails['clean_text'] = df_emails['text'].apply(clean_text)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

vectorizer = TfidfVectorizer(max_features=3000)
X = vectorizer.fit_transform(df_emails['clean_text'])

y = df_emails['label'].apply(lambda x: 1 if x == 'Phishing Email' else 0)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape, X_test.shape)


(14907, 3000) (3727, 3000)


In [17]:
df_emails.head(10)

Unnamed: 0,text,label,word_count,clean_text
0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email,230,disc uniformitarianism sex lang dick hudson ob...
1,the other side of * galicismos * * galicismo *...,Safe Email,91,side galicismos galicismo spanish term names i...
2,re : equistar deal tickets are you still avail...,Safe Email,305,equistar deal tickets still available assist r...
3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email,96,hello hot lil horny toy one dream open minded ...
4,software at incredibly low prices ( 86 % lower...,Phishing Email,91,software incredibly low prices lower drapery s...
5,global risk management operations sally congra...,Safe Email,592,global risk management operations sally congra...
6,"On Sun, Aug 11, 2002 at 11:17:47AM +0100, wint...",Safe Email,153,sun aug wintermute mentioned impression get re...
7,"entourage , stockmogul newsletter ralph velez ...",Phishing Email,1384,entourage stockmogul newsletter ralph velez ge...
8,"we owe you lots of money dear applicant , afte...",Phishing Email,142,owe lots money dear applicant review upon rece...
9,re : coastal deal - with exxon participation u...,Safe Email,359,coastal deal exxon participation project agree...


# Modeling and Eval

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

# Model Training
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluation
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=["Safe", "Phishing"]))

Confusion Matrix:
 [[2121   88]
 [  57 1461]]

Classification Report:
               precision    recall  f1-score   support

        Safe       0.97      0.96      0.97      2209
    Phishing       0.94      0.96      0.95      1518

    accuracy                           0.96      3727
   macro avg       0.96      0.96      0.96      3727
weighted avg       0.96      0.96      0.96      3727



In [20]:
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

def objective(trial):
    # Suggest hyperparameters
    c = trial.suggest_loguniform('C', 1e-3, 1e2)
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
    solver = 'liblinear' if penalty == 'l1' else 'lbfgs'

    # Create model
    model = LogisticRegression(C=c, penalty=penalty, solver=solver, max_iter=1000)

    # Cross-validation F1 score
    score = cross_val_score(model, X_train, y_train, scoring='f1', cv=3).mean()
    return score

# Run optimization
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

# Best params
print("Best params:", study.best_params)


[I 2025-06-15 18:38:42,514] A new study created in memory with name: no-name-9b656a86-1b04-4964-9578-2f5eb6933e83
  c = trial.suggest_loguniform('C', 1e-3, 1e2)
[I 2025-06-15 18:38:42,861] Trial 0 finished with value: 0.9522732165977968 and parameters: {'C': 1.8956110816685119, 'penalty': 'l2'}. Best is trial 0 with value: 0.9522732165977968.
  c = trial.suggest_loguniform('C', 1e-3, 1e2)
[I 2025-06-15 18:38:43,255] Trial 1 finished with value: 0.9546603207069362 and parameters: {'C': 3.7664140497808507, 'penalty': 'l2'}. Best is trial 1 with value: 0.9546603207069362.
  c = trial.suggest_loguniform('C', 1e-3, 1e2)
[I 2025-06-15 18:38:43,583] Trial 2 finished with value: 0.9517503862349773 and parameters: {'C': 1.6613436076848902, 'penalty': 'l2'}. Best is trial 1 with value: 0.9546603207069362.
  c = trial.suggest_loguniform('C', 1e-3, 1e2)
[I 2025-06-15 18:38:43,638] Trial 3 finished with value: 0.0 and parameters: {'C': 0.00226797842998768, 'penalty': 'l1'}. Best is trial 1 with val

Best params: {'C': 6.55306160448068, 'penalty': 'l2'}


In [21]:
best_params = study.best_params
final_model = LogisticRegression(**best_params, max_iter=1000)
final_model.fit(X_train, y_train)
final_preds = final_model.predict(X_test)

print("Confusion Matrix:\n", confusion_matrix(y_test, final_preds))
print("\nClassification Report:\n", classification_report(y_test, final_preds, target_names=["Safe", "Phishing"]))


Confusion Matrix:
 [[2114   95]
 [  34 1484]]

Classification Report:
               precision    recall  f1-score   support

        Safe       0.98      0.96      0.97      2209
    Phishing       0.94      0.98      0.96      1518

    accuracy                           0.97      3727
   macro avg       0.96      0.97      0.96      3727
weighted avg       0.97      0.97      0.97      3727



- RECALL MATTERS MOST: 
- In phishing detection, missing a phishing email (false negative) is much worse than wrongly flagging a legit email (false positive).

# Testing

In [23]:
# Sample emails
sample_emails = [
    # 1 - Obvious phishing
    "Your PayPal account has been suspended. Click the link below to restore access immediately.",
    
    # 2 - Safe
    "Hi John, just confirming our lunch meeting next Tuesday at 1pm. Let me know if that still works.",
    
    # 3 - Subtle phishing
    "This is IT Support. Please upload your password file to the link below so we can verify access before the upgrade.",
    
    # 4 - Safe
    "Reminder: Your HR policy document is due for review. Please acknowledge by Friday.",
    
    # 5 - Obvious phishing
    "You’ve won a £1000 Amazon voucher! Click here to claim your reward before it expires!",
    
    # 6 - Safe
    "Team, the weekly report is attached. Let's review it briefly before the 10:30 meeting.",
    
    # 7 - Subtle phishing
    "Please update your payroll info to avoid delays. Use the secure form here: update-payroll-data.co",
    
    # 8 - Safe
    "Good morning, attached is the updated invoice for March. Let me know if you have any questions.",
    
    # 9 - Obvious phishing
    "Final warning: Your email access will be revoked. Click now to confirm your credentials.",
    
    # 10 - Subtle phishing
    "Dear user, due to unusual activity, we've restricted your access. Use the form to reinstate your email privileges."
]

# Clean them using the same clean_text() function
sample_cleaned = [clean_text(email) for email in sample_emails]

# Vectorize using the existing TF-IDF model
sample_vectors = vectorizer.transform(sample_cleaned)

# Predict using the trained logistic regression model
sample_preds = model.predict(sample_vectors)
sample_probs = model.predict_proba(sample_vectors)

# Display results
for text, pred, prob in zip(sample_emails, sample_preds, sample_probs):
    label = "Phishing" if pred == 1 else "Safe"
    confidence = round(max(prob), 2)
    print(f"\nEmail:\n{text}\n→ Prediction: {label} (Confidence: {confidence})")



Email:
Your PayPal account has been suspended. Click the link below to restore access immediately.
→ Prediction: Phishing (Confidence: 0.97)

Email:
Hi John, just confirming our lunch meeting next Tuesday at 1pm. Let me know if that still works.
→ Prediction: Safe (Confidence: 0.97)

Email:
This is IT Support. Please upload your password file to the link below so we can verify access before the upgrade.
→ Prediction: Safe (Confidence: 0.77)

Email:
Reminder: Your HR policy document is due for review. Please acknowledge by Friday.
→ Prediction: Safe (Confidence: 0.9)

Email:
You’ve won a £1000 Amazon voucher! Click here to claim your reward before it expires!
→ Prediction: Phishing (Confidence: 0.93)

Email:
Team, the weekly report is attached. Let's review it briefly before the 10:30 meeting.
→ Prediction: Safe (Confidence: 0.96)

Email:
Please update your payroll info to avoid delays. Use the secure form here: update-payroll-data.co
→ Prediction: Phishing (Confidence: 0.58)

Email:
G