In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LeakyReLU
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

nltk.download('stopwords')


[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>


False

# STEP 1: DATA

In [7]:
df = pd.read_csv("dataset/phishing_email.csv")
print(df.head())

# check columns
if "text_combined" not in df.columns or "label" not in df.columns:
    raise ValueError("ckeck dataset it have a 'text_combined' และ 'label'")

FileNotFoundError: [Errno 2] No such file or directory: 'dataset/phishing_email.csv'

In [4]:
df.info()

NameError: name 'df' is not defined

In [None]:
df.columns

# STEP 2: DATA PREPROCESSING

In [None]:
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    stop_words = set(stopwords.words('english'))
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

#clean data
df["cleaned_text"] = df["text_combined"].apply(clean_text)

In [None]:
df = df.dropna()

In [None]:
# Dropping N/A Values
print(df.isnull().sum())

In [None]:
# Dropping Duplicates
df = df.drop_duplicates()

In [None]:
# Balance Dataset
print(df['label'].value_counts())
phishing_emails = df[df['label'] == 1]
non_phishing_emails = df[df['label'] == 0]

In [None]:
vectorizer = TfidfVectorizer(max_features=5000)  # ใช้ 5000 คำที่สำคัญที่สุด
X = vectorizer.fit_transform(df["cleaned_text"]).toarray()
y = np.array(df["label"])  # แปลง Labels เป็น NumPy Array

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
print(X_train.shape,X_train.dtype)

# WordCloud

In [None]:
from wordcloud import WordCloud
phishing_emails = df[df['label'] == 1]['text_combined']
non_phishing_emails = df[df['label'] == 0]['text_combined']

In [None]:
def plot_wordcloud(text, title):
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(text))
    plt.figure(figsize=(12, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(title, fontsize=15)
    plt.axis('off')
    plt.show()

In [None]:
phishing_text = df[df['label'] == 1]['text_combined']
non_phishing_text = df[df['label'] == 0]['text_combined']

plot_wordcloud(phishing_text, "Most Common Words in Phishing Emails")

In [None]:
plot_wordcloud(non_phishing_text, "Most Common Words in Non-Phishing Emails")

# ANN

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LeakyReLU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

In [None]:
# สร้างโมเดล
model = Sequential([
    Dense(1024, kernel_regularizer=l2(0.001), input_shape=(5000,)),  # เพิ่มหน่วย
    LeakyReLU(alpha=0.01),
    BatchNormalization(),
    Dropout(0.5),

    Dense(512, kernel_regularizer=l2(0.001)),
    LeakyReLU(alpha=0.01),
    BatchNormalization(),
    Dropout(0.4),

    Dense(256, kernel_regularizer=l2(0.001)),
    LeakyReLU(alpha=0.01),
    BatchNormalization(),
    Dropout(0.4),

    Dense(128, kernel_regularizer=l2(0.001)),
    LeakyReLU(alpha=0.01),
    BatchNormalization(),
    Dropout(0.4),

    Dense(1, activation='sigmoid')
])

# คอมไพล์โมเดล
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# ดูโครงสร้างโมเดล
model.summary()

# Callbacks
callbacks = [
    EarlyStopping(monitor='val_loss', patience=7, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)
]

# 🚀 ฝึกโมเดล
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=32, callbacks=callbacks)


# Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay

# predict
y_pred = (model.predict(X_test) > 0.5).astype(int)

# display Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Normal", "Social Engineering"])
disp.plot(cmap='Blues')
plt.title("Confusion Matrix")
plt.show()

print(classification_report(y_test, y_pred, target_names=["Normal", "Social Engineering"]))


# Precision-Recall Curve

In [None]:
from sklearn.metrics import precision_recall_curve, auc

# คำนวณค่า Precision-Recall
y_prob = model.predict(X_test).ravel()
precision, recall, thresholds = precision_recall_curve(y_test, y_prob)

# คำนวณ AUC-PR
pr_auc = auc(recall, precision)

plt.plot(recall, precision, color='green', label=f'AUC = {pr_auc:.2f}')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc='best')
plt.grid(True)
plt.show()


# Learning Curve (Train vs Validation Loss/Accuracy)

In [None]:
#Display Learning Curve
plt.figure(figsize=(12, 5))

# Loss Curve
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Train Loss', color='blue')
plt.plot(history.history['val_loss'], label='Validation Loss', color='red')
plt.title('Learning Curve: Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)

# Accuracy Curve
plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'], label='Train Accuracy', color='blue')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy', color='red')
plt.title('Learning Curve: Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.show()


# Save Model

In [None]:
import pickle
from tensorflow.keras.models import load_model

# Save the trained model as HDF5
model.save("phishing_email_model.h5")
model = load_model("phishing_email_model.h5", compile=False)

# Save the TfidfVectorizer using pickle
with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

print("✅ Model and vectorizer saved successfully!")


# Test Model

In [None]:
def predict_email(text):
    cleaned_text = clean_text(text)
    text_vector = vectorizer.transform([cleaned_text]).toarray()
    prediction = model.predict(text_vector)[0][0]
    return "🚨 Social Engineering Detected" if prediction > 0.5 else "✅ Normal Message"

In [None]:
import random

test_emails = [
    # Social Engineering (Label = 1)
    ("URGENT: Your bank account has been locked due to suspicious activity. Verify now!", 1),
    ("Dear customer, we've noticed an issue with your credit card. Click here to fix it!", 1),
    ("Security Alert: Your password will expire today. Reset now to avoid account suspension.", 1),
    ("This is Apple Support. Someone attempted to log in from an unknown device. Secure your account now.", 1),
    ("Your PayPal account is under review due to unusual activity. Verify your identity now.", 1),
    ("Congratulations! You won a free vacation. Claim your reward before it expires!", 1),
    ("We've detected fraudulent activity on your account. Click below to verify and stay protected.", 1),
    ("Your Netflix subscription has been suspended. Update your billing info to reactivate.", 1),
    ("Warning: Your email storage is full. Upgrade now to continue receiving emails.", 1),
    ("Get a free iPhone 15! Limited-time offer, claim yours now.", 1),
    ("Confirm your identity to prevent your Facebook account from being disabled.", 1),
    ("Your Amazon order has been flagged as suspicious. Verify your purchase immediately.", 1),
    ("Final Notice: Your car insurance is expiring soon. Renew now to avoid penalties.", 1),
    ("Your Gmail account is scheduled for deactivation. Click here to cancel the request.", 1),
    ("We noticed login attempts from a new location. Secure your email account now.", 1),
    ("Dear user, your social security number has been compromised. Protect it now!", 1),
    ("Special offer: Exclusive $500 Walmart gift card for lucky users! Claim now.", 1),
    ("Your tax refund is ready! Submit your bank details to receive your payment.", 1),
    ("Important: Your medical insurance is inactive. Reactivate now to stay covered.", 1),
    ("You have received a secure message from your bank. Click here to read it.", 1),
    ("Your Twitter account has been reported for violating policies. Verify now to avoid suspension.", 1),
    ("Warning: Someone accessed your Dropbox files. Reset your password immediately.", 1),
    ("Dear customer, unauthorized transactions detected in your account. Secure it now.", 1),
    ("Your Apple ID has been locked for security reasons. Restore access here.", 1),
    ("Bank Alert: Your last transaction was declined. Confirm your card details now.", 1),
    ("Your Uber payment has failed. Update your billing details to continue using our service.", 1),
    ("Exclusive offer: Work from home and earn $500/day! Sign up now.", 1),
    ("We have a delivery for you but need address confirmation. Click here to update.", 1),
    ("Dear user, we noticed unusual purchases on your eBay account. Verify transactions now.", 1),
    ("Your health insurance plan needs urgent verification. Click here to confirm.", 1),
    ("Your flight booking is incomplete. Pay now to secure your seat.", 1),
    ("Suspicious login attempt detected on your Spotify account. Verify if this was you.", 1),
    ("Your cryptocurrency wallet is at risk! Secure your funds now.", 1),
    ("Your Windows license is about to expire. Renew it immediately.", 1),
    ("Government grant available! Claim up to $10,000 today. Apply now.", 1),
    ("You are eligible for a special loan offer. Pre-approve your application now.", 1),
    ("Security warning: Your system has been infected with a virus. Scan now!", 1),
    ("Urgent: Confirm your bank transfer before it gets canceled.", 1),
    ("Attention: We need to verify your Amazon Prime subscription. Click here.", 1),
    ("Limited offer: Get a free gift card for completing this short survey.", 1),
    ("Your online banking password has been reset. Click to set a new one.", 1),
    ("A package addressed to you is pending verification. Confirm now.", 1),
    ("Your social media account was flagged for suspicious activity. Secure it now.", 1),
    ("We are updating our privacy policy. Accept the new terms to continue using our service.", 1),
    ("A suspicious withdrawal was detected from your bank account. Verify now.", 1),
    ("Your LinkedIn profile is at risk. Confirm your login to avoid suspension.", 1),
    ("Congratulations! You've been selected for a mystery shopping job. Apply now!", 1),
    ("Alert: Your mobile carrier account has been accessed from an unknown location.", 1),
    ("Immediate action required: Update your online banking profile to avoid service disruption.", 1),

    # Normal Messages (Label = 0)
    ("Hey, just checking in. How have you been?", 0),
    ("Reminder: Our meeting is scheduled for tomorrow at 2 PM.", 0),
    ("Your package has been shipped! Track your order here.", 0),
    ("Join us for a webinar on AI trends next week.", 0),
    ("Hope you're doing well. Let's catch up this weekend!", 0),
    ("The weather looks great for our hike on Saturday!", 0),
    ("Don't forget about our lunch meeting at 1 PM.", 0),
    ("Happy birthday! Hope you have a fantastic day!", 0),
    ("Your flight itinerary has been updated. Check your email for details.", 0),
    ("Great job on your presentation today!", 0),
    ("We have a new product launch next month. Stay tuned!", 0),
    ("Can you review the report before the deadline?", 0),
    ("Let's plan a team dinner next week.", 0),
    ("Your appointment is confirmed for Monday at 10 AM.", 0),
    ("Please send me the updated project files.", 0),
    ("Reminder: Your gym membership renewal is due next week.", 0),
    ("Looking forward to your feedback on the proposal.", 0),
    ("Meeting rescheduled to 3 PM. Let me know if that works.", 0),
    ("Here's the recipe you asked for. Hope you like it!", 0),
    ("Enjoy a 20% discount on our new collection!", 0),
    ("Thanks for your support! We appreciate your feedback.", 0),
    ("Movie night this Friday? Let me know!", 0),
    ("Your dinner reservation is confirmed.", 0),
    ("Hope you're having a great day!", 0),
]

# สุ่มเรียงข้อมูลใหม่
random.shuffle(test_emails)

# ทดสอบโมเดล
correct_predictions = 0
total_samples = len(test_emails)

for email, true_label in test_emails:
    prediction_text = predict_email(email)  # เรียกใช้ฟังก์ชันที่มีอยู่
    predicted_label = 1 if "🚨" in prediction_text else 0  # แปลงผลลัพธ์เป็น 1 หรือ 0

    if predicted_label == true_label:
        correct_predictions += 1

# คำนวณ % ความถูกต้อง
accuracy = (correct_predictions / total_samples) * 100

# แสดงผลลัพธ์
print(f"Total Samples: {total_samples}")
print(f"Correct Predictions: {correct_predictions}")
print(f"Accuracy: {accuracy:.2f}%")