In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Load dataset
# NOTE(review): absolute, user-specific path; this cell only runs where that
# file exists. Consider a path relative to the notebook instead.
df = pd.read_csv(r"C:\Users\anish\Downloads\Phishing_Email (1).csv")

# Drop index column if present
if 'Unnamed: 0' in df.columns:
    df = df.drop(columns=['Unnamed: 0'])

# Rename columns for simplicity. This is a positional rename, so it requires
# exactly two remaining columns (text first, label second).
df.columns = ['text', 'label']

# Drop rows with missing text
df = df.dropna(subset=['text'])

# Integer encoding used by the rest of the notebook: phishing -> 0, safe -> 1
label_mapping = {'phishing': 0, 'safe': 1}

# Encode textual labels. The test is "not numeric" rather than
# dtype == 'object' so that string labels are still encoded when pandas infers
# a dedicated string dtype instead of object. Already-numeric labels are left
# untouched, as before.
if not pd.api.types.is_numeric_dtype(df['label']):
    # Tolerate differences in case and surrounding whitespace
    normalized_labels = df['label'].str.strip().str.lower()
    encoded_labels = normalized_labels.map(label_mapping)

    # Series.map turns every value missing from the mapping into NaN. Fail here
    # with the offending values instead of letting NaN labels reach training.
    unmapped = encoded_labels.isna()
    if unmapped.any():
        unknown_values = sorted(df.loc[unmapped, 'label'].astype(str).unique())
        raise ValueError(
            f"Unrecognized label values {unknown_values}; "
            f"expected one of {sorted(label_mapping)}"
        )
    df['label'] = encoded_labels.astype('int64')

# Add debugging to verify labels are correct
print("Label distribution after mapping:")
print(df['label'].value_counts())
print("\nSample data:")
print(df[['text', 'label']].head())

# Hold out 20% of the emails for evaluation; the fixed seed keeps the split
# repeatable between runs
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# Fit the TF-IDF vocabulary (English stop words removed, capped at 5000
# features) on the training emails only, then reuse it for the test emails
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train the classifier; fit() hands back the fitted estimator itself
model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Score the held-out emails
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
# target_names are listed in label order: 0 -> 'phishing', 1 -> 'safe'
report = classification_report(
    y_test,
    y_pred,
    target_names=['phishing', 'safe'],
)

print(f"\nAccuracy: {accuracy:.4f}")
print("Classification Report:", report, sep="\n")

# Test prediction probabilities to verify they make sense.
# The slices below yield at most five rows, and the loop walks whatever they
# returned, so a test split with fewer than five emails no longer raises
# IndexError.
sample_predictions = model.predict_proba(X_test_tfidf)[:5]
sample_texts = X_test.iloc[:5].values
sample_labels = y_test.iloc[:5].values

print("\nSample predictions (first 5 test emails):")
for i, (email_text, true_label, class_probs) in enumerate(
    zip(sample_texts, sample_labels, sample_predictions)
):
    # Show only the first 100 characters of long emails
    if len(email_text) > 100:
        text_preview = email_text[:100] + "..."
    else:
        text_preview = email_text

    # Labels are encoded as phishing -> 0, safe -> 1
    actual_label = "phishing" if true_label == 0 else "safe"
    phishing_conf = class_probs[0]  # Confidence for phishing (class 0)
    safe_conf = class_probs[1]      # Confidence for safe (class 1)

    print(f"\nEmail {i+1}: {text_preview}")
    print(f"Actual: {actual_label}")
    print(f"Phishing confidence: {phishing_conf:.3f}")
    print(f"Safe confidence: {safe_conf:.3f}")

# Persist the fitted classifier and its TF-IDF vectorizer to the working
# directory so both can be reloaded together for inference
for artifact, filename in (
    (model, 'phishing_model.pkl'),
    (vectorizer, 'tfidf_vectorizer.pkl'),
):
    joblib.dump(artifact, filename)

print("\nModel and vectorizer saved successfully!")

Label distribution after mapping:
label
1    11299
0     7308
Name: count, dtype: int64

Sample data:
                                                text  label
0  re : 6 . 1100 , disc : uniformitarianism , re ...      1
1  the other side of * galicismos * * galicismo *...      1
2  re : equistar deal tickets are you still avail...      1
3  \nHello I am your hot lil horny toy.\n    I am...      0
4  software at incredibly low prices ( 86 % lower...      0

Accuracy: 0.9699
Classification Report:
              precision    recall  f1-score   support

    phishing       0.95      0.98      0.96      1487
        safe       0.98      0.97      0.97      2235

    accuracy                           0.97      3722
   macro avg       0.97      0.97      0.97      3722
weighted avg       0.97      0.97      0.97      3722


Sample predictions (first 5 test emails):

Email 1: adv : " free download " register your web site on over 800 + search engines " instanly " this is a r...
Actual: phish

In [13]:
import tkinter as tk
from tkinter import messagebox
import joblib

# Load model and vectorizer
# NOTE: joblib.load unpickles these files, and unpickling can execute arbitrary
# code. Only load .pkl files that come from a trusted source.
try:
    model = joblib.load('phishing_model.pkl')
    vectorizer = joblib.load('tfidf_vectorizer.pkl')
except Exception as e:
    print(f"Error loading model/vectorizer: {e}")
    # Raise SystemExit directly instead of calling exit(): that name is
    # injected by the `site` module for interactive sessions and is not meant
    # to be used in programs. The exit status (1) is unchanged.
    raise SystemExit(1)

def predict_phishing():
    """Classify the text in the entry box and show the verdict in the result label.

    Reads the module-level ``entry``, ``vectorizer``, ``model`` and
    ``result_label``. Blank input triggers a warning dialog. Any exception
    raised while vectorizing or predicting is shown in an error dialog instead
    of being propagated.

    The training code encodes the classes as phishing -> 0 and safe -> 1, so
    the verdict and both confidences are decoded with that encoding. The
    probability columns are looked up through ``model.classes_`` because
    ``predict_proba`` orders its columns to match that attribute.
    """
    # Class encoding fixed at training time
    phishing_class, safe_class = 0, 1

    email_text = entry.get()
    if not email_text.strip():
        messagebox.showwarning("Input Error", "Please enter some email text.")
        return

    try:
        email_tfidf = vectorizer.transform([email_text])
        prediction = model.predict(email_tfidf)[0]
        proba = model.predict_proba(email_tfidf)[0]

        # Resolve each class's probability column by label, not by position
        class_order = list(model.classes_)
        confidence_phishing = proba[class_order.index(phishing_class)]
        confidence_safe = proba[class_order.index(safe_class)]

        label = "Phishing Email" if prediction == phishing_class else "Safe Email"

        result_text = (f"Prediction: {label}\n"
                       f"Confidence - Safe: {confidence_safe:.2f}, Phishing: {confidence_phishing:.2f}")
        result_label.config(text=result_text)
    except Exception as err:
        messagebox.showerror("Prediction Error", f"An error occurred:\n{err}")

# Build the single-window test UI: prompt, text entry, button, result line
root = tk.Tk()
root.title("Phishing Email Detector - Quick Test")

prompt_label = tk.Label(master=root, text="Enter email text to test:")
prompt_label.pack(pady=5)

# predict_phishing reads the email text from this widget
entry = tk.Entry(master=root, width=80)
entry.pack(pady=5)

btn = tk.Button(master=root, text="Predict", command=predict_phishing)
btn.pack(pady=10)

# predict_phishing writes its verdict into this label
result_label = tk.Label(master=root, text="", font=("Arial", 12), fg="blue")
result_label.pack(pady=5)

# Start the Tk event loop
root.mainloop()
