In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
# from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load your dataset
df = pd.read_csv('phishing_email.csv')  # Update path if needed

# Use correct column names: 'text_combined' for email text and 'label' for labels
X = df['text_combined'].str.lower()
y = df['label']

# Feature extraction
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
X_vec = vectorizer.fit_transform(X)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)

# Models - using CPU-based algorithms only (no GPU required)
models = {
    'Naive Bayes': MultinomialNB(),
    'SVM': LinearSVC(),
    # 'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)  # Commented out
}

results = {}

# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    results[name] = {
        'accuracy': acc,
        'precision': report['weighted avg']['precision'],
        'recall': report['weighted avg']['recall'],
        'f1-score': report['weighted avg']['f1-score']
    }
    print(f"\n{name} Classification Report:\n", classification_report(y_test, y_pred))

# Display results and best model
best_model = max(results, key=lambda x: results[x]['accuracy'])
print("\nModel Comparison Results:")
for name, metrics in results.items():
    print(f"{name}: Accuracy={metrics['accuracy']:.4f}, Precision={metrics['precision']:.4f}, Recall={metrics['precision']:.4f}, F1-Score={metrics['f1-score']:.4f}")

print(f"\nBest performing model based on accuracy: {best_model}")
print("\nNote: These models run on CPU only - no GPU required!")



Naive Bayes Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.99      7935
           1       0.99      0.98      0.99      8563

    accuracy                           0.99     16498
   macro avg       0.99      0.99      0.99     16498
weighted avg       0.99      0.99      0.99     16498


SVM Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99      7935
           1       0.99      1.00      0.99      8563

    accuracy                           0.99     16498
   macro avg       0.99      0.99      0.99     16498
weighted avg       0.99      0.99      0.99     16498


Model Comparison Results:
Naive Bayes: Accuracy=0.9872, Precision=0.9872, Recall=0.9872, F1-Score=0.9872
SVM: Accuracy=0.9918, Precision=0.9918, Recall=0.9918, F1-Score=0.9918

Best performing model based on accuracy: SVM

Note: These models run on CPU only - no GPU required!


In [None]:
# Save the best model and vectorizer
import joblib
import os
from pathlib import Path

# Get the best model object
best_model_obj = models[best_model]

# Create backend/models directory if it doesn't exist
backend_models_dir = Path("backend/models")
backend_models_dir.mkdir(parents=True, exist_ok=True)

# Save to both locations (root and backend/models)
# Root location (for reference)
joblib.dump(best_model_obj, 'best_phishing_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

# Backend location (for deployment)
joblib.dump(best_model_obj, backend_models_dir / 'best_phishing_model.pkl')
joblib.dump(vectorizer, backend_models_dir / 'tfidf_vectorizer.pkl')

print(f"\n✓ Model saved successfully!")
print(f"  Root directory:")
print(f"    - best_phishing_model.pkl ({best_model})")
print(f"    - tfidf_vectorizer.pkl")
print(f"\n  Backend directory (ready for deployment):")
print(f"    - backend/models/best_phishing_model.pkl")
print(f"    - backend/models/tfidf_vectorizer.pkl")
print(f"\n  Best model: {best_model}")
print(f"  Model type: {type(best_model_obj).__name__}")

# To load later, use:
# loaded_model = joblib.load('best_phishing_model.pkl')
# loaded_vectorizer = joblib.load('tfidf_vectorizer.pkl')


✓ Model saved successfully!
  Root directory:
    - best_phishing_model.pkl (SVM)
    - tfidf_vectorizer.pkl

  Backend directory (ready for deployment):
    - backend/models/best_phishing_model.pkl
    - backend/models/tfidf_vectorizer.pkl

  Best model: SVM
  Model type: LinearSVC
