In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Load dataset
df = pd.read_csv(r"C:\Users\anish\Downloads\Phishing_Email.csv\Phishing_Email.csv")

# Drop index column if present
if 'Unnamed: 0' in df.columns:
    df = df.drop(columns=['Unnamed: 0'])

# Rename columns for simplicity
df.columns = ['text', 'label']

# Drop rows with missing text
df = df.dropna(subset=['text'])

# Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42
)

# Vectorize email text using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

# Evaluate model
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(report)

joblib.dump(model, 'phishing_model.pkl')

# Save the TF-IDF vectorizer
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

Accuracy: 0.9657
Classification Report:
                precision    recall  f1-score   support

Phishing Email       0.95      0.97      0.96      1518
    Safe Email       0.98      0.96      0.97      2209

      accuracy                           0.97      3727
     macro avg       0.96      0.97      0.96      3727
  weighted avg       0.97      0.97      0.97      3727



['tfidf_vectorizer.pkl']