In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, GRU, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string

# Fetch the NLTK corpora that clean_text relies on: the stop-word list and
# the WordNet data used by WordNetLemmatizer.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Read the e-mail dataset into a DataFrame. The path is relative, so the CSV
# must be present in the current working directory.
df = pd.read_csv(filepath_or_buffer='Phishing_Email.csv')

# Data Cleaning and Preprocessing
# Patterns are compiled once here instead of on every clean_text call.
_URL_PATTERN = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
_EMAIL_PATTERN = re.compile(r'\S+@\S+')
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')

# Filled lazily on first use so the NLTK corpora are only read when a text
# actually needs stop-word filtering / lemmatization.
_NLP_RESOURCES = {}


def _get_nlp_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it once."""
    if not _NLP_RESOURCES:
        _NLP_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _NLP_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _NLP_RESOURCES['stop_words'], _NLP_RESOURCES['lemmatizer']


def clean_text(text):
    """Normalize one e-mail body into a space-separated string of lemmas.

    Steps, in order: drop URLs, drop e-mail addresses, drop every character
    that is not an ASCII letter or whitespace, lowercase, remove English
    stop words, lemmatize each remaining word.

    Args:
        text: The raw e-mail text. Non-string values are converted with
            ``str``; ``None`` and float NaN (a missing cell in the CSV)
            are treated as empty text.

    Returns:
        The cleaned text, or ``''`` when the input is missing or nothing
        survives the cleaning.
    """
    # A missing cell would otherwise become the literal token "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text)

    # Strip URLs first, then e-mail addresses, then everything that is not
    # a letter or whitespace (digits and punctuation included).
    text = _URL_PATTERN.sub('', text)
    text = _EMAIL_PATTERN.sub('', text)
    text = _NON_LETTER_PATTERN.sub('', text)

    words = text.lower().split()
    if not words:
        # Nothing left to filter; skip loading the NLTK resources.
        return ''

    stop_words, lemmatizer = _get_nlp_resources()
    return ' '.join(
        lemmatizer.lemmatize(word) for word in words if word not in stop_words
    )

# Run every raw e-mail body through clean_text and keep the result alongside
# the original column.
df['cleaned_text'] = df['Email Text'].map(clean_text)

# Binary target: 1 for rows whose type is exactly 'Phishing Email', 0 for
# every other value.
df['label'] = df['Email Type'].eq('Phishing Email').astype('int64')

# --- Exploratory data analysis ---------------------------------------------

# Number of e-mails per class, printed as a table and drawn as a bar chart.
class_counts = df['Email Type'].value_counts()
print("Class Distribution:")
print(class_counts)

plt.figure(figsize=(8, 5))
sns.countplot(data=df, x='Email Type')
plt.title('Class Distribution')
plt.show()

# Character length of each cleaned e-mail, compared across the classes with
# a 50-bin histogram.
df['text_length'] = df['cleaned_text'].map(len)

plt.figure(figsize=(10, 6))
sns.histplot(x='text_length', hue='Email Type', bins=50, data=df)
plt.title('Distribution of Text Length by Class')
plt.show()

# Word clouds: one panel per class, side by side in a single figure.
from wordcloud import WordCloud

# Concatenate the cleaned text of each class into one long string.
phishing_text = ' '.join(df.loc[df['Email Type'] == 'Phishing Email', 'cleaned_text'])
safe_text = ' '.join(df.loc[df['Email Type'] == 'Safe Email', 'cleaned_text'])

cloud_panels = [
    ('Phishing Emails Word Cloud', phishing_text),
    ('Safe Emails Word Cloud', safe_text),
]

plt.figure(figsize=(15, 6))
for panel_index, (panel_title, panel_text) in enumerate(cloud_panels, start=1):
    plt.subplot(1, 2, panel_index)
    wordcloud = WordCloud(width=800, height=400).generate(panel_text)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(panel_title)
    plt.axis('off')
plt.show()

# Features are the cleaned e-mail texts; the target is the binary label.
X, y = df['cleaned_text'], df['label']

# Hold out 20% of the rows for testing, stratified on the label, with a
# fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# TF-IDF features capped at 5000 terms. The vectorizer is fitted on the
# training texts only; the test texts are transformed with that same fit.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Karim\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Karim\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


FileNotFoundError: [Errno 2] No such file or directory: 'Phishing_Email.csv'