In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split


In [2]:
# Read the phishing-email dataset from a CSV file located in the working directory.
# Data source: https://huggingface.co/datasets/zefang-liu/phishing-email-dataset
df = pd.read_csv(filepath_or_buffer="Phishing_Email.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'Phishing_Email.csv'

In [None]:
# Dimensions of the loaded frame, shown as a (rows, columns) tuple
df.shape

In [None]:
# Column labels of the loaded frame
df.columns

In [None]:

# Quick data-quality overview of the raw frame `df`.

# Column names
print("\n📋 Columns:", df.columns.tolist())

# Missing values per column
print("\n🔍 Missing values:\n", df.isnull().sum())

# Class distribution, expressed as percentages of the non-missing labels
print("\n⚖️ Class distribution:")
print(df['Email Type'].value_counts(normalize=True) * 100)

# Fully duplicated rows. This compares ALL columns, including 'Unnamed: 0';
# if that column is a unique per-row ID (checked below), this count is 0 by
# construction and says nothing about repeated emails.
print("\n🔍 Duplicated rows:", df.duplicated().sum())

# Duplicated emails: compare only the content columns so that repeated
# messages are detected even when each row carries a different ID.
print(
    "\n🔍 Duplicated emails (same text and type):",
    df.duplicated(subset=['Email Text', 'Email Type']).sum(),
)

# Unique ID consistency: number of distinct 'Unnamed: 0' values vs. row count
print("\n🔎 Unique Email No.:", df['Unnamed: 0'].nunique(), "/", len(df))

In [None]:
# Optional: visualize class distribution
# value_counts() orders classes by frequency, so the bar order depends on the
# data. The tick labels are therefore taken from the actual class values
# (the index of `class_counts`) rather than hard-coded, which could attach
# the wrong name to a bar or fail when the number of classes is not two.
class_counts = df['Email Type'].value_counts()

fig, ax = plt.subplots(figsize=(4, 4))
class_counts.plot(kind='bar', ax=ax, rot=0, title='Class Distribution (Label)')
ax.set_ylabel("Count")
plt.show()

In [None]:
# Keep only rows that actually carry email text, in two successive filters:
#   1. drop rows whose 'Email Text' is missing (NaN);
#   2. drop rows whose text is empty after stripping surrounding whitespace
#      (spaces, \n, \t) - an empty string converts to False.
df_clean = (
    df
    .loc[df['Email Text'].notna()]
    .loc[lambda frame: frame['Email Text'].str.strip().astype(bool)]
)

In [None]:
# Count of missing values per column in the cleaned frame
df_clean.isna().sum()

In [None]:

# Split the cleaned frame into model inputs and labels.
# X stays a one-column DataFrame (list selector); y is a Series that is also
# used to stratify the split.
X = df_clean.loc[:, ['Email Text']]
y = df_clean.loc[:, 'Email Type']


In [None]:

# Stratified hold-out split: 30% of the rows go to the test set, 70% to train.
# Stratifying on y keeps the class proportions the same in both parts, and
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.3,
    random_state=42,
)


In [None]:
# Re-attach the labels to the text so each split is a single frame.
# assign() returns a new frame (inputs are left untouched) and aligns the
# label Series on the row index.
train_df = X_train.assign(Label=y_train)
test_df = X_test.assign(Label=y_test)

# Report the size of each split as (rows, columns)
print("\nFinal splits:")
for split_name, split_df in (("Train:", train_df), ("Test:", test_df)):
    print(split_name, split_df.shape)



In [None]:
# Save the train/test splits as CSV files (row index omitted).
output_dir = Path("Data")
# to_csv() does not create missing parent directories, so make sure the
# target folder exists; exist_ok keeps the cell safe to re-run.
output_dir.mkdir(parents=True, exist_ok=True)

train_df.to_csv(output_dir / "train.csv", index=False)
test_df.to_csv(output_dir / "test.csv", index=False)