<a href="https://colab.research.google.com/github/anna-asmaryan/NSBE-Hacks/blob/email_phishing/NSBE_Hacks_2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Import needed libraries
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import re
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import kagglehub
from kagglehub import KaggleDatasetAdapter

In [3]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]

# CSV files to load from the Kaggle phishing-email dataset
file_paths = ["CEAS_08.csv", "Nazario.csv", "Nigerian_Fraud.csv", "SpamAssasin.csv"]

# Columns kept from every file: sender, receiver, date, subject, body, urls, and label
columns = ["sender", "receiver", "date", "subject", "body", "urls", "label"]

# Load the latest version of each file and keep only the shared columns.
# The per-file frames are collected in a list and concatenated once at the end:
# calling pd.concat inside the loop re-copies all accumulated rows every
# iteration, and seeding it with an empty all-object frame is flagged as
# deprecated by recent pandas releases.
# NOTE(review): the empty seed frame may also have been forcing numeric columns
# such as `label` to object dtype -- confirm df.dtypes after this change.
frames = []
for file_path in file_paths:
  df_temp = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    "naserabdullahalam/phishing-email-dataset",
    file_path,
    # Provide any additional arguments like
    # sql_query or pandas_kwargs. See the
    # documentation for more information:
    # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
  )
  frames.append(df_temp.loc[:, columns])

# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# repeating each file's own row labels.
df = pd.concat(frames, ignore_index=True)



Cleaning the dataset

In [4]:
def _show(title, value):
  """Print `title` and `value` separated by a space, followed by a blank line."""
  print(title, value, "\n")


# Peek at the first rows and check the dimensions of the dataset
_show("First 5 records:\n", df.head())
_show("Dimensions:\n", df.shape)

# Summary statistics and dtype of every column
_show("Variables:\n", df.describe())
_show("Variables types:\n", df.dtypes)

# Number of missing values per column
_show("Missing values:\n", df.isnull().sum())

First 5 records:
                                               sender  \
0                   Young Esposito <Young@iworld.de>   
1                       Mok <ipline's1983@icable.ph>   
2  Daily Top 10 <Karmandeep-opengevl@universalnet...   
3                 Michael Parker <ivqrnai@pobox.com>   
4  Gretchen Suggs <externalsep1@loanofficertool.com>   

                                         receiver  \
0                     user4@gvc.ceas-challenge.cc   
1                   user2.2@gvc.ceas-challenge.cc   
2                   user2.9@gvc.ceas-challenge.cc   
3  SpamAssassin Dev <xrh@spamassassin.apache.org>   
4                   user2.2@gvc.ceas-challenge.cc   

                              date  \
0  Tue, 05 Aug 2008 16:31:02 -0700   
1  Tue, 05 Aug 2008 18:31:03 -0500   
2  Tue, 05 Aug 2008 20:28:00 -1200   
3  Tue, 05 Aug 2008 17:31:20 -0600   
4  Tue, 05 Aug 2008 19:31:21 -0400   

                                             subject  \
0                          Never agree to

In [5]:
# Discard every row whose body is missing
df = df.dropna(subset=["body"])

# Re-count the missing values left in each column
remaining_nulls = df.isnull().sum()
print("Missing values:\n", remaining_nulls, "\n")

Missing values:
 sender       331
receiver    2092
date         483
subject       87
body           0
urls           0
label          0
dtype: int64 



Begin vectorizing body data

In [6]:
# Fetch the NLTK resources this cell relies on so it also runs on a fresh
# kernel: the tokenizer models (named "punkt_tab" in newer NLTK releases and
# "punkt" in older ones), the stop-word lists, and the WordNet data used by
# the lemmatizer.
for resource in ("punkt", "punkt_tab", "stopwords", "wordnet", "omw-1.4"):
  nltk.download(resource, quiet=True)

# Shared helpers for text normalisation
stopset = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()


def normalize_body(text):
  """Turn one email body into a space-separated string of normalised tokens.

  The text is lower-cased and tokenized, English stop words are removed, and
  every remaining token is lemmatized and then stemmed.

  Args:
    text: Email body as a string.

  Returns:
    The processed tokens joined by single spaces.
  """
  tokens = word_tokenize(text.lower())
  kept = (token for token in tokens if token not in stopset)
  return ' '.join(stemmer.stem(lemmatizer.lemmatize(token)) for token in kept)


# Tokenize, remove stop words, lemmatize, stem and re-join in a single pass
# over the column (no list-of-tokens intermediates stored in the DataFrame).
df['vector body'] = df['body'].apply(normalize_body)

# Vectorize the body with Count and Tfidf vector versions
# NOTE(review): both vectorizers are fitted on the full corpus, before the
# data is split into train/validation/test, so vocabulary and IDF weights are
# learned from rows that later serve as held-out data. Consider fitting on the
# training rows only and calling transform() on the rest.
cv = CountVectorizer()
cv_matrix = cv.fit_transform(df['vector body'])
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['vector body'])

In [34]:
# Hold out 40% of the rows, then divide that hold-out into a validation set
# (75% of it) and a test set (25% of it)
labels = df['label']
X_train, X_test_val, y_train, y_test_val = train_test_split(
    tfidf_matrix, labels, test_size=0.4, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_test_val, y_test_val, test_size=0.25, random_state=42
)

# Report the size of each feature matrix
for split_name, split_matrix in (("X_train", X_train), ("X_val", X_val), ("X_test", X_test)):
  print(f"{split_name} shape:", split_matrix.shape)

X_train shape: (29915, 412442)
X_val shape: (14958, 412442)
X_test shape: (4986, 412442)
