In [None]:
# Notes that will save you time.
# NOTE: this is a reference snippet, not a pipeline step. It uses names that
# are created in other cells (X1_train / X1_test / X1_val and y1_train /
# y1_test / y1_val come from the TF-IDF v1 cell; `svm_1` is a trained model
# that must already exist), so run it only after those names are defined.

# To save a model I trained:
import os
import joblib

# Saving the model to not retrain later
joblib.dump(svm_1, "svm_1_v1.pkl")

# Load the model later
# (joblib.load unpickles the file, so only load files you created yourself)
loaded_model = joblib.load("svm_1_v1.pkl")

# Make predictions with the loaded model
y_pred = loaded_model.predict(X1_test)

# To save the TF-IDF vectorizations:
from scipy import sparse
import numpy as np

# The save calls below do not create folders, so make sure the target exists
os.makedirs("./vectorizations/tfidf_v1", exist_ok=True)

sparse.save_npz("./vectorizations/tfidf_v1/X1_train_tfidf.npz", X1_train)
sparse.save_npz("./vectorizations/tfidf_v1/X1_test_tfidf.npz", X1_test)
sparse.save_npz("./vectorizations/tfidf_v1/X1_val_tfidf.npz", X1_val)

# --- Save labels as numpy arrays ---
np.save("./vectorizations/tfidf_v1/y1_train_tfidf.npy", y1_train.to_numpy())
np.save("./vectorizations/tfidf_v1/y1_test_tfidf.npy", y1_test.to_numpy())
np.save("./vectorizations/tfidf_v1/y1_val_tfidf.npy", y1_val.to_numpy())





In [None]:
# Personal CUDA troubleshooting note, kept for later reference.
# Commands that were run to repair the install:
#pip uninstall torch torchvision torchaudio -y -v
#pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128 -v
# The reinstall took some time but worked; make sure the CUDA build matches your GPU.
import torch

# Report what this PyTorch build can see
cuda_ready = torch.cuda.is_available()
print("CUDA available:", cuda_ready)
print("CUDA version:", torch.version.cuda)
print("GPU count:", torch.cuda.device_count())
if cuda_ready:
    # Only the first device (index 0) is named
    print("GPU name:", torch.cuda.get_device_name(0))

**FIRST: Importing The Dataset**

We use exactly the same training, testing, and validation examples for every model, so we import them from the same CSV files to ensure a fair and accurate comparison between the models.

In [1]:
import pandas as pd

# Read the three pre-made splits (train, test, validation) in one pass
train_df, test_df, val_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../split_dataset/train_split.csv",
        "../split_dataset/test_split.csv",
        "../split_dataset/val_split.csv",
    )
)

# Sanity check: report (rows, columns) of each split
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)
print("Validation shape:", val_df.shape)



Train shape: (126035, 4)
Test shape: (42012, 4)
Validation shape: (42012, 4)


**Support Vector Machines (SVM)** 

**Preprocessing Method #1: TF-IDF v1**

TF-IDF is one of the classic vectorization techniques for text.
It is simple, fast, computationally cheap, and interpretable: you can see which words carry the highest weight in a document. It also reduces the impact of common words.
Some cons of TF-IDF: it ignores word order and context, which can heavily affect accuracy when predicting phishing/spam emails. It is also sparse and high-dimensional; for example, with 200k unique words, each email becomes a 200k-dimensional sparse vector, which is expensive to train on and to store in a database.


`max_features` controls the vocabulary size. If it is too small, we can lose important information carried by some words; if it is too large, we risk overfitting, longer training time, and higher memory consumption.
For `max_features`, a good balance is usually around 5k–20k, so we will train with several values in this range.

`stop_words` are common words like "the", "is", "of", etc. They add little meaning to the classification process. Setting `stop_words` removes them, which can help training accuracy by letting the model focus on other, possibly more important words.
However, there is a risk that removing them hurts performance, so we will also do one run without this setting.





**TF-IDF_v1**

For this run, we use `stop_words = english` to remove the common, mostly uninformative, repetitive words of the English language, as explained above.
We set `max_features = 10000` for the subject, since subjects usually contain fewer words than the body and we only want to consider the important words used in a phishing context. For the body, we start with `max_features = 15000`.

*Notice: after we see the shape, it says 25000 columns, since we combined the subject and body vectors for this run.*

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
from scipy.sparse import hstack
import numpy as np

import os

# --- TF-IDF Run #1  ---
# Create the output folder first: the save calls at the end do not create it,
# and failing there would throw away the fitting work done in between.
os.makedirs("./vectorizations/tfidf_v1", exist_ok=True)

# Separate vocabularies for subject and body, both with English stop words removed
vectorizer1_subject = TfidfVectorizer(
    max_features=10000,
    stop_words="english"
)
vectorizer1_body = TfidfVectorizer(
    max_features=15000,
    stop_words="english"
)

# Fit on the training split only; test/val are transformed with the fitted
# vectorizer. Missing subjects/bodies are treated as empty strings.
X1_train_subject = vectorizer1_subject.fit_transform(train_df["subject"].fillna(""))
X1_test_subject = vectorizer1_subject.transform(test_df["subject"].fillna(""))
X1_val_subject = vectorizer1_subject.transform(val_df["subject"].fillna(""))

X1_train_body = vectorizer1_body.fit_transform(train_df["body"].fillna(""))
X1_test_body = vectorizer1_body.transform(test_df["body"].fillna(""))
X1_val_body = vectorizer1_body.transform(val_df["body"].fillna(""))

# We combine subject and body vectors column-wise using hstack.
# The format is requested explicitly: without it the result type depends on
# the SciPy version and can be COO, which does not support row indexing.
X1_train = hstack([X1_train_subject, X1_train_body], format="csr")
X1_test = hstack([X1_test_subject, X1_test_body], format="csr")
X1_val = hstack([X1_val_subject, X1_val_body], format="csr")

# Labels
y1_train = train_df["isPhishing"]
y1_test = test_df["isPhishing"]
y1_val = val_df["isPhishing"]

print("Run 1 (TF-IDF separate subject+body) shapes:")
print("Train:", X1_train.shape, " Test:", X1_test.shape, " Val:", X1_val.shape)

# --- Save the sparse feature matrices ---
sparse.save_npz("./vectorizations/tfidf_v1/X1_train_tfidf.npz", X1_train)
sparse.save_npz("./vectorizations/tfidf_v1/X1_test_tfidf.npz", X1_test)
sparse.save_npz("./vectorizations/tfidf_v1/X1_val_tfidf.npz", X1_val)

# --- Save labels as numpy arrays ---
np.save("./vectorizations/tfidf_v1/y1_train_tfidf.npy", y1_train.to_numpy())
np.save("./vectorizations/tfidf_v1/y1_test_tfidf.npy", y1_test.to_numpy())
np.save("./vectorizations/tfidf_v1/y1_val_tfidf.npy", y1_val.to_numpy())





Run 1 (TF-IDF separate subject+body) shapes:
Train: (126035, 25000)  Test: (42012, 25000)  Val: (42012, 25000)


**Preprocessing Method #1: TF-IDF v2**

[Combine subject and body into a single text to possibly reduce overfitting caused by duplicated key words.]


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from scipy import sparse

# --- Combine subject + body into a single column ---
def combine_texts(df):
    """Join each row's subject and body into one whitespace-trimmed string.

    Missing (NaN) subjects or bodies are treated as empty strings. The two
    parts are joined with a single space and the result is stripped of
    leading/trailing whitespace.

    Args:
        df: DataFrame with "subject" and "body" columns.

    Returns:
        A pandas Series of combined texts, one per row of ``df``.
    """
    subject = df["subject"].fillna("")
    body = df["body"].fillna("")
    merged = subject + " " + body
    return merged.str.strip()

import os

# Create the output folder first: the save calls at the end do not create it
os.makedirs("./vectorizations/tfidf_v2", exist_ok=True)

train_texts = combine_texts(train_df)
test_texts = combine_texts(test_df)
val_texts = combine_texts(val_df)

# --- TF-IDF Vectorization ---
vectorizer = TfidfVectorizer(
    max_features=20000,  # one shared vocabulary; smaller than v1's 10000 + 15000 = 25000 columns
    stop_words="english"
)

# Fit on the training texts only; test/val reuse the fitted vocabulary
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)
X_val = vectorizer.transform(val_texts)

# Labels
y_train = train_df["isPhishing"]
y_test = test_df["isPhishing"]
y_val = val_df["isPhishing"]

print("TF-IDF combined subject+body shapes:")
print("Train:", X_train.shape, " Test:", X_test.shape, " Val:", X_val.shape)

# --- Save the sparse feature matrices ---
sparse.save_npz("./vectorizations/tfidf_v2/X_train_tfidf.npz", X_train)
sparse.save_npz("./vectorizations/tfidf_v2/X_test_tfidf.npz", X_test)
sparse.save_npz("./vectorizations/tfidf_v2/X_val_tfidf.npz", X_val)

# --- Save labels ---
np.save("./vectorizations/tfidf_v2/y_train_tfidf.npy", y_train.to_numpy())
np.save("./vectorizations/tfidf_v2/y_test_tfidf.npy", y_test.to_numpy())
np.save("./vectorizations/tfidf_v2/y_val_tfidf.npy", y_val.to_numpy())


TF-IDF combined subject+body shapes:
Train: (126035, 20000)  Test: (42012, 20000)  Val: (42012, 20000)


**Preprocessing Method #2: SBERT v1**

[Describe SBERT]
[Why you plan to use it]

In [4]:
from sentence_transformers import SentenceTransformer
import torch
import numpy as np
import pandas as pd

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the pretrained sentence encoder and move it to the chosen device
model = SentenceTransformer('paraphrase-mpnet-base-v2').to(device)
print("Using device:", device)

# Combine subject + body and handle NaN values
def combine_texts(df):
    """Build one text per row from the "subject" and "body" columns.

    NaN subjects/bodies count as empty strings. Subject and body are joined
    with a single space and stripped; a row whose combined text is exactly
    the empty string is replaced by the placeholder "empty".

    Args:
        df: DataFrame with "subject" and "body" columns.

    Returns:
        A pandas Series of combined texts, one per row of ``df``.
    """
    merged = (df["subject"].fillna("") + " " + df["body"].fillna("")).str.strip()
    # Placeholder for rows with no text at all (alternative: "no content")
    return merged.replace("", "empty")

import os

train_texts = combine_texts(train_df)
test_texts = combine_texts(test_df)
val_texts = combine_texts(val_df)

# Check for any remaining issues
print(f"Train texts with NaN: {train_texts.isna().sum()}")
#print(f"Sample train text: {train_texts.iloc[0]}")

# Create the output folder before encoding: np.save does not create it, and
# failing at the end would waste the whole (slow) encoding step.
os.makedirs('./vectorizations/sbert_v1', exist_ok=True)

# Encode. No device argument is passed to encode(); the model was already
# moved to `device` when it was loaded.
X_train_sbert = model.encode(train_texts.tolist(), batch_size=64, show_progress_bar=True)
X_test_sbert  = model.encode(test_texts.tolist(), batch_size=64, show_progress_bar=True)
X_val_sbert   = model.encode(val_texts.tolist(), batch_size=64, show_progress_bar=True)

# Labels
y_train_sbert = train_df["isPhishing"]
y_test_sbert  = test_df["isPhishing"]
y_val_sbert   = val_df["isPhishing"]

print("Train embeddings shape:", X_train_sbert.shape)
print("Test embeddings shape:", X_test_sbert.shape)
print("Validation embeddings shape:", X_val_sbert.shape)

# Save embeddings
np.save('./vectorizations/sbert_v1/X_train_sbert.npy', X_train_sbert)
np.save('./vectorizations/sbert_v1/X_test_sbert.npy', X_test_sbert)
np.save('./vectorizations/sbert_v1/X_val_sbert.npy', X_val_sbert)

# Save labels
np.save('./vectorizations/sbert_v1/y_train_sbert.npy', y_train_sbert.to_numpy())
np.save('./vectorizations/sbert_v1/y_test_sbert.npy', y_test_sbert.to_numpy())
np.save('./vectorizations/sbert_v1/y_val_sbert.npy', y_val_sbert.to_numpy())

Using device: cuda
Train texts with NaN: 0


Batches:   0%|          | 0/1970 [00:00<?, ?it/s]

Batches:   0%|          | 0/657 [00:00<?, ?it/s]

Batches:   0%|          | 0/657 [00:00<?, ?it/s]

Train embeddings shape: (126035, 768)
Test embeddings shape: (42012, 768)
Validation embeddings shape: (42012, 768)


**Preprocessing Method #2: SBERT v2**

[talk about the lighter DL model you will use for this run: all-MiniLM-L6-v2]




In [5]:
from sentence_transformers import SentenceTransformer
import torch
import numpy as np
import pandas as pd

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the pretrained sentence encoder and move it to the chosen device
model = SentenceTransformer('all-MiniLM-L6-v2').to(device)
print("Using device:", device)

# Combine subject + body and handle NaN values
def combine_texts(df):
    """Merge the "subject" and "body" columns into one text per row.

    Missing values in either column are treated as empty strings. The parts
    are concatenated with a single space, surrounding whitespace is stripped,
    and a result that is exactly the empty string becomes "empty".

    Args:
        df: DataFrame with "subject" and "body" columns.

    Returns:
        A pandas Series of combined texts, one per row of ``df``.
    """
    subject_part = df["subject"].fillna("")
    body_part = df["body"].fillna("")
    combined = (subject_part + " " + body_part).str.strip()
    # Placeholder for rows with no text at all (alternative: "no content")
    return combined.replace("", "empty")

import os

train_texts = combine_texts(train_df)
test_texts = combine_texts(test_df)
val_texts = combine_texts(val_df)

# Check for any remaining issues
print(f"Train texts with NaN: {train_texts.isna().sum()}")
#print(f"Sample train text: {train_texts.iloc[0]}")

# Create the output folder before encoding: np.save does not create it, and
# failing at the end would waste the whole (slow) encoding step.
os.makedirs('./vectorizations/sbert_v2', exist_ok=True)

# Encode. No device argument is passed to encode(); the model was already
# moved to `device` when it was loaded.
X_train_sbert = model.encode(train_texts.tolist(), batch_size=64, show_progress_bar=True)
X_test_sbert  = model.encode(test_texts.tolist(), batch_size=64, show_progress_bar=True)
X_val_sbert   = model.encode(val_texts.tolist(), batch_size=64, show_progress_bar=True)

# Labels
y_train_sbert = train_df["isPhishing"]
y_test_sbert  = test_df["isPhishing"]
y_val_sbert   = val_df["isPhishing"]

print("Train embeddings shape:", X_train_sbert.shape)
print("Test embeddings shape:", X_test_sbert.shape)
print("Validation embeddings shape:", X_val_sbert.shape)

# Save embeddings
np.save('./vectorizations/sbert_v2/X_train_sbert.npy', X_train_sbert)
np.save('./vectorizations/sbert_v2/X_test_sbert.npy', X_test_sbert)
np.save('./vectorizations/sbert_v2/X_val_sbert.npy', X_val_sbert)

# Save labels
np.save('./vectorizations/sbert_v2/y_train_sbert.npy', y_train_sbert.to_numpy())
np.save('./vectorizations/sbert_v2/y_test_sbert.npy', y_test_sbert.to_numpy())
np.save('./vectorizations/sbert_v2/y_val_sbert.npy', y_val_sbert.to_numpy())

Using device: cuda
Train texts with NaN: 0


Batches:   0%|          | 0/1970 [00:00<?, ?it/s]

Batches:   0%|          | 0/657 [00:00<?, ?it/s]

Batches:   0%|          | 0/657 [00:00<?, ?it/s]

Train embeddings shape: (126035, 384)
Test embeddings shape: (42012, 384)
Validation embeddings shape: (42012, 384)


**Preprocessing Method #2: SBERT v3**

[Trying the `paraphrase-distilroberta-base-v2` SBERT model.]

[It is not as accurate as the first model (`paraphrase-mpnet-base-v2`), but it is faster.]

[It is more accurate than `all-MiniLM-L6-v2`, and slightly slower.]


In [6]:
from sentence_transformers import SentenceTransformer
import torch
import numpy as np
import pandas as pd

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the pretrained sentence encoder and move it to the chosen device
model = SentenceTransformer('paraphrase-distilroberta-base-v2').to(device)
print("Using device:", device)

# Combine subject + body and handle NaN values
def combine_texts(df):
    """Return one combined subject+body text per row of ``df``.

    NaN entries in "subject" or "body" are replaced by empty strings before
    the two are joined with a single space. The joined text is stripped, and
    a text that is exactly the empty string is swapped for "empty".

    Args:
        df: DataFrame with "subject" and "body" columns.

    Returns:
        A pandas Series of combined texts, one per row of ``df``.
    """
    filled = df[["subject", "body"]].fillna("")
    joined = (filled["subject"] + " " + filled["body"]).str.strip()
    # Placeholder for rows with no text at all (alternative: "no content")
    return joined.replace("", "empty")

import os

train_texts = combine_texts(train_df)
test_texts = combine_texts(test_df)
val_texts = combine_texts(val_df)

# Check for any remaining issues
print(f"Train texts with NaN: {train_texts.isna().sum()}")
#print(f"Sample train text: {train_texts.iloc[0]}")

# Create the output folder before encoding: np.save does not create it, and
# failing at the end would waste the whole (slow) encoding step.
os.makedirs('./vectorizations/sbert_v3', exist_ok=True)

# Encode. No device argument is passed to encode(); the model was already
# moved to `device` when it was loaded.
X_train_sbert = model.encode(train_texts.tolist(), batch_size=64, show_progress_bar=True)
X_test_sbert  = model.encode(test_texts.tolist(), batch_size=64, show_progress_bar=True)
X_val_sbert   = model.encode(val_texts.tolist(), batch_size=64, show_progress_bar=True)

# Labels
y_train_sbert = train_df["isPhishing"]
y_test_sbert  = test_df["isPhishing"]
y_val_sbert   = val_df["isPhishing"]

print("Train embeddings shape:", X_train_sbert.shape)
print("Test embeddings shape:", X_test_sbert.shape)
print("Validation embeddings shape:", X_val_sbert.shape)

# Save embeddings
np.save('./vectorizations/sbert_v3/X_train_sbert.npy', X_train_sbert)
np.save('./vectorizations/sbert_v3/X_test_sbert.npy', X_test_sbert)
np.save('./vectorizations/sbert_v3/X_val_sbert.npy', X_val_sbert)

# Save labels
np.save('./vectorizations/sbert_v3/y_train_sbert.npy', y_train_sbert.to_numpy())
np.save('./vectorizations/sbert_v3/y_test_sbert.npy', y_test_sbert.to_numpy())
np.save('./vectorizations/sbert_v3/y_val_sbert.npy', y_val_sbert.to_numpy())

Using device: cuda
Train texts with NaN: 0


Batches:   0%|          | 0/1970 [00:00<?, ?it/s]

Batches:   0%|          | 0/657 [00:00<?, ?it/s]

Batches:   0%|          | 0/657 [00:00<?, ?it/s]

Train embeddings shape: (126035, 768)
Test embeddings shape: (42012, 768)
Validation embeddings shape: (42012, 768)


**Preprocessing Method #2: SBERT v4**

[Check the multilingual sbert model]

distiluse-base-multilingual-cased-v2

In [7]:
from sentence_transformers import SentenceTransformer
import torch
import numpy as np
import pandas as pd

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the pretrained sentence encoder and move it to the chosen device
model = SentenceTransformer('distiluse-base-multilingual-cased-v2').to(device)
print("Using device:", device)

# Combine subject + body and handle NaN values
def combine_texts(df):
    """Combine "subject" and "body" into a single text for every row.

    NaN values in either column are first replaced with empty strings. The
    subject and body are then joined by one space and stripped; any text that
    is exactly the empty string is replaced with the placeholder "empty".

    Args:
        df: DataFrame with "subject" and "body" columns.

    Returns:
        A pandas Series of combined texts, one per row of ``df``.
    """
    subject, body = df["subject"].fillna(""), df["body"].fillna("")
    stripped = (subject + " " + body).str.strip()
    # Placeholder for rows with no text at all (alternative: "no content")
    return stripped.replace("", "empty")

import os

train_texts = combine_texts(train_df)
test_texts = combine_texts(test_df)
val_texts = combine_texts(val_df)

# Check for any remaining issues
print(f"Train texts with NaN: {train_texts.isna().sum()}")
#print(f"Sample train text: {train_texts.iloc[0]}")

# Create the output folder before encoding: np.save does not create it, and
# failing at the end would waste the whole (slow) encoding step.
os.makedirs('./vectorizations/sbert_v4', exist_ok=True)

# Encode. No device argument is passed to encode(); the model was already
# moved to `device` when it was loaded.
X_train_sbert = model.encode(train_texts.tolist(), batch_size=64, show_progress_bar=True)
X_test_sbert  = model.encode(test_texts.tolist(), batch_size=64, show_progress_bar=True)
X_val_sbert   = model.encode(val_texts.tolist(), batch_size=64, show_progress_bar=True)

# Labels
y_train_sbert = train_df["isPhishing"]
y_test_sbert  = test_df["isPhishing"]
y_val_sbert   = val_df["isPhishing"]

print("Train embeddings shape:", X_train_sbert.shape)
print("Test embeddings shape:", X_test_sbert.shape)
print("Validation embeddings shape:", X_val_sbert.shape)

# Save embeddings
np.save('./vectorizations/sbert_v4/X_train_sbert.npy', X_train_sbert)
np.save('./vectorizations/sbert_v4/X_test_sbert.npy', X_test_sbert)
np.save('./vectorizations/sbert_v4/X_val_sbert.npy', X_val_sbert)

# Save labels
np.save('./vectorizations/sbert_v4/y_train_sbert.npy', y_train_sbert.to_numpy())
np.save('./vectorizations/sbert_v4/y_test_sbert.npy', y_test_sbert.to_numpy())
np.save('./vectorizations/sbert_v4/y_val_sbert.npy', y_val_sbert.to_numpy())

Using device: cuda
Train texts with NaN: 0


Batches:   0%|          | 0/1970 [00:00<?, ?it/s]

Batches:   0%|          | 0/657 [00:00<?, ?it/s]

Batches:   0%|          | 0/657 [00:00<?, ?it/s]

Train embeddings shape: (126035, 512)
Test embeddings shape: (42012, 512)
Validation embeddings shape: (42012, 512)
