In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Import dataset
# Mount Google Drive at /content/drive so that files stored there can be read
# by path in the following cells.
# NOTE(review): `google.colab` is presumably only available inside a Colab
# runtime, so this cell is expected to fail elsewhere -- confirm before reuse.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Load the pre-cleaned phishing e-mail dataset from the mounted Drive folder
# and preview its first rows. Later cells read the "cleaned text" and
# "label enc" columns of this frame.
# NOTE(review): the "Unnamed: 0" column in the preview looks like a row index
# that was written out with the CSV; `index_col=0` would drop it -- confirm.
cleaned_df = pd.read_csv("/content/drive/MyDrive/Dataset/cleaned_phishing_email.csv")
cleaned_df.head()

Unnamed: 0,cleaned text,label enc
0,6 1100 disc uniformitarian 1086 sex lang dick ...,0
1,side galicismo galicismo spanish term name imp...,0
2,equistar deal ticket still avail assist robert...,0
3,hello hot lil horni toy one dream open mind pe...,1
4,softwar incred low price 86 lower draperi seve...,1


In [5]:
# Alternative loader that reads the dataset from a CSV in the working
# directory. It is disabled by wrapping it in a string literal; because that
# string is the only expression in the cell, it is echoed back as the cell
# output instead of being silently ignored.
"""
cleaned_df = pd.read_csv("kaggle_cleaned_phishing_email.csv")
cleaned_df.head()
"""

'\ncleaned_df = pd.read_csv("kaggle_cleaned_phishing_email.csv")\ncleaned_df.head()\n'

In [6]:
# Features are the cleaned e-mail text; targets are the encoded labels.
X, y = cleaned_df["cleaned text"], cleaned_df["label enc"]

# Hold out 30% of the rows for testing. `random_state` fixes the shuffle so the
# split is the same on every run.
# NOTE(review): no `stratify=y` is passed, so the class proportions of the two
# splits are not forced to match -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=15
)

# Sanity check: features and labels of each split must have the same length.
print(f"X_train Shape: {X_train.shape} - y_train Shape: {y_train.shape}")
print(f"X_test Shape: {X_test.shape} - y_test Shape: {y_test.shape}")

X_train Shape: (12667,) - y_train Shape: (12667,)
X_test Shape: (5429,) - y_test Shape: (5429,)


In [7]:
# TF-IDF over unigrams and bigrams, capped at 10,000 features.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=10000)

# Fit the vectorizer on the training split only ...
X_train_tfidf = tfidf.fit_transform(X_train)

# ... and only transform the test split with it, so nothing is learned from the held-out rows.
X_test_tfidf = tfidf.transform(X_test)

# Cell result: the (n_samples, n_features) shape of each matrix.
X_train_tfidf.shape, X_test_tfidf.shape

((12667, 10000), (5429, 10000))

In [8]:
# Percentage of entries of the training TF-IDF matrix that are not stored:
# 100 * (1 - nnz / (n_rows * n_cols)).
sparsity = (1.0 - (X_train_tfidf.nnz / float(X_train_tfidf.shape[0] * X_train_tfidf.shape[1]))) * 100
print(f"Sparsity Percentage of the TF-IDF Matrix: {sparsity:.2f}%")

Sparsity Percentage of the TF-IDF Matrix: 99.00%


In [9]:
# Preview the TF-IDF values as a dense array (numpy truncates the printout).
# NOTE(review): `.toarray()` materialises the full n_rows x n_cols matrix in
# memory just for this preview; slicing a few rows first
# (e.g. `X_train_tfidf[:5].toarray()`) would avoid that.
print(X_train_tfidf.toarray())

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.02782431 0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [10]:
def convert_to_dense_if_sparse(X):
    """Return ``X.toarray()`` when ``X`` exposes a ``toarray`` attribute, else ``X`` itself.

    The check is duck-typed: any object with a ``toarray`` attribute is
    treated as convertible; everything else is passed through unchanged
    (the very same object is returned, not a copy).
    """
    try:
        densify = X.toarray
    except AttributeError:
        # No dense conversion available: hand the input back untouched.
        return X
    return densify()

In [11]:
class SVC_Custom:
    """Binary kernel SVM trained on the dual problem by pairwise coordinate ascent.

    Each step picks a pair of multipliers (one visited in order, the partner
    chosen at random), moves them along the direction that keeps
    ``sum(lambda_i * y_i)`` unchanged, and clips the step so both stay inside
    ``[0, C]``.

    Labels given to ``fit`` and ``evaluate`` are expected to be encoded as
    0/1; they are mapped to -1/+1 internally, and ``predict`` returns values
    in that -1/+1 space.

    Parameters
    ----------
    nums_of_iteration : int
        Number of full sweeps over the training samples.
    kernel : {'linear', 'poly', 'rbf'}
        Kernel name; any other value raises ``KeyError``.
    degree : int
        Exponent of the 'poly' kernel (ignored by the other kernels).
    C : float
        Upper bound of every dual multiplier.
    gamma : float
        Width parameter of the 'rbf' kernel (ignored by the other kernels).
    costs : list or None
        Stored as ``self.costs`` (a fresh list when ``None``). The class
        itself never writes to it.
    random_state : int or None
        Seed for the random partner selection in ``fit``. ``None`` keeps the
        previous behaviour of drawing from numpy's global random state.
    """

    def __init__(self, nums_of_iteration=1000, kernel='linear', degree=3, C=1.0, gamma=1.0, costs=None,
                 random_state=None):
        self.nums_of_iteration = nums_of_iteration
        self.kernel = {
            'linear': lambda x, y: np.dot(x, y.T),
            'poly': lambda x, y: np.dot(x, y.T) ** degree,
            'rbf': lambda x, y: np.exp(-gamma * self._squared_distances(x, y)),
        }[kernel]
        self.C = C
        self.gamma = gamma
        self.costs = costs if costs is not None else []
        self.random_state = random_state

    @staticmethod
    def _squared_distances(x, y):
        """Pairwise squared Euclidean distances between the rows of ``x`` and ``y``.

        Uses ``|a|^2 + |b|^2 - 2 a.b`` so only an (n, m) matrix is allocated,
        instead of the (n, m, d) tensor produced by broadcasting ``x - y``.
        """
        x = np.asarray(x, dtype=float)
        y = np.asarray(y, dtype=float)
        sq_dist = (
            np.sum(x * x, axis=1)[:, np.newaxis]
            + np.sum(y * y, axis=1)[np.newaxis, :]
            - 2.0 * np.dot(x, y.T)
        )
        # Rounding can push tiny distances slightly below zero.
        return np.maximum(sq_dist, 0.0, out=sq_dist)

    @staticmethod
    def _safe_ratio(numerator, denominator):
        """Return ``numerator / denominator`` as a float, or 0.0 when the denominator is 0."""
        if denominator == 0:
            return 0.0
        return float(numerator) / float(denominator)

    def restrict_to_square(self, t, v0, u):
        """Shrink step ``t`` so that ``v0 + t * u`` stays inside ``[0, C]`` in both coordinates.

        The step is first limited by the second coordinate, then the result
        is limited by the first coordinate. ``u`` must have non-zero entries
        (in ``fit`` they are always +1 or -1).
        """
        t = (np.clip(v0 + (t * u), 0, self.C) - v0)[1] / u[1]
        return (np.clip(v0 + (t * u), 0, self.C) - v0)[0] / u[0]

    def fit(self, X, y):
        """Train on features ``X`` (dense or sparse) and 0/1 labels ``y``.

        Stores the dense training matrix, the -1/+1 labels, the label-signed
        kernel matrix ``self.K``, the multipliers ``self.lambdas`` and the
        bias ``self.b``. Returns ``self``.
        """
        self.X = np.asarray(convert_to_dense_if_sparse(X))
        # np.asarray accepts pandas Series, lists and arrays alike.
        self.y = 2 * np.asarray(y) - 1
        self.lambdas = np.zeros_like(self.y, dtype=float)
        # K[i, j] = y_i * y_j * k(x_i, x_j): the labels are folded in here once.
        self.K = self.kernel(self.X, self.X) * self.y[:, np.newaxis] * self.y

        # Global numpy RNG when unseeded (unchanged behaviour), private RNG otherwise.
        rng = np.random if self.random_state is None else np.random.RandomState(self.random_state)
        n_samples = len(self.lambdas)

        for _ in range(self.nums_of_iteration):
            for idxM in range(n_samples):
                idxL = rng.randint(0, n_samples)
                # 2x2 sub-matrix of K for the selected pair.
                Q = self.K[[[idxM, idxM], [idxL, idxL]], [[idxM, idxL], [idxM, idxL]]]
                v0 = self.lambdas[[idxM, idxL]]
                # Gradient of the dual objective with respect to the pair.
                k0 = 1 - np.sum(self.lambdas * self.K[[idxM, idxL]], axis=1)
                # Direction that leaves sum(lambda * y) unchanged.
                u = np.array([-self.y[idxL], self.y[idxM]])
                # Unconstrained optimum along u; the epsilon avoids 0/0 when idxM == idxL.
                t_max = np.dot(k0, u) / (np.dot(np.dot(Q, u), u) + 1e-15)
                self.lambdas[[idxM, idxL]] = v0 + u * self.restrict_to_square(t_max, v0, u)

        idx, = np.nonzero(self.lambdas > 1e-15)
        if idx.size:
            # Average the bias implied by each support vector.
            self.b = np.sum((1 - np.sum(self.K[idx] * self.lambdas, axis=1)) * self.y[idx]) / len(idx)
        else:
            # No support vectors (e.g. zero iterations): avoid dividing by zero.
            self.b = 0.0

        return self

    def compute_cost(self):
        """Dual objective in minimisation form: ``0.5 * lambda^T K lambda - sum(lambda)``.

        ``self.K`` already contains the ``y_i * y_j`` factors, so they must
        not be multiplied in a second time (doing so cancels the labels).
        """
        return 0.5 * np.sum(self.lambdas[:, np.newaxis] * self.lambdas * self.K) - np.sum(self.lambdas)

    def decision_function(self, X):
        """Signed score ``sum_j lambda_j * y_j * k(x, x_j) + b`` for each row of ``X``."""
        X = np.asarray(convert_to_dense_if_sparse(X))
        # A matrix-vector product avoids materialising weighted copies of the kernel matrix.
        return np.dot(self.kernel(X, self.X), self.y * self.lambdas) + self.b

    def predict(self, X):
        """Return ``np.sign`` of the decision function: +1, -1, or 0 for an exact tie."""
        return np.sign(self.decision_function(X))

    def evaluate(self, X, y):
        """Score predictions for ``X`` against 0/1 labels ``y``.

        Returns a dict with ``accuracy_score``, ``precision_score``,
        ``recall_score`` and ``f1_score``. A metric whose denominator is zero
        is reported as 0.0 instead of NaN.
        """
        y_pred = self.predict(X)
        y = 2 * np.asarray(y) - 1

        true_positive = int(np.sum((y_pred == 1) & (y == 1)))
        true_negative = int(np.sum((y_pred == -1) & (y == -1)))
        false_positive = int(np.sum((y_pred == 1) & (y == -1)))
        false_negative = int(np.sum((y_pred == -1) & (y == 1)))

        accuracy_score = self._safe_ratio(true_positive + true_negative, y.shape[0])
        precision_score = self._safe_ratio(true_positive, true_positive + false_positive)
        recall_score = self._safe_ratio(true_positive, true_positive + false_negative)
        f1_score = self._safe_ratio(2 * precision_score * recall_score, precision_score + recall_score)

        return {
            "accuracy_score": accuracy_score,
            "precision_score": precision_score,
            "recall_score": recall_score,
            "f1_score": f1_score
        }

In [12]:
def compute_gamma(X):
    """Return ``1 / (n_features * v)``, where ``v`` is the mean per-feature variance of ``X``.

    Parameters
    ----------
    X : 2-D array-like or sparse matrix of shape (n_samples, n_features)

    Returns
    -------
    float
        The gamma value, or 1.0 when the mean variance is not positive
        (constant features, or no features at all), so the result is never
        infinite or NaN.

    Sparse inputs are handled column-wise via ``E[x^2] - E[x]^2`` so the
    matrix is never converted to a dense array.
    """
    n_features = X.shape[1]
    has_toarray = hasattr(X, 'toarray')

    if has_toarray and hasattr(X, 'multiply'):
        # Sparse path: per-column population variance without densifying.
        col_mean = np.asarray(X.mean(axis=0), dtype=float).ravel()
        col_mean_sq = np.asarray(X.multiply(X).mean(axis=0), dtype=float).ravel()
        # Rounding can make E[x^2] - E[x]^2 marginally negative.
        feature_var = np.maximum(col_mean_sq - col_mean ** 2, 0.0)
    else:
        dense = X.toarray() if has_toarray else np.asarray(X)
        feature_var = np.var(dense, axis=0)

    variance = feature_var.mean() if n_features else 0.0
    if not variance > 0:
        # Zero (or undefined) variance would make the division blow up.
        return 1.0
    return 1 / (n_features * variance)

In [13]:
# Derive gamma from the training features and build the classifier with C=10.
# NOTE(review): no `kernel` argument is passed, so SVC_Custom uses its default
# 'linear' kernel, which does not read gamma. The value computed here only
# has an effect with kernel='rbf' -- confirm which kernel is intended.
gamma_ = compute_gamma(X_train_tfidf)
algo_svc_2 = SVC_Custom(C=10.0, gamma=gamma_)

In [None]:
# Train on the TF-IDF training split. `fit` returns the estimator itself, so
# `model_svc_2` and `algo_svc_2` refer to the same object.
model_svc_2 = algo_svc_2.fit(X_train_tfidf, y_train)

In [None]:
# Score the trained model on the held-out split; the result is a dict with
# accuracy, precision, recall and F1.
model_svc_2.evaluate(X_test_tfidf, y_test)