In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
def convert_to_dense_if_sparse(X):
    """Return a dense version of ``X`` when it exposes ``toarray()``.

    Objects without a ``toarray`` attribute are handed back untouched (the
    very same object, not a copy).
    """
    try:
        densify = X.toarray
    except AttributeError:
        # No ``toarray`` attribute: treat the input as already dense.
        return X
    return densify()

In [4]:
class SVC_Custom:
    """Binary kernel support-vector classifier fitted on the dual problem.

    The dual multipliers (``lambdas``) are optimised by repeated
    two-coordinate updates: every sample is paired with a randomly drawn
    partner and both multipliers are moved along a direction that keeps
    ``sum(lambdas * y)`` constant, with the step clipped to the box
    ``[0, C]``.

    Labels passed to ``fit``/``evaluate`` are expected to be encoded as 0/1;
    internally they are mapped to -1/+1, and ``predict`` returns -1/+1.

    Parameters
    ----------
    learning_rate : float
        Fraction of the clipped optimal step applied at each pair update.
    nums_of_iteration : int
        Number of full sweeps over the training samples.
    kernel : {'linear', 'poly', 'rbf'}
        Kernel name; any other value raises ``KeyError``.
    degree : int
        Exponent of the 'poly' kernel.
    C : float
        Upper bound of every dual multiplier.
    gamma : float
        Width coefficient of the 'rbf' kernel.
    random_state : int or None
        Seed for the partner selection. ``None`` (default) draws from NumPy's
        global random state, as the class always did.
    """

    def __init__(self, learning_rate=1.0, nums_of_iteration=1000, kernel='linear', degree=3, C=1.0, gamma=1.0,
                 random_state=None):
        self.learning_rate = learning_rate
        self.nums_of_iteration = nums_of_iteration
        # Kept for introspection; the kernels below read the constructor values.
        self.degree = degree
        self.gamma = gamma
        self.random_state = random_state
        self.kernel = {
            'linear': lambda x, y: np.dot(x, y.T),
            'poly': lambda x, y: np.dot(x, y.T)**degree,
            # ||x - y||^2 expanded as ||x||^2 + ||y||^2 - 2 x.y so it works on whole matrices.
            'rbf': lambda x, y: np.exp(-gamma * (np.sum(x**2, axis=1)[:, np.newaxis] + np.sum(y**2, axis=1) - 2 * np.dot(x, y.T)))
        }[kernel]
        self.C = C

    @staticmethod
    def _safe_ratio(numerator, denominator):
        """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
        return numerator / denominator if denominator else 0.0

    def restrict_to_square(self, t, v0, u):
        """Shrink step ``t`` so that ``v0 + t * u`` stays inside ``[0, C]`` on both coordinates.

        The step is first limited by the second coordinate, then by the first.
        """
        t = (np.clip(v0 + (t * u), 0, self.C) - v0)[1] / u[1]
        return (np.clip(v0 + (t * u), 0, self.C) - v0)[0] / u[0]

    def fit(self, X, y):
        """Fit the dual multipliers and the bias on ``X`` (dense or sparse) and 0/1 labels ``y``.

        Returns ``self``. Raises ``ValueError`` when ``X`` and ``y`` disagree
        on the number of samples.
        """
        self.X = convert_to_dense_if_sparse(X)
        # Accept any array-like of 0/1 labels (Series, ndarray, list) and map to -1/+1.
        self.y = 2 * np.asarray(y) - 1

        if self.X.shape[0] != self.y.shape[0]:
            raise ValueError(
                f"X has {self.X.shape[0]} samples but y has {self.y.shape[0]} labels"
            )

        self.lambdas = np.zeros_like(self.y, dtype=float)
        # Label-signed Gram matrix: K[i, j] = y_i * y_j * k(x_i, x_j).
        self.K = self.kernel(self.X, self.X) * self.y[:, np.newaxis] * self.y

        # A dedicated generator is used only when a seed was supplied, so the
        # default keeps drawing from NumPy's global state.
        rng = np.random if self.random_state is None else np.random.RandomState(self.random_state)
        n_samples = len(self.lambdas)

        for _ in range(self.nums_of_iteration):
            for idxM in range(n_samples):
                idxL = rng.randint(0, n_samples)
                # 2x2 sub-block of K for the selected pair.
                Q = self.K[[[idxM, idxM], [idxL, idxL]], [[idxM, idxL], [idxM, idxL]]]
                v0 = self.lambdas[[idxM, idxL]]
                # Gradient of the dual objective with respect to the two multipliers.
                k0 = 1 - np.sum(self.lambdas * self.K[[idxM, idxL]], axis=1)
                # Moving along u leaves sum(lambdas * y) unchanged.
                u = np.array([-self.y[idxL], self.y[idxM]])
                # Unconstrained optimal step along u; the epsilon avoids 0/0 when idxM == idxL.
                t_max = np.dot(k0, u) / (np.dot(np.dot(Q, u), u) + 1e-15)
                self.lambdas[[idxM, idxL]] = v0 + self.learning_rate * u * self.restrict_to_square(t_max, v0, u)

        idx, = np.nonzero(self.lambdas > 1e-15)
        if len(idx) == 0:
            # No multiplier is positive (e.g. zero iterations): averaging over
            # an empty set would yield nan, so use a neutral bias instead.
            self.b = 0.0
        else:
            self.b = np.sum((1 - np.sum(self.K[idx] * self.lambdas, axis=1)) * self.y[idx]) / len(idx)

        return self

    def decision_function(self, X):
        """Return the signed score of every row of ``X`` (dense or sparse)."""
        # Densify here as well so direct calls behave like predict().
        X = convert_to_dense_if_sparse(X)
        return np.sum(self.kernel(X, self.X) * self.y * self.lambdas, axis=1) + self.b

    def predict(self, X):
        """Return -1.0/+1.0 class predictions for every row of ``X``."""
        scores = self.decision_function(X)
        # np.sign would emit 0 for a score of exactly zero, which is not a
        # class label; such samples are assigned to the positive class.
        return np.where(scores >= 0, 1.0, -1.0)

    def evaluate(self, X, y):
        """Return accuracy, precision, recall and F1 of ``predict(X)`` against 0/1 labels ``y``.

        A metric whose denominator is zero is reported as 0.0 instead of nan.
        """
        y_pred = self.predict(X)
        y = 2 * np.asarray(y) - 1

        true_positive = np.sum((y_pred == 1) & (y == 1))
        true_negative = np.sum((y_pred == -1) & (y == -1))
        false_positive = np.sum((y_pred == 1) & (y == -1))
        false_negative = np.sum((y_pred == -1) & (y == 1))

        accuracy_score = self._safe_ratio(true_positive + true_negative, y.shape[0])
        precision_score = self._safe_ratio(true_positive, true_positive + false_positive)
        recall_score = self._safe_ratio(true_positive, true_positive + false_negative)
        f1_score = self._safe_ratio(2 * precision_score * recall_score, precision_score + recall_score)

        return {
            "accuracy_score": accuracy_score,
            "precision_score": precision_score,
            "recall_score": recall_score,
            "f1_score": f1_score
        }

In [8]:
def compute_gamma(X):
    """Return ``1 / (n_features * v)``, where ``v`` is the mean per-feature variance of ``X``.

    Parameters
    ----------
    X : 2-D array-like or sparse matrix
        Samples in rows, features in columns.

    Returns
    -------
    float
        The kernel coefficient, or 1.0 when the variance is not strictly
        positive (constant or empty data), where the formula would divide by
        zero.
    """
    n_features = X.shape[1]
    if hasattr(X, 'toarray'):
        # Sparse input: var = E[x^2] - E[x]^2 per column, computed from sparse
        # moments so the full matrix is never expanded to a dense array.
        col_mean = np.asarray(X.mean(axis=0)).ravel()
        col_mean_sq = np.asarray(X.multiply(X).mean(axis=0)).ravel()
        variance = (col_mean_sq - col_mean ** 2).mean()
    else:
        variance = np.var(X, axis=0).mean()
    # `not variance > 0` also catches nan (e.g. zero features).
    if not variance > 0:
        return 1.0
    return 1 / (n_features * variance)

DATASET 1

In [11]:
# Dataset 1: read the pre-cleaned e-mail CSV into a DataFrame and preview its first rows.
cleaned_df_1 = pd.read_csv("cleaned_kaggle_phishing_email.csv")
cleaned_df_1.head()

Unnamed: 0,cleaned text,label enc
0,6 1100 disc uniformitarian 1086 sex lang dick ...,0
1,side galicismo galicismo spanish term name imp...,0
2,equistar deal ticket still avail assist robert...,0
3,hello hot lil horni toy one dream open mind pe...,1
4,softwar incred low price 86 lower draperi seve...,1


In [12]:
# Features are the cleaned message text; the target is the encoded label column.
# NOTE(review): X, y and the four split names are rebound by every dataset section,
# so later cells operate on whichever split was executed last.
X, y = cleaned_df_1["cleaned text"], cleaned_df_1["label enc"]

# Hold out 30% of the rows for testing; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=15
)

print(f"X_train Shape: {X_train.shape} - y_train Shape: {y_train.shape}")
print(f"X_test Shape: {X_test.shape} - y_test Shape: {y_test.shape}")

X_train Shape: (12667,) - y_train Shape: (12667,)
X_test Shape: (5429,) - y_test Shape: (5429,)


In [13]:
# TF-IDF over unigrams and bigrams, keeping at most 10,000 features.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=10000)

# The vectorizer is fitted on the training text only ...
X_train_tfidf = tfidf.fit_transform(X_train)

# ... and the already-fitted vectorizer is applied to the test text.
X_test_tfidf = tfidf.transform(X_test)

X_train_tfidf.shape, X_test_tfidf.shape

((12667, 10000), (5429, 10000))

In [14]:
# Percentage of matrix cells that are not stored: 1 - nnz / (rows * cols), where nnz counts stored entries.
sparsity = (1.0 - (X_train_tfidf.nnz / float(X_train_tfidf.shape[0] * X_train_tfidf.shape[1]))) * 100
print(f"Sparsity Percentage of the TF-IDF Matrix: {sparsity:.2f}%")

Sparsity Percentage of the TF-IDF Matrix: 99.00%


In [17]:
# Show the TF-IDF weights as a dense array.
# NOTE(review): .toarray() builds a dense copy of the whole training matrix just for this printout.
print(X_train_tfidf.toarray())

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.02782431 0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [18]:
# RBF kernel coefficient derived from the training TF-IDF matrix by compute_gamma.
gamma_ = compute_gamma(X_train_tfidf)

In [19]:
# Custom SVM with the RBF kernel, C=10.0 and the gamma computed for this dataset; other settings keep their defaults.
algo_svc_custom_1 = SVC_Custom(kernel='rbf', C=10.0, gamma=gamma_)

In [20]:
# Train on the training split; fit() returns the estimator itself, so both names refer to the same object.
model_svc_custom_1 = algo_svc_custom_1.fit(X_train_tfidf, y_train)

In [21]:
# Accuracy, precision, recall and F1 on the held-out test split.
model_svc_custom_1.evaluate(X_test_tfidf, y_test)

{'accuracy_score': 0.9786332657948057,
 'precision_score': 0.9775977121067684,
 'recall_score': 0.9674528301886792,
 'f1_score': 0.9724988146040778}

DATASET 2

In [10]:
# Dataset 2: read the pre-cleaned CEAS-08 CSV into a DataFrame and preview its first rows.
cleaned_df_2 = pd.read_csv("cleaned_CEAS-08.csv")
cleaned_df_2.head()

Unnamed: 0,cleaned text,label enc
0,buck troubl caus small dimens soon becom lover...,1
1,upgrad sex pleasur techniqu,1
2,daili top 10 cnn com top video stori aug 1 200...,1
3,would anyon object remov list tld basic dead f...,0
4,welcomefastshippingcustomersupport,1


In [12]:
# Features are the cleaned message text; the target is the encoded label column.
# NOTE(review): this rebinds X, y and the split names used by the previous dataset section.
X, y = cleaned_df_2["cleaned text"], cleaned_df_2["label enc"]

# Hold out 30% of the rows for testing; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=15
)

print(f"X_train Shape: {X_train.shape} - y_train Shape: {y_train.shape}")
print(f"X_test Shape: {X_test.shape} - y_test Shape: {y_test.shape}")

X_train Shape: (27406,) - y_train Shape: (27406,)
X_test Shape: (11746,) - y_test Shape: (11746,)


In [14]:
# Fresh TF-IDF vectorizer (unigrams and bigrams, at most 10,000 features) for this dataset.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=10000)

# The vectorizer is fitted on the training text only ...
X_train_tfidf = tfidf.fit_transform(X_train)

# ... and the already-fitted vectorizer is applied to the test text.
X_test_tfidf = tfidf.transform(X_test)

X_train_tfidf.shape, X_test_tfidf.shape

((27406, 10000), (11746, 10000))

In [15]:
# Percentage of matrix cells that are not stored: 1 - nnz / (rows * cols), where nnz counts stored entries.
sparsity = (1.0 - (X_train_tfidf.nnz / float(X_train_tfidf.shape[0] * X_train_tfidf.shape[1]))) * 100
print(f"Sparsity Percentage of the TF-IDF Matrix: {sparsity:.2f}%")

Sparsity Percentage of the TF-IDF Matrix: 99.09%


In [16]:
# Show the TF-IDF weights as a dense array.
# NOTE(review): .toarray() builds a dense copy of the whole training matrix just for this printout.
print(X_train_tfidf.toarray())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [17]:
# RBF kernel coefficient for dataset 2; overwrites the gamma_ computed for the previous dataset.
gamma_ = compute_gamma(X_train_tfidf)

In [18]:
# Custom SVM with the RBF kernel, C=10.0 and the gamma computed for this dataset; other settings keep their defaults.
algo_svc_custom_2 = SVC_Custom(kernel='rbf', C=10.0, gamma=gamma_)

In [24]:
# Train on the training split; fit() returns the estimator itself, so both names refer to the same object.
model_svc_custom_2 = algo_svc_custom_2.fit(X_train_tfidf, y_train)

In [25]:
# Accuracy, precision, recall and F1 on the held-out test split.
model_svc_custom_2.evaluate(X_test_tfidf, y_test)

{'accuracy_score': 0.994295930529542,
 'precision_score': 0.9938696172248804,
 'recall_score': 0.9961037014835906,
 'f1_score': 0.9949854052840357}

DATASET 3

In [29]:
# Dataset 3: read the pre-cleaned Enron CSV into a DataFrame and preview its first rows.
cleaned_df_3 = pd.read_csv("cleaned_Enron.csv")
cleaned_df_3.head()

Unnamed: 0,cleaned text,label enc
0,perform origin messag issuealert scientech com...,0
1,w w highhest qualiti medd great offfer v codin...,1
2,make r cher might need bundi 1 w ndow x p pro ...,1
3,drug chemic ident brand name equival except pr...,1
4,team project request spreadsheet develop facil...,0


In [30]:
# Features are the cleaned message text; the target is the encoded label column.
# NOTE(review): this rebinds X, y and the split names used by the previous dataset section.
X, y = cleaned_df_3["cleaned text"], cleaned_df_3["label enc"]

# Hold out 30% of the rows for testing; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=15
)

print(f"X_train Shape: {X_train.shape} - y_train Shape: {y_train.shape}")
print(f"X_test Shape: {X_test.shape} - y_test Shape: {y_test.shape}")

X_train Shape: (20827,) - y_train Shape: (20827,)
X_test Shape: (8926,) - y_test Shape: (8926,)


In [31]:
# Fresh TF-IDF vectorizer (unigrams and bigrams, at most 10,000 features) for this dataset.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=10000)

# The vectorizer is fitted on the training text only ...
X_train_tfidf = tfidf.fit_transform(X_train)

# ... and the already-fitted vectorizer is applied to the test text.
X_test_tfidf = tfidf.transform(X_test)

X_train_tfidf.shape, X_test_tfidf.shape

((20827, 10000), (8926, 10000))

In [32]:
# Percentage of matrix cells that are not stored: 1 - nnz / (rows * cols), where nnz counts stored entries.
sparsity = (1.0 - (X_train_tfidf.nnz / float(X_train_tfidf.shape[0] * X_train_tfidf.shape[1]))) * 100
print(f"Sparsity Percentage of the TF-IDF Matrix: {sparsity:.2f}%")

Sparsity Percentage of the TF-IDF Matrix: 99.09%


In [33]:
# Show the TF-IDF weights as a dense array.
# NOTE(review): .toarray() builds a dense copy of the whole training matrix just for this printout.
print(X_train_tfidf.toarray())

[[0.0292113  0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.00989535 0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [36]:
# RBF kernel coefficient for dataset 3; overwrites the gamma_ computed for the previous dataset.
gamma_ = compute_gamma(X_train_tfidf)

In [37]:
# Custom SVM with the RBF kernel, C=10.0 and the gamma computed for this dataset; other settings keep their defaults.
algo_svc_custom_3 = SVC_Custom(kernel='rbf', C=10.0, gamma=gamma_)

In [38]:
# Train on the training split; fit() returns the estimator itself, so both names refer to the same object.
model_svc_custom_3 = algo_svc_custom_3.fit(X_train_tfidf, y_train)

In [39]:
# Accuracy, precision, recall and F1 on the held-out test split.
model_svc_custom_3.evaluate(X_test_tfidf, y_test)

{'accuracy_score': 0.988908805736052,
 'precision_score': 0.983357108892059,
 'recall_score': 0.9930372148859544,
 'f1_score': 0.9881734559789751}