In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
def convert_to_dense_if_sparse(X):
    if hasattr(X, 'toarray'):
        return X.toarray()
    return X

In [6]:
class LogisticRegression_Custom:
    def __init__(self, learning_rate=0.01, nums_of_iteration=1000, C=None):
        self.learning_rate = learning_rate
        self.nums_of_iteration = nums_of_iteration
        self.C = C
    
    def fit(self, X, y):
        self.X = convert_to_dense_if_sparse(X)
        self.y = y.to_numpy()

        m, n = self.X.shape
        self.w = np.zeros(n)
        self.b = 0

        for i in range(self.nums_of_iteration):
            z = self.X.dot(self.w) + self.b
            y_hat = self.sigmoid(z)
        
            dw = (1 / m) * np.dot(self.X.T, (y_hat - self.y)) + ((1 / self.C) / m) * self.w
            db = (1 / m) * np.sum(y_hat - self.y)

            self.w = self.w - (self.learning_rate * dw)
            self.b = self.b - (self.learning_rate * db)
        
        return self

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))
        
    def predict(self, X):
        X = convert_to_dense_if_sparse(X)
        y_pred = 1 / (1 + np.exp(-(X.dot(self.w) + self.b)))
        return np.where(y_pred >= 0.5, 1, 0) 

    def evaluate(self, X, y):
        y_pred = self.predict(X)
        
        true_positive = np.sum((y_pred == 1) & (y == 1))
        true_negative = np.sum((y_pred == 0) & (y == 0))
        false_positive = np.sum((y_pred == 1) & (y == 0))
        false_negative = np.sum((y_pred == 0) & (y == 1))

        accuracy_score = (true_positive + true_negative) / y.shape[0]
        precision_score = true_positive / (true_positive + false_positive)
        recall_score = true_positive / (true_positive + false_negative)
        f1_score = (2 * precision_score * recall_score / (precision_score + recall_score))
        
        return {
        "accuracy_score": accuracy_score,
        "precision_score": precision_score,
        "recall_score": recall_score,
        "f1_score": f1_score
        }

In [8]:
C = np.logspace(-2, 2, 10)
C_ = C[7]

In [10]:
algo_lr_custom = LogisticRegression_Custom(learning_rate=1.0, nums_of_iteration=2000, C=C_)

DATASET 1

In [13]:
cleaned_df_1 = pd.read_csv("cleaned_kaggle_phishing_email.csv")
cleaned_df_1.head()

Unnamed: 0,cleaned text,label enc
0,6 1100 disc uniformitarian 1086 sex lang dick ...,0
1,side galicismo galicismo spanish term name imp...,0
2,equistar deal ticket still avail assist robert...,0
3,hello hot lil horni toy one dream open mind pe...,1
4,softwar incred low price 86 lower draperi seve...,1


In [14]:
X, y = cleaned_df_1["cleaned text"], cleaned_df_1["label enc"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=15
)

print(f"X_train Shape: {X_train.shape} - y_train Shape: {y_train.shape}")
print(f"X_test Shape: {X_test.shape} - y_test Shape: {y_test.shape}")

X_train Shape: (12667,) - y_train Shape: (12667,)
X_test Shape: (5429,) - y_test Shape: (5429,)


In [15]:
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=10000)

X_train_tfidf = tfidf.fit_transform(X_train)

X_test_tfidf = tfidf.transform(X_test)

X_train_tfidf.shape, X_test_tfidf.shape

((12667, 10000), (5429, 10000))

In [17]:
sparsity = (1.0 - (X_train_tfidf.nnz / float(X_train_tfidf.shape[0] * X_train_tfidf.shape[1]))) * 100
print(f"Sparsity Percentage of the TF-IDF Matrix: {sparsity:.2f}%")

Sparsity Percentage of the TF-IDF Matrix: 99.00%


In [19]:
print(X_train_tfidf.toarray())

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.02782431 0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [20]:
model_lr_custom_1 = algo_lr_custom.fit(X_train_tfidf, y_train)

In [21]:
model_lr_custom_1.evaluate(X_test_tfidf, y_test)

{'accuracy_score': 0.9563455516669737,
 'precision_score': 0.9728779507785033,
 'recall_score': 0.9136792452830189,
 'f1_score': 0.9423497932376551}

DATASET 2

In [23]:
cleaned_df_2 = pd.read_csv("cleaned_CEAS-08.csv")
cleaned_df_2.head()

Unnamed: 0,cleaned text,label enc
0,buck troubl caus small dimens soon becom lover...,1
1,upgrad sex pleasur techniqu,1
2,daili top 10 cnn com top video stori aug 1 200...,1
3,would anyon object remov list tld basic dead f...,0
4,welcomefastshippingcustomersupport,1


In [24]:
X, y = cleaned_df_2["cleaned text"], cleaned_df_2["label enc"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=15
)

print(f"X_train Shape: {X_train.shape} - y_train Shape: {y_train.shape}")
print(f"X_test Shape: {X_test.shape} - y_test Shape: {y_test.shape}")

X_train Shape: (27406,) - y_train Shape: (27406,)
X_test Shape: (11746,) - y_test Shape: (11746,)


In [25]:
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=10000)

X_train_tfidf = tfidf.fit_transform(X_train)

X_test_tfidf = tfidf.transform(X_test)

X_train_tfidf.shape, X_test_tfidf.shape

((27406, 10000), (11746, 10000))

In [26]:
sparsity = (1.0 - (X_train_tfidf.nnz / float(X_train_tfidf.shape[0] * X_train_tfidf.shape[1]))) * 100
print(f"Sparsity Percentage of the TF-IDF Matrix: {sparsity:.2f}%")

Sparsity Percentage of the TF-IDF Matrix: 99.09%


In [27]:
print(X_train_tfidf.toarray())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [28]:
model_lr_custom_2 = algo_lr_custom.fit(X_train_tfidf, y_train)

In [29]:
model_lr_custom_2.evaluate(X_test_tfidf, y_test)

{'accuracy_score': 0.9811850842840116,
 'precision_score': 0.9803454437164979,
 'recall_score': 0.9866626704630601,
 'f1_score': 0.9834939129135857}

DATASET 3

In [37]:
cleaned_df_3 = pd.read_csv("cleaned_Enron.csv")
cleaned_df_3.head()

Unnamed: 0,cleaned text,label enc
0,perform origin messag issuealert scientech com...,0
1,w w highhest qualiti medd great offfer v codin...,1
2,make r cher might need bundi 1 w ndow x p pro ...,1
3,drug chemic ident brand name equival except pr...,1
4,team project request spreadsheet develop facil...,0


In [38]:
X, y = cleaned_df_3["cleaned text"], cleaned_df_3["label enc"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=15
)

print(f"X_train Shape: {X_train.shape} - y_train Shape: {y_train.shape}")
print(f"X_test Shape: {X_test.shape} - y_test Shape: {y_test.shape}")

X_train Shape: (20827,) - y_train Shape: (20827,)
X_test Shape: (8926,) - y_test Shape: (8926,)


In [39]:
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=10000)

X_train_tfidf = tfidf.fit_transform(X_train)

X_test_tfidf = tfidf.transform(X_test)

X_train_tfidf.shape, X_test_tfidf.shape

((20827, 10000), (8926, 10000))

In [40]:
sparsity = (1.0 - (X_train_tfidf.nnz / float(X_train_tfidf.shape[0] * X_train_tfidf.shape[1]))) * 100
print(f"Sparsity Percentage of the TF-IDF Matrix: {sparsity:.2f}%")

Sparsity Percentage of the TF-IDF Matrix: 99.09%


In [42]:
print(X_train_tfidf.toarray())

[[0.0292113  0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.00989535 0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [44]:
model_lr_custom_3 = algo_lr_custom.fit(X_train_tfidf, y_train)

In [45]:
model_lr_custom_3.evaluate(X_test_tfidf, y_test)

{'accuracy_score': 0.9758010306968407,
 'precision_score': 0.9573778086634237,
 'recall_score': 0.9923169267707083,
 'f1_score': 0.9745343079462392}