In [42]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
from nltk.stem import PorterStemmer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from tabulate import tabulate
from bayes_opt import BayesianOptimization
from sklearn.model_selection import GridSearchCV

In [43]:
file_path = 'spam.csv'
data = pd.read_csv(file_path)
data['Category'] = data['Category'].map({'ham': 0, 'spam': 1})


In [44]:
def clean_text(text):
    # Removing punctuation marks and converting to lowercase
    return ''.join([char for char in text if char not in string.punctuation]).lower()

data['Cleaned_Message'] = data['Message'].apply(clean_text)

def preprocess_text(text):
    # Tokenizing, removing stopwords, and performing stemming
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    processed_words = [stemmer.stem(word) for word in words if word.lower() not in stop_words]
    return ' '.join(processed_words)

# Applying text preprocessing
X = data['Cleaned_Message'].apply(preprocess_text)
y = data['Category']

# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Using CountVectorizer
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

DEFAULT PARAMETER SETTINGS FOR SVM AND NAIVE BAYES

In [45]:
def train_model(model, X_train, y_train, X_test):
    model.fit(X_train, y_train)
    return model.predict(X_test)

def calculate_metrics(y_test, predictions):
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
    specificity = tn / (tn + fp)
    return [accuracy, precision, recall, f1, specificity]

def print_table(model_name, metrics):
    row = [model_name] + ["{:.4f}%".format(m * 100) for m in metrics]
    return row

svm_model = SVC(kernel='linear', random_state=42)
svm_pred = train_model(svm_model, X_train_vectorized, y_train, X_test_vectorized)
svm_metrics = calculate_metrics(y_test, svm_pred)

nb_model = MultinomialNB()
nb_pred = train_model(nb_model, X_train_vectorized, y_train, X_test_vectorized)
nb_metrics = calculate_metrics(y_test, nb_pred)

data = [print_table("SVM", svm_metrics), print_table("Naive Bayes", nb_metrics)]

headers = ["Model", "Accuracy", "Precision", "Recall", "F1 Score", "Specificity"]
table = tabulate(data, headers, tablefmt="fancy_grid")
print(table)

╒═════════════╤════════════╤═════════════╤══════════╤════════════╤═══════════════╕
│ Model       │ Accuracy   │ Precision   │ Recall   │ F1 Score   │ Specificity   │
╞═════════════╪════════════╪═════════════╪══════════╪════════════╪═══════════════╡
│ SVM         │ 98.4753%   │ 99.2537%    │ 89.2617% │ 93.9929%   │ 99.8965%      │
├─────────────┼────────────┼─────────────┼──────────┼────────────┼───────────────┤
│ Naive Bayes │ 98.3857%   │ 97.1223%    │ 90.6040% │ 93.7500%   │ 99.5859%      │
╘═════════════╧════════════╧═════════════╧══════════╧════════════╧═══════════════╛


BAYESIAN OPTIMIZATION PARAMETER TUNING FOR SVM AND NAIVE BAYES

In [46]:
def train_model_with_params(C, gamma, kernel):
    """Train the model with given hyperparameters and return the accuracy"""
    model = SVC(C=C, gamma=gamma, kernel='linear' if kernel == 0 else 'rbf', random_state=42)
    model.fit(X_train_vectorized, y_train)
    predictions = model.predict(X_test_vectorized)
    return accuracy_score(y_test, predictions)

search_space = {'C': (0.01, 10), 'gamma': (0.001, 1), 'kernel': (0, 1)}
optimizer = BayesianOptimization(f=train_model_with_params, pbounds=search_space, random_state=42, verbose=2)
optimizer.maximize(init_points=5, n_iter=15)

# Get the best parameters from Bayesian optimization
best_params = optimizer.max

# Adjusting the 'kernel' parameter to correspond with the SVC parameter
best_params['params']['kernel'] = 'linear' if best_params['params']['kernel'] < 0.5 else 'rbf'

# Train the SVM model with the best hyperparameters
best_model = SVC(**best_params['params'], random_state=42)
best_model.fit(X_train_vectorized, y_train)
predictions_with_best_params = best_model.predict(X_test_vectorized)

|   iter    |  target   |     C     |   gamma   |  kernel   |
-------------------------------------------------------------
| [0m1        [0m | [0m0.9058   [0m | [0m3.752    [0m | [0m0.9508   [0m | [0m0.732    [0m |
| [95m2        [0m | [95m0.9659   [0m | [95m5.991    [0m | [95m0.1569   [0m | [95m0.156    [0m |
| [0m3        [0m | [0m0.8744   [0m | [0m0.5903   [0m | [0m0.8663   [0m | [0m0.6011   [0m |
| [95m4        [0m | [95m0.983    [0m | [95m7.084    [0m | [95m0.02156  [0m | [95m0.9699   [0m |
| [0m5        [0m | [0m0.9525   [0m | [0m8.326    [0m | [0m0.2131   [0m | [0m0.1818   [0m |
| [0m6        [0m | [0m0.9049   [0m | [0m6.88     [0m | [0m1.0      [0m | [0m1.0      [0m |
| [0m7        [0m | [0m0.983    [0m | [0m7.112    [0m | [0m0.02232  [0m | [0m0.9983   [0m |
| [95m8        [0m | [95m0.9874   [0m | [95m7.158    [0m | [95m0.001    [0m | [95m0.0      [0m |
| [0m9        [0m | [0m0.9695   [0m | [0

In [47]:
def train_model_with_alpha(alpha):
    """Train the model with given hyperparameters and return the accuracy"""
    model = MultinomialNB(alpha=alpha)
    model.fit(X_train_vectorized, y_train)
    predictions = model.predict(X_test_vectorized)
    return accuracy_score(y_test, predictions)

nb_search_space = {'alpha': (0.01, 1)}
nb_optimizer = BayesianOptimization(f=train_model_with_alpha, pbounds=nb_search_space, random_state=42, verbose=2)
nb_optimizer.maximize(init_points=5, n_iter=15)

# Get the best parameter from Bayesian optimization
best_params = nb_optimizer.max

# Train the Naive Bayes model with the best hyperparameters
best_model = MultinomialNB(**best_params['params'])
best_model.fit(X_train_vectorized, y_train)
nb_predictions_with_best_params = best_model.predict(X_test_vectorized)

|   iter    |  target   |   alpha   |
-------------------------------------
| [0m1        [0m | [0m0.9848   [0m | [0m0.3808   [0m |
| [0m2        [0m | [0m0.9839   [0m | [0m0.9512   [0m |
| [0m3        [0m | [0m0.9839   [0m | [0m0.7347   [0m |
| [0m4        [0m | [0m0.9839   [0m | [0m0.6027   [0m |
| [0m5        [0m | [0m0.9848   [0m | [0m0.1645   [0m |
| [0m6        [0m | [0m0.9821   [0m | [0m0.01     [0m |
| [0m7        [0m | [0m0.9848   [0m | [0m0.1644   [0m |
| [0m8        [0m | [0m0.9848   [0m | [0m0.2858   [0m |
| [0m9        [0m | [0m0.9848   [0m | [0m0.4748   [0m |
| [0m10       [0m | [0m0.9848   [0m | [0m0.2242   [0m |
| [0m11       [0m | [0m0.9839   [0m | [0m0.8427   [0m |
| [0m12       [0m | [0m0.9848   [0m | [0m0.4304   [0m |
| [0m13       [0m | [0m0.9848   [0m | [0m0.3321   [0m |
| [0m14       [0m | [0m0.9848   [0m | [0m0.1906   [0m |
| [0m15       [0m | [0m0.9848   [0m | [0m0.5137   

In [48]:
# Support Vector Machine (SVM)
svm_accuracy_bayes = accuracy_score(y_test, svm_pred_bayes)
svm_precision_bayes = precision_score(y_test, svm_pred_bayes)
svm_recall_bayes = recall_score(y_test, svm_pred_bayes)
svm_f1_bayes = f1_score(y_test, svm_pred_bayes)
tn, fp, fn, tp = confusion_matrix(y_test, svm_pred_bayes).ravel()
svm_bayes_specificity = tn / (tn + fp)
# Naive Bayes
nb_accuracy_bayes = accuracy_score(y_test, nb_pred_bayes)
nb_precision_bayes = precision_score(y_test, nb_pred_bayes)
nb_recall_bayes = recall_score(y_test, nb_pred_bayes)
nb_f1_bayes = f1_score(y_test, nb_pred_bayes)
tn, fp, fn, tp = confusion_matrix(y_test, nb_pred_bayes).ravel()
nb_bayes_specificity = tn / (tn + fp)

data = [
    ["Support Vector Machine (SVM)", "{:.4f}%".format(svm_accuracy * 100), "{:.4f}%".format(
        svm_precision * 100), "{:.4f}%".format(svm_recall * 100), "{:.4f}%".format(svm_f1 * 100), "{:.4f}%".format(svm_specificity * 100)],
    ["Naive Bayes", "{:.4f}%".format(nb_accuracy * 100), "{:.4f}%".format(
        nb_precision * 100), "{:.4f}%".format(nb_recall * 100), "{:.4f}%".format(nb_f1 * 100), "{:.4f}%".format(nb_specificity * 100)],
    ["Bayes Optimized SVM", "{:.4f}%".format(svm_accuracy_bayes * 100), "{:.4f}%".format(
        svm_precision_bayes * 100), "{:.4f}%".format(svm_recall_bayes * 100), "{:.4f}%".format(svm_f1_bayes * 100), "{:.4f}%".format(svm_bayes_specificity * 100)],
    ["Bayes Optimized Naive Bayes", "{:.4f}%".format(nb_accuracy_bayes * 100), "{:.4f}%".format(
        nb_precision_bayes * 100), "{:.4f}%".format(nb_recall_bayes * 100), "{:.4f}%".format(nb_f1 * 100), "{:.4f}%".format(nb_bayes_specificity * 100)]
]

# Menentukan header tabel
headers = ["Model", "Accuracy", "Precision", "Recall", "F1 Score", "Specificity"]

# Mencetak tabel evaluasi
table = tabulate(data, headers, tablefmt="fancy_grid")
print(table)

╒══════════════════════════════╤════════════╤═════════════╤══════════╤════════════╤═══════════════╕
│ Model                        │ Accuracy   │ Precision   │ Recall   │ F1 Score   │ Specificity   │
╞══════════════════════════════╪════════════╪═════════════╪══════════╪════════════╪═══════════════╡
│ Support Vector Machine (SVM) │ 98.4753%   │ 99.2537%    │ 89.2617% │ 93.9929%   │ 99.8965%      │
├──────────────────────────────┼────────────┼─────────────┼──────────┼────────────┼───────────────┤
│ Naive Bayes                  │ 98.3857%   │ 97.1223%    │ 90.6040% │ 93.7500%   │ 99.5859%      │
├──────────────────────────────┼────────────┼─────────────┼──────────┼────────────┼───────────────┤
│ Bayes Optimized SVM          │ 98.8341%   │ 99.2754%    │ 91.9463% │ 95.4704%   │ 99.8965%      │
├──────────────────────────────┼────────────┼─────────────┼──────────┼────────────┼───────────────┤
│ Bayes Optimized Naive Bayes  │ 98.4753%   │ 97.1429%    │ 91.2752% │ 93.7500%   │ 99.5859%      │


GRID SEARCH PARAMETER TUNING FOR SVM AND NAIVE BAYES

In [49]:
def param_search(model, param_grid, X_train, y_train):
    grid_search = GridSearchCV(model, param_grid, scoring='f1', cv=5)
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_

svm_param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'gamma': [0.001, 0.01, 0.1, 1],
    'kernel': ['linear', 'rbf']
}

nb_param_grid = {
    'alpha': [0.01, 0.1, 0.5, 1]
}

svm_best_model = param_search(SVC(random_state=42), svm_param_grid, X_train_vectorized, y_train)
svm_pred_grid = svm_best_model.predict(X_test_vectorized)

nb_best_model = param_search(MultinomialNB(), nb_param_grid, X_train_vectorized, y_train)
nb_pred_grid = nb_best_model.predict(X_test_vectorized)

In [50]:
# Calculate accuracy, precision, recall, and F1 score for SVM
svm_accuracy_grid = accuracy_score(y_test, svm_pred_grid)
svm_precision_grid = precision_score(y_test, svm_pred_grid)
svm_recall_grid = recall_score(y_test, svm_pred_grid)
svm_f1_grid = f1_score(y_test, svm_pred_grid)
tn, fp, fn, tp = confusion_matrix(y_test, svm_pred_grid).ravel()
svm_grid_specificity = tn / (tn + fp)
# Calculate accuracy, precision, recall, and F1 score for Naive Bayes
nb_accuracy_grid = accuracy_score(y_test, nb_pred_grid)
nb_precision_grid = precision_score(y_test, nb_pred_grid)
nb_recall_grid = recall_score(y_test, nb_pred_grid)
nb_f1_grid = f1_score(y_test, nb_pred_grid)
tn, fp, fn, tp = confusion_matrix(y_test, nb_pred_grid).ravel()
nb_grid_specificity = tn / (tn + fp)

# Mengatur data evaluasi dalam bentuk tabel
data = [
    ["Support Vector Machine (SVM)", "{:.4f}%".format(svm_accuracy * 100), "{:.4f}%".format(
        svm_precision * 100), "{:.4f}%".format(svm_recall * 100), "{:.4f}%".format(svm_f1 * 100), "{:.4f}%".format(svm_specificity * 100)],
    ["Naive Bayes", "{:.4f}%".format(nb_accuracy * 100), "{:.4f}%".format(
        nb_precision * 100), "{:.4f}%".format(nb_recall * 100), "{:.4f}%".format(nb_f1 * 100), "{:.4f}%".format(nb_specificity * 100)],
    ["Bayes Optimized SVM", "{:.4f}%".format(svm_accuracy_bayes * 100), "{:.4f}%".format(
        svm_precision_bayes * 100), "{:.4f}%".format(svm_recall_bayes * 100), "{:.4f}%".format(svm_f1_bayes * 100), "{:.4f}%".format(svm_bayes_specificity * 100)],
    ["Bayes Optimized Naive Bayes", "{:.4f}%".format(nb_accuracy_bayes * 100), "{:.4f}%".format(
        nb_precision_bayes * 100), "{:.4f}%".format(nb_recall_bayes * 100), "{:.4f}%".format(nb_f1 * 100), "{:.4f}%".format(nb_bayes_specificity * 100)],
    ["Grid Support Vector Machine (SVM)", "{:.4f}%".format(svm_accuracy_grid * 100), "{:.4f}%".format(svm_precision_grid * 100),
     "{:.4f}%".format(svm_recall_grid * 100), "{:.4f}%".format(svm_f1_grid * 100), "{:.4f}%".format(svm_grid_specificity * 100)],
    ["Grid Naive Bayes", "{:.4f}%".format(nb_accuracy_grid * 100), "{:.4f}%".format(nb_precision_grid * 100),
     "{:.4f}%".format(nb_recall_grid * 100), "{:.4f}%".format(nb_f1_grid * 100), "{:.4f}%".format(nb_grid_specificity * 100)]
]

# Menentukan header tabel
headers = ["Model", "Accuracy", "Precision", "Recall", "F1 Score", "Specificity"]

# Mencetak tabel evaluasi
table = tabulate(data, headers, tablefmt="fancy_grid")
print(table)

╒═══════════════════════════════════╤════════════╤═════════════╤══════════╤════════════╤═══════════════╕
│ Model                             │ Accuracy   │ Precision   │ Recall   │ F1 Score   │ Specificity   │
╞═══════════════════════════════════╪════════════╪═════════════╪══════════╪════════════╪═══════════════╡
│ Support Vector Machine (SVM)      │ 98.4753%   │ 99.2537%    │ 89.2617% │ 93.9929%   │ 99.8965%      │
├───────────────────────────────────┼────────────┼─────────────┼──────────┼────────────┼───────────────┤
│ Naive Bayes                       │ 98.3857%   │ 97.1223%    │ 90.6040% │ 93.7500%   │ 99.5859%      │
├───────────────────────────────────┼────────────┼─────────────┼──────────┼────────────┼───────────────┤
│ Bayes Optimized SVM               │ 98.8341%   │ 99.2754%    │ 91.9463% │ 95.4704%   │ 99.8965%      │
├───────────────────────────────────┼────────────┼─────────────┼──────────┼────────────┼───────────────┤
│ Bayes Optimized Naive Bayes       │ 98.4753%   │ 97.1