### Nomor 1

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Read the voice dataset. 'label' is assumed to be the target column and
# every remaining column is used as a feature.
data = pd.read_csv('../docs/voice.csv')
y = data['label']
X = data.drop(columns='label')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Standardize the features with statistics learned from the training split
# only, then apply that same transformation to the test split.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Linear-kernel SVM; other kernels such as 'rbf' or 'poly' can be tried here.
svm_model = SVC(kernel='linear').fit(X_train, y_train)

# Report per-class metrics and overall accuracy on the held-out split.
y_pred = svm_model.predict(X_test)
print(classification_report(y_test, y_pred))
print('Akurasi:', accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

      female       0.96      0.99      0.98       297
        male       0.99      0.97      0.98       337

    accuracy                           0.98       634
   macro avg       0.98      0.98      0.98       634
weighted avg       0.98      0.98      0.98       634

Akurasi: 0.9763406940063092


### Nomor 2

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Load the SMS spam dataset (read with latin-1 encoding).
data = pd.read_csv('../docs/spam.csv', encoding='latin-1')

# Keep only the label column ('v1') and the message column ('v2') and give
# them descriptive names. Assumes the file follows that layout.
data = data[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'text'})

# Encode the target numerically: ham -> 0, spam -> 1.
# Series.map turns every value that is missing from the dict into NaN, so
# check for that explicitly and fail here with a message that names the
# offending values, instead of failing later inside model fitting.
label_codes = {'ham': 0, 'spam': 1}
encoded_labels = data['label'].map(label_codes)
unmapped = encoded_labels.isna()
if unmapped.any():
    unexpected = sorted(data.loc[unmapped, 'label'].astype(str).unique())
    raise ValueError(f"Unexpected label values in spam.csv: {unexpected}")
data['label'] = encoded_labels

X = data['text']
y = data['label']

# Hold out 20% of the messages for evaluation; the fixed seed keeps the split
# reproducible. These raw-text splits are reused by the TF-IDF cell below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Bag-of-words counts with English stop words removed. The vocabulary is
# learned from the training messages only and reused for the test messages.
vectorizer = CountVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

nb_model = MultinomialNB()
nb_model.fit(X_train_vec, y_train)

# Report per-class metrics and overall accuracy on the held-out split.
y_pred = nb_model.predict(X_test_vec)
print(classification_report(y_test, y_pred))
print('Akurasi:', accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       965
           1       0.96      0.92      0.94       150

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115

Akurasi: 0.9838565022421525


### Nomor 3

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Repeat the Naive Bayes experiment with TF-IDF weighted features instead of
# raw counts (English stop words removed). X_train / X_test / y_train / y_test
# are not defined in this cell; they are the splits left by an earlier cell.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Vocabulary and IDF weights are learned from the training messages only and
# then reused to encode the test messages.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

nb_model_tfidf = MultinomialNB().fit(X_train_tfidf, y_train)

# Report per-class metrics and overall accuracy on the held-out split.
y_pred_tfidf = nb_model_tfidf.predict(X_test_tfidf)
print(classification_report(y_test, y_pred_tfidf))
print('Akurasi:', accuracy_score(y_test, y_pred_tfidf))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115

Akurasi: 0.9668161434977578


Berdasarkan hasil evaluasi, model Multinomial Naive Bayes dengan CountVectorizer memberikan performa yang lebih baik secara keseluruhan dibandingkan TF-IDF: akurasi 0,98 berbanding 0,97, serta recall (0,92 berbanding 0,75) dan F1-score (0,94 berbanding 0,86) yang lebih tinggi untuk kelas spam. Meskipun precision untuk kelas spam sedikit lebih rendah (0,96 berbanding 1,00), model dengan CountVectorizer lebih sensitif dan lebih seimbang dalam mendeteksi pesan spam. Oleh karena itu, CountVectorizer lebih efektif daripada TF-IDF untuk klasifikasi pesan spam dalam kasus ini.