In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Load the dataset from 'spam.csv'. The encoding is passed explicitly because,
# per the original author's note, the file is not UTF-8 encoded.
df = pd.read_csv('spam.csv', encoding='latin-1')

# Display the first rows of the loaded frame as a quick sanity check.
df.head()

In [None]:
# Preprocessing: keep only the two columns that are needed and give them
# descriptive names ('v1' -> 'label', 'v2' -> 'text').
#
# The rename is done BEFORE the column selection on purpose: DataFrame.rename
# silently ignores keys that are not present, so re-running this cell after
# `df` has already been renamed is a no-op instead of a KeyError on
# ['v1', 'v2']. The result on the first run is unchanged: a frame with exactly
# the columns ['label', 'text'], in that order.
df = df.rename(columns={'v1': 'label', 'v2': 'text'})[['label', 'text']]

In [None]:
# Split the data into a training set and a test set.
# X holds the 'text' column (model input), y holds the 'label' column (target).
X, y = df['text'], df['label']

# Hold out 20% of the rows for testing; the fixed random_state makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Create the TF-IDF vectorizer, configured to drop English stop words.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Convert the text into TF-IDF feature vectors. The tuple is evaluated left to
# right, so the vectorizer is fitted on the training text first and the test
# text is then only transformed with that already-fitted vectorizer.
X_train_tfidf, X_test_tfidf = (
    tfidf_vectorizer.fit_transform(X_train),
    tfidf_vectorizer.transform(X_test),
)


In [None]:
# Create a Multinomial Naive Bayes classifier with its default settings.
# It is not trained here; fitting happens in a later cell.
naive_bayes = MultinomialNB()

In [None]:
# Train the model on the TF-IDF training features, then predict labels for the
# test features. `fit` returns the estimator itself, so both steps are chained;
# `naive_bayes` is still fitted in place and remains usable afterwards.
y_pred = naive_bayes.fit(X_train_tfidf, y_train).predict(X_test_tfidf)

In [None]:
# Evaluate the predictions against the true test labels.
# The three metrics are independent of each other; each compares the same
# (y_test, y_pred) pair.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
class_report = classification_report(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)


In [None]:
# Show the results: one print call per metric, each as "<heading> <value>".
for heading, value in (
    ("Akurasi:", accuracy),
    ("Confusion Matrix:\n", conf_matrix),
    ("Classification Report:\n", class_report),
):
    print(heading, value)

In [None]:
# Repeat the modelling steps with CountVectorizer, as done in Task 2, then
# compute the accuracy, confusion matrix, and classification report.
#
# CountVectorizer and MultinomialNB are imported once in the notebook's import
# cell at the top, so this cell no longer carries its own import statements.

# Build count features: fit the vectorizer on the training text only, then
# reuse the fitted vectorizer to transform the test text.
count_vectorizer = CountVectorizer(stop_words='english')
X_train_count = count_vectorizer.fit_transform(X_train)
X_test_count = count_vectorizer.transform(X_test)

# Train a separate Multinomial Naive Bayes model on the count features and
# predict labels for the test set.
naive_bayes_count = MultinomialNB()
naive_bayes_count.fit(X_train_count, y_train)
y_pred_count = naive_bayes_count.predict(X_test_count)

# Evaluate the predictions against the true test labels.
accuracy_count = accuracy_score(y_test, y_pred_count)
conf_matrix_count = confusion_matrix(y_test, y_pred_count)
class_report_count = classification_report(y_test, y_pred_count)

# Show the results.
print("Evaluasi dengan CountVectorizer:")
print("Akurasi:", accuracy_count)
print("Confusion Matrix:\n", conf_matrix_count)
print("Classification Report:\n", class_report_count)


In [None]:
# Repeat the modelling steps with the TF-IDF vectorizer, as done in Task 3,
# then compute the accuracy, confusion matrix, and classification report.
#
# TfidfVectorizer and MultinomialNB are already imported in the notebook's
# import cell at the top, so the duplicate in-cell imports were removed.

# Build TF-IDF features: fit the vectorizer on the training text only, then
# reuse the fitted vectorizer to transform the test text.
# NOTE: this rebinds tfidf_vectorizer / X_train_tfidf / X_test_tfidf, which an
# earlier cell also assigns using the same vectorizer configuration.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train a separate Multinomial Naive Bayes model on the TF-IDF features and
# predict labels for the test set.
naive_bayes_tfidf = MultinomialNB()
naive_bayes_tfidf.fit(X_train_tfidf, y_train)
y_pred_tfidf = naive_bayes_tfidf.predict(X_test_tfidf)

# Evaluate the predictions against the true test labels.
accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf)
conf_matrix_tfidf = confusion_matrix(y_test, y_pred_tfidf)
class_report_tfidf = classification_report(y_test, y_pred_tfidf)

# Show the results.
print("Evaluasi dengan TF-IDF Vectorizer:")
print("Akurasi:", accuracy_tfidf)
print("Confusion Matrix:\n", conf_matrix_tfidf)
print("Classification Report:\n", class_report_tfidf)
