# JS09 - TUGAS 2

In [36]:
# LOAD DATA
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

file_id = "1ebBtW6kQiBgtQ4qbrzEeq45d6iZuPRGR"
url = f"https://drive.google.com/uc?id={file_id}"
df = pd.read_csv(url, encoding='latin-1')
df.head()

# Biasanya spam.csv memiliki kolom ekstra tak digunakan
df = df.rename(columns={df.columns[0]: "label", df.columns[1]: "text"})
df = df[["label", "text"]]

X = df["text"]
y = df["label"]

# COUNT VECTORIZER + STOP WORDS
cv = CountVectorizer(stop_words="english")
X_cv = cv.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_cv, y, test_size=0.2, random_state=42
)

model_cv = MultinomialNB()
model_cv.fit(X_train, y_train)
pred_cv = model_cv.predict(X_test)

acc_cv = accuracy_score(y_test, pred_cv)
report_cv = classification_report(y_test, pred_cv)

print("===================================================")
print("   COUNT VECTORIZER + MULTINOMIAL NB")
print("===================================================")
print("Accuracy:", acc_cv)
print(report_cv)


# TF-IDF + STOP WORDS
tfidf = TfidfVectorizer(stop_words="english")
X_tfidf = tfidf.fit_transform(X)

X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42
)

model_tfidf = MultinomialNB()
model_tfidf.fit(X_train2, y_train2)
pred_tfidf = model_tfidf.predict(X_test2)

acc_tfidf = accuracy_score(y_test2, pred_tfidf)
report_tfidf = classification_report(y_test2, pred_tfidf)

print("===================================================")
print("      TF-IDF + MULTINOMIAL NB")
print("===================================================")
print("Accuracy:", acc_tfidf)
print(report_tfidf)


# PERBANDINGAN & KESIMPULAN
print("===================================================")
print("                 PERBANDINGAN AKURASI")
print("===================================================")
print("CountVectorizer Accuracy :", acc_cv)
print("TF-IDF Accuracy          :", acc_tfidf)

if acc_tfidf > acc_cv:
    print("\nKesimpulan: TF-IDF adalah fitur terbaik untuk kasus spam.csv")
else:
    print("\nKesimpulan: CountVectorizer adalah fitur terbaik untuk kasus spam.csv")


   COUNT VECTORIZER + MULTINOMIAL NB
Accuracy: 0.9802690582959641
              precision    recall  f1-score   support

         ham       0.99      0.99      0.99       965
        spam       0.93      0.93      0.93       150

    accuracy                           0.98      1115
   macro avg       0.96      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115

      TF-IDF + MULTINOMIAL NB
Accuracy: 0.968609865470852
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       965
        spam       1.00      0.77      0.87       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.93      1115
weighted avg       0.97      0.97      0.97      1115

                 PERBANDINGAN AKURASI
CountVectorizer Accuracy : 0.9802690582959641
TF-IDF Accuracy          : 0.968609865470852

Kesimpulan: CountVectorizer adalah fitur terbaik untuk kasus spam.csv
