In [1]:
# Import pustaka pandas untuk mengelola data dalam format DataFrame
import pandas as pd

# Membaca dataset dari file spam.csv
df = pd.read_csv('spam.csv', encoding='ISO-8859-1')

In [2]:
# Menampilkan lima baris pertama dataset untuk verifikasi
print(df.head())

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [3]:
# Memilih kolom yang relevan untuk dianalisis
df = df[['v1', 'v2']] # Pilih hanya kolom label dan teks

# Mengganti nama kolom untuk kejelasan
df.columns = ['label', 'text']

In [4]:
df

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [5]:
# Import pustaka yang diperlukan untuk pemrosesan teks
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

# Mengunduh data yang diperlukan dari nltk
nltk.download('punkt')
nltk.download('stopwords')

# Membuat objek stemming dan stopwords
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Fungsi untuk memproses teks
def preprocess_text(text):
    # Hapus karakter non-alfabet
    text = re.sub('[^A-Za-z]', ' ', text)
    
    # Ubah teks menjadi huruf kecil
    text = text.lower()
    
    # Tokenisasi teks
    tokens = word_tokenize(text)
    
    # Hapus stopwords
    tokens = [word for word in tokens if word not in stop_words]
    
    # Stemming pada setiap kata
    tokens = [stemmer.stem(word) for word in tokens]
    
    # Gabungkan kembali kata-kata menjadi teks
    return " ".join(tokens)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\achma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\achma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Import CountVectorizer untuk representasi teks dalam bentuk Bag of Words
from sklearn.feature_extraction.text import CountVectorizer

# Menyimpan kolom teks yang telah diproses ke dalam variabel corpus
corpus = df['text']

# Membuat objek CountVectorizer dengan batas maksimum fitur
vectorizer = CountVectorizer(max_features=1000)

# Mentransformasi teks menjadi representasi numerik
x = vectorizer.fit_transform(corpus).toarray()

# Mengubah label menjadi angka (ham = 0, spam =1)
y = df['label'].map({'ham': 0, 'spam': 1}).values

In [7]:
# Import fungsi untuk membagi dataset menjadi data latih dan uji
from sklearn.model_selection import train_test_split

# Membagi dataset dengan proporsi 75% Latih dan 25% Uji
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [8]:
# Import model Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB

# Membuat objek model
classifier = GaussianNB()

# Melatih model menggunakan data latih
classifier.fit(x_train, y_train)

In [9]:
# Import fungsi untuk menghitung akurasi
from sklearn.metrics import accuracy_score

# Memprediksi data uji
y_pred = classifier.predict(x_test)

# Menghitung akurasi prediksi
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

Model Accuracy: 79.90%


In [10]:
# Teks baru yang ingin diprediksi
new_sms = "Go until jurong point, crazy.. Available only"

# Memproses teks baru menggunakan fungsi preprocess_text
new_sms_processed = preprocess_text(new_sms)

# Mentransformasi teks baru menjadi representasi Bag of Words
new_vector = vectorizer.transform([new_sms_processed]).toarray()

# Memprediksi label teks baru
prediction = classifier.predict(new_vector)

# Menampilkan hasil prediksi
print("Spam" if prediction[0] == 1 else "Not Spam")

NameError: name 'text_lower' is not defined