In [12]:
# https://drive.google.com/file/d/1N7rk-kfnDFIGMeX0ROVTjKh71gcgx-7R/view?usp=sharing
!gdown --id 1N7rk-kfnDFIGMeX0ROVTjKh71gcgx-7R

Downloading...
From: https://drive.google.com/uc?id=1N7rk-kfnDFIGMeX0ROVTjKh71gcgx-7R
To: e:\AIO Việt Nam\Project Module 2\2cls_spam_text_cls.csv

  0%|          | 0.00/486k [00:00<?, ?B/s]
100%|██████████| 486k/486k [00:00<00:00, 3.45MB/s]
100%|██████████| 486k/486k [00:00<00:00, 3.41MB/s]


In [13]:
import string
import nltk
nltk.download("stopwords")
nltk.download("punkt")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

[nltk_data] Downloading package stopwords to C:\Users\msi
[nltk_data]     laptop\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\msi
[nltk_data]     laptop\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
DATASET_PATH = "2cls_spam_text_cls.csv"
df = pd.read_csv(DATASET_PATH)
print(df.head())
messages = df["Message"].values.tolist()
labels = df["Category"].values.tolist()

  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...


In [15]:
def lowercase(text):
    return text.lower()

def punctuation_removal(text):
    translator = str.maketrans("", "", string.punctuation)
    return text.translate(translator)

def tokenize(text):
    return nltk.word_tokenize(text)

def remove_stopwords(tokens):
    stop_words = set(nltk.corpus.stopwords.words("english"))
    return [token for token in tokens if token not in stop_words]

def stemming(tokens):
    stemmer = nltk.stem.PorterStemmer()
    return [stemmer.stem(token) for token in tokens]

def preprocess_text(text):
    text = lowercase(text)
    text = punctuation_removal(text)
    tokens = tokenize(text)
    tokens = remove_stopwords(tokens)
    tokens = stemming(tokens)
    return " ".join(tokens)  # Trả về chuỗi, không phải list

messages = [preprocess_text(message) for message in messages]


In [16]:
def create_dictionary(messages):
    dictionary = []
    for tokens in messages:
        for token in tokens.split():
            if token not in dictionary:
                dictionary.append(token)
    return dictionary
dictionary = create_dictionary(messages)

In [17]:
def create_features(tokens, dictionary):
    features = np.zeros(len(dictionary))
    for token in tokens:
        if token in dictionary:
            features[dictionary.index(token)] += 1
    return features
X = np.array([create_features(tokens, dictionary) for tokens in messages])

In [18]:
le = LabelEncoder()
y = le.fit_transform(labels)
print(f"Classes: {le.classes_}")
print(f"Encoded labels: {y}")


Classes: ['ham' 'spam']
Encoded labels: [0 0 1 ... 0 0 0]


In [19]:
VAL_SIZE = 0.2
TEST_SIZE = 0.125 # 0.1 / (1-0.2)
SEED = 0

X_train, X_val, y_train, y_val = train_test_split(X, y,
                                                 test_size=VAL_SIZE,
                                                 shuffle=True,
                                                 random_state=SEED)

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train,
test_size=TEST_SIZE,
shuffle=True,
random_state=SEED)

In [20]:
model = GaussianNB()
print("Start training...")
model = model.fit(X_train, y_train)
print("Training completed.")

Start training...
Training completed.


In [21]:
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)

val_accuracy = accuracy_score(y_val, y_val_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Val accuracy: {val_accuracy:.4f}")
print(f"Test accuracy: {test_accuracy:.4f}")

import pickle

# Lưu model
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Lưu dictionary
with open('dictionary.pkl', 'wb') as f:
    pickle.dump(dictionary, f)

# Lưu label encoder
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(le, f)

Val accuracy: 0.7354
Test accuracy: 0.7348


In [31]:
def predict(text, model, dictionary, label_encoder):
    processed_text = preprocess_text(text)
    features = create_features(processed_text, dictionary)
    features = np.array(features).reshape(1, -1)
    prediction = model.predict(features)
    prediction_cls = label_encoder.inverse_transform(prediction)[0]
    return prediction_cls

test_input = "Do u wanna buy a IceCream of this company, Do you want to buy this product?"
prediction_cls = predict(test_input, model, dictionary, le)
print(f"Prediction: {prediction_cls}")


Prediction: spam
