### Library

In [1]:
import string
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\quanb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\quanb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Dataset

In [2]:
df = pd.read_csv("./2cls_spam_text_cls.csv")

messages = df['Message'].values.tolist()
labels = df['Category'].values.tolist()

### Preprocessing

In [3]:
def lower_case(text):
    return text.lower()

def punctuation_removal(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def tokenize(text):
    return nltk.word_tokenize(text)

def remove_stopwords(text):
    stopwords = set(nltk.corpus.stopwords.words('english'))
    return [word for word in text if word not in stopwords]

def stemming(tokens):
    return [stemmer.stem(token) for token in tokens]

def preprocess(text):
    text = lower_case(text)
    text = punctuation_removal(text)
    text = tokenize(text)
    text = remove_stopwords(text)

    return text

messages = [preprocess(message) for message in messages]
print(messages)



In [4]:
def create_dictionary(messages):
    dictionary = []
    for message in messages:
        for word in message:
            if word not in dictionary:
                dictionary.append(word)
    return dictionary

dictionary = create_dictionary(messages)
print(dictionary)



In [5]:
def create_features(messages, dictionary):
    features = np.zeros(len(dictionary))
    
    for word in messages:
        if word in dictionary:
            features[dictionary.index(word)] += 1

    return features

features = [create_features(message, dictionary) for message in messages]
print(features)

[array([1., 1., 1., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([1., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 

In [6]:
le = LabelEncoder()
y = le.fit_transform(labels)
print(f"Classes : {le.classes_}")
print(f"Encoded labels : {y}")

Classes : ['ham' 'spam']
Encoded labels : [0 0 1 ... 0 0 0]


### Train, Val and Test

In [7]:
val_size = 0.2
test_size = 0.125
seed = 0

x_train, x_val, y_train, y_val = train_test_split(features, y, 
                                                test_size=val_size,
                                                shuffle=True, 
                                                random_state=seed)

x_train, x_test, y_train, y_test = train_test_split(x_train, y_train,
                                                test_size=test_size,
                                                shuffle=True,
                                                random_state=seed)

### Train Model

In [9]:
model = GaussianNB()
print("Start training")
model = model.fit(x_train, y_train)
print("Training done")

Start training
Training done


### Accuracy Score

In [10]:
y_val_pred = model.predict(x_val)
y_test_pred = model.predict(x_test)

val_accuracy = accuracy_score(y_val, y_val_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Validation accuracy : {val_accuracy}")
print(f"Test accuracy : {test_accuracy}")

Validation accuracy : 0.8968609865470852
Test accuracy : 0.8960573476702509


### Prediction

In [12]:
def predict(message, model, dictionary):
    message = preprocess(message)
    features = create_features(message, dictionary)
    features = np.array(features).reshape(1, -1)
    prediction = model.predict(features)
    
    return le.inverse_transform(prediction)[0]

test_input = "Congratulations! You have been selected as a winner. Text WON to 44255 to claim your prize."
print(f"Prediction: {predict(test_input, model, dictionary)}")

Prediction: spam
