<a href="https://colab.research.google.com/github/btduyforwork/spam_classification/blob/main/Project_2_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 0. Tải bộ dữ liệu


In [1]:
# https://drive.google.com/file/d/1N7rk-kfnDFIGMeX0ROVTjKh71gcgx-7R/view?usp=sharing
!gdown 1N7rk-kfnDFIGMeX0ROVTjKh71gcgx-7R

Downloading...
From: https://drive.google.com/uc?id=1N7rk-kfnDFIGMeX0ROVTjKh71gcgx-7R
To: /content/2cls_spam_text_cls.csv
  0% 0.00/486k [00:00<?, ?B/s]100% 486k/486k [00:00<00:00, 93.5MB/s]


## 1. Import các thư viện cần thiết

In [2]:
import string
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


## 2. Đọc và tách dữ liệu

In [3]:
DATASET_PATH="/content/2cls_spam_text_cls.csv"
df=pd.read_csv(DATASET_PATH)
labels=df["Category"].values.tolist()
messages=df["Message"].values.tolist()

In [4]:
labels[0:10]

['ham', 'ham', 'spam', 'ham', 'ham', 'spam', 'ham', 'ham', 'spam', 'spam']

### 2.1. Xử lý dữ liệu nhãn

In [5]:
le = LabelEncoder()
y = le.fit_transform(labels)
print(f'Classes: {le.classes_}')
print(f'Encoded labels: {y}')

Classes: ['ham' 'spam']
Encoded labels: [0 0 1 ... 0 0 0]


## 3. Tiền xử lý dữ liệu

In [6]:
def lowercase(text):
    return text.lower()

def punctuation_removal(text):
    table = str.maketrans("", "", string.punctuation)
    return text.translate(table)

def tokenize(text):
    return nltk.word_tokenize(text)

def remove_stopwords(tokens):
    stop_words = nltk.corpus.stopwords.words('english')
    tokens = [token for token in tokens if token not in stop_words]
    return tokens

def stemming(tokens):
    stemmer=nltk.stem.PorterStemmer()
    stem_tokens=[stemmer.stem(word) for word in tokens]
    return stem_tokens

def preprocess_text(text):
    text = lowercase(text)
    text = punctuation_removal(text)
    tokens = tokenize(text)
    tokens = remove_stopwords(tokens)
    tokens = stemming(tokens)

    return tokens
# preprocess_text("The cat's toys are scattered everywhere!")

In [7]:
pro_messages = [preprocess_text(message) for message in messages]
pro_messages[0:5]

[['go',
  'jurong',
  'point',
  'crazi',
  'avail',
  'bugi',
  'n',
  'great',
  'world',
  'la',
  'e',
  'buffet',
  'cine',
  'got',
  'amor',
  'wat'],
 ['ok', 'lar', 'joke', 'wif', 'u', 'oni'],
 ['free',
  'entri',
  '2',
  'wkli',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkt',
  '21st',
  'may',
  '2005',
  'text',
  'fa',
  '87121',
  'receiv',
  'entri',
  'questionstd',
  'txt',
  'ratetc',
  'appli',
  '08452810075over18'],
 ['u', 'dun', 'say', 'earli', 'hor', 'u', 'c', 'alreadi', 'say'],
 ['nah', 'dont', 'think', 'goe', 'usf', 'live', 'around', 'though']]

## 4. Create a dictionary & Features

In [8]:
def create_dictionary(messages):
  dictionary=[]
  for tokens in messages:
    for token in tokens:
      if token not in dictionary:
        dictionary.append(token)
  return dictionary

def create_features(tokens, dictionary):
  features = np.zeros(len(dictionary))
  for token in tokens:
    if token in dictionary:
      index=dictionary.index(token)
      features[index]+=1
  return features

In [9]:
dictionary = create_dictionary(pro_messages)
X=[create_features(tokens,dictionary)for tokens in pro_messages]
len(X)
len(y)

5572

## 5. Chia dữ liệu train/val/test

> Add blockquote



In [10]:
TRAIN_SIZE = 0.7
VAL_SIZE = 1/3
SEED = 0
X_val, X_train, y_val, y_train = train_test_split(
    X,
    y,
    test_size=TRAIN_SIZE,
    shuffle=True,
    random_state=SEED
)
X_val, X_test, y_val, y_test = train_test_split(
    X_val,
    y_val,
    test_size=VAL_SIZE,
    shuffle=True,
    random_state=SEED
)

In [11]:
len(X_test)

557

## 6. Huấn luyện mô hình


In [12]:
%%time
model=MultinomialNB()
print("Start training ... ")
model.fit(X_train,y_train)
print("Training Completed !!!")


Start training ... 
Training Completed !!!
CPU times: user 161 ms, sys: 57.7 ms, total: 219 ms
Wall time: 191 ms


## 7. Đánh giá


In [13]:
X_val_prediction=model.predict(X_val)
X_test_prediction=model.predict(X_test)
print(np.array(X_val))
val_accuracy=accuracy_score(y_val,X_val_prediction)
test_accuracy=accuracy_score(y_test,X_test_prediction)

print(f'Validation Accuracy: {val_accuracy}')
print(f'Test Accuracy: {test_accuracy}')


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Validation Accuracy: 0.9793536804308797
Test Accuracy: 0.9694793536804309


## 7. Prediction

In [46]:
def predict(text, model, dictionary):
    processed_text = preprocess_text(text)
    features = create_features(text, dictionary)
    features = np.array(features).reshape(1,-1)
    prediction = model.predict(features)
    prediction = le.inverse_transform(prediction)[0]
    return str(prediction)

In [47]:
test_input = 'I am actually thinking a way of doing something useful'
prediction = predict(test_input, model, dictionary)
prediction

'ham'