In [None]:
!gdown --id 1N7rk-kfnDFIGMeX0ROVTjKh71gcgx-7R

Downloading...
From: https://drive.google.com/uc?id=1N7rk-kfnDFIGMeX0ROVTjKh71gcgx-7R
To: /content/2cls_spam_text_cls.csv
100% 486k/486k [00:00<00:00, 12.7MB/s]


In [None]:
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [24]:
dataset_path = "/content/2cls_spam_text_cls.csv"
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [25]:
messages = df["Message"].values.tolist()
labes = df["Category"].values.tolist()

In [26]:
# Convert all texts to lowercase
# Convert all texts to lowercase
def lowercase(text):
    return text.lower()  # Add parentheses to call the function

# Remove punctuation
def punctuation_removal(text):
    translator =  str.maketrans("", "", string.punctuation)
    return text.translate(translator)

# Split the texts into individual words (tokens)
def tokenize(text):
    return nltk.word_tokenize(text)

# Remove stopwords
def remove_stopwords(tokens):
    stopwords = nltk.corpus.stopwords.words("english")
    return [token for token in tokens if token not in stopwords]

# Reduce words to their root form, grouping similar words together
def stemming(tokens):
    stemmer = nltk.PorterStemmer()
    return [stemmer.stem(token) for token in tokens]


In [27]:
# Preprocessing data
def preprocess_text(text):
    text = lowercase(text)
    text = punctuation_removal(text)
    tokens = tokenize(text)
    tokens = remove_stopwords(tokens)
    tokens = stemming(tokens)
    return tokens

messages = [preprocess_text(message) for message in messages]

In [28]:
# Create dictionary
def create_dictionary(messages):
    dictionary = []

    for tokens in messages:
        for token in tokens:
            if token not in dictionary:
                dictionary.append(token)

    return dictionary

dictionary = create_dictionary(messages)

In [29]:
# Create fetures
def create_features(tokens, dictionary):
    features = np.zeros(len(dictionary))

    for token in tokens:
        if token in dictionary:
            features[dictionary.index(token)] += 1

    return features

X = np.array([create_features(tokens, dictionary) for tokens in messages])

In [30]:
# Preprocessing the labels
le = LabelEncoder()
y = le.fit_transform(labes)
print(f"Classes: {le.classes_}")
print(f"Encoded labels: {y}")

Classes: ['ham' 'spam']
Encoded labels: [0 0 1 ... 0 0 0]


In [31]:
# Split dataset into train/val/test
val_size = 0.2
test_size = 0.125
seed = 0
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=val_size, random_state=seed, shuffle=True)
X_train, X_test, y_train, y_test = train_test_split(X_val, y_val, test_size=test_size, random_state=seed, shuffle=True)

In [32]:
# Training model
model = GaussianNB()
print("Start training")

model.fit(X_train, y_train)
print("Training done")

Start training
Training done


In [33]:
# Validate the model
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)

val_accuracy = accuracy_score(y_val, y_val_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Validation accuracy: {val_accuracy}")
print(f"Test accuracy: {test_accuracy}")

Validation accuracy: 0.9623318385650225
Test accuracy: 0.8857142857142857


In [35]:
# Make a prediction
def predict(text, model, dictionary):
    processed_text = preprocess_text(text)
    features = create_features(text, dictionary)
    features = np.array(features).reshape(1, -1)
    prediction = model.predict(features)
    prediction_cls = le.inverse_transform(prediction)
    return prediction_cls

test_input = "You win a giveaway of 1000$"
prediction_cls = predict(test_input, model, dictionary)
print(f"Prediction: {prediction_cls}")

Prediction: ['spam']
