# Preface

This notebook is inspired by [this Medium post](https://towardsdatascience.com/nlp-detecting-spam-messages-with-tensorflow-b12195b8cf0e).
Dataset: [Spam Text Message Classification](https://www.kaggle.com/datasets/team-ai/spam-text-message-classification).

The spam dataset contains 2 text columns:
- Category: "ham" (87%) and "spam" (13%)
- Message

**Objective**: train a neural network model (embedding + dense layers) to detect whether a message is spam or ham

In [None]:
# read .csv
!pip install pandas
# text preprocessing with Tokenizer
# https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text/Tokenizer
!pip install keras

In [4]:
# Show the installed keras version. Reading the attribute off the module
# avoids binding a bare global `__version__` that another cell can overwrite.
import keras
keras.__version__

'2.12.0'

In [6]:
# Show the installed pandas version. Reading the attribute off the module
# avoids binding a bare global `__version__` that another cell can overwrite.
import pandas as pd
pd.__version__

'2.0.0'

# Imports

In [1]:
import numpy as np
import pandas as pd

# Text preprocessing
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

# Model
from api.core.generic import Model
from api.core.layers import Embedding, Flatten, Dense
from api.core.optimizers import GradientDescent, Adam

# Data
from api.core.preprocessing.samplers import train_test_split

# Configuration

In [2]:
# Vocabulary and sequence geometry shared by the tokenizer and the model.
vocab_size = 1000     # passed to Tokenizer(num_words=...) and to the Embedding layer
embedding_dim = 16    # second argument of the Embedding layer
max_length = 100      # every message is padded/truncated to this many tokens

# pad_sequences behaviour: both padding and truncation happen at the end.
padding_type = 'post'
trunc_type = 'post'

# Placeholder token handed to the tokenizer for out-of-vocabulary words.
oov_tok = "<OOV>"

# Read and preprocess data

In [3]:
# Load the raw corpus; the 'Category' and 'Message' columns are read below.
dataset = pd.read_csv('data/spam.csv')

# Integer-encode the string categories via pandas' categorical codes.
# (Apply .replace(0, -1) to the codes when training with Hinge loss.)
labels = (
    dataset['Category']
    .astype("category")
    .cat.codes
    .values
)

# Raw message texts as a NumPy array.
sentences = dataset['Message'].to_numpy()

dataset.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Print column dtypes and non-null counts to check for missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [5]:
# Split messages and labels with the project's sampler (split=0.8).
x_train, x_test, y_train, y_test = train_test_split(
    sentences,
    labels,
    split=0.8,
)

# Display the shape of each partition as a sanity check.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((4457,), (1115,), (4457,), (1115,))

In [6]:
def _encode_texts(fitted_tokenizer, texts):
    """Convert raw messages to padded token-id arrays of shape (n, 1, max_length).

    Each message is mapped to token ids by ``fitted_tokenizer``, padded or
    truncated to ``max_length`` using ``padding_type`` / ``trunc_type``, and
    given an extra axis at position 1 to match the model's
    ``input_shape=(1, max_length)``.
    """
    sequences = fitted_tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(
        sequences,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )
    return np.expand_dims(padded, 1)


def _as_targets(codes):
    """Append two singleton axes so the labels have shape (n, 1, 1)."""
    return np.expand_dims(codes, [1, 2])


# NOTE: this cell rebinds x_train / x_test from raw text to integer arrays,
# so re-run the train/test split cell before re-running this one.

# Fit the vocabulary on the training messages only; the test messages are
# encoded with the same tokenizer.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(x_train)

word_index = tokenizer.word_index

x_train = _encode_texts(tokenizer, x_train)
x_test = _encode_texts(tokenizer, x_test)
y_train = _as_targets(y_train)
y_test = _as_targets(y_test)

x_train.shape, x_test.shape, y_train.shape, y_test.shape

((4457, 1, 100), (1115, 1, 100), (4457, 1, 1), (1115, 1, 1))

# Model

In [29]:
# Build, compile and train the classifier with the project's own `api.core` framework.
# The input shape matches one sample of x_train, i.e. (1, max_length).
model = Model(input_shape=(1, max_length))

# Layer stack: embedding -> flatten -> 2-unit sigmoid layer -> 1-unit sigmoid output.
model.add(Embedding(vocab_size, embedding_dim, max_length, weight_initializer='xavier_uniform'))
model.add(Flatten())
model.add(Dense(2, activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))

# Adam optimizer with binary cross-entropy loss.
# NOTE(review): 0.001 is presumably the learning rate and clipvalue a gradient
# clipping bound; confirm against api.core.optimizers.Adam.
model.compile(Adam(0.001, clipvalue=10), 'binary_cross_entropy')

# Train for 15 epochs in batches of 200.
# NOTE(review): validation_split=0.2 presumably holds out 20% of the training
# data for validation; confirm against Model.fit.
model.fit(x_train, y_train, validation_split=0.2, batch_size=200, epochs=15)

  0%|          | 0/15 [00:00<?, ?it/s]

In [30]:
# Run the trained model on the test split and show the shape of its output.
# NOTE(review): y_test is passed to predict as well; its role is not visible
# here, so confirm against Model.predict.
test_predictions = model.predict(x_test, y_test)
test_predictions.shape

  0%|          | 0/1 [00:00<?, ?it/s]

(1115, 1, 1)