#
<h6 style='background-color: tomato; font-family:newtimeroman; font-size:300%; text-align:center; border-radius: 15px 50px;'>SMS SPAM DETECTION USING TENSORFLOW </h6>

### IMPORTING LIBRARIES

In [1]:
import numpy as np # for scientific computing in python
import time
import pickle   # serializing and deserializing a Python object structure.

# importing for data or text_preprocessing
import tensorflow as tf
import tqdm #for processing the data
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint

# splitting the dataset
from sklearn.model_selection import train_test_split

#for creating the model in which by using LSTM
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.metrics import Recall, Precision


#for allocating the gpu
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    # only use GPU memory that we need, not allocate all the GPU memory
    tf.config.experimental.set_memory_growth(gpus[0], enable=True)


## Load Dataset

In [3]:
# load the data
X, y = load_data()

In [2]:
def load_data():
    """
    Loads SMS Spam Collection dataset
    """
    t, l = [], []
    with open("/content/SMSSpamCollection") as f:
        for line in f:
            split = line.split()
            l.append(split[0].strip())
            t.append(' '.join(split[1:]).strip())
    return t, l

In [4]:
print(X) #It prints the text data that includes spam and ham(legitimate) messages


In [None]:
print(y) #It prtints the corresponding X messages labels either spam or ham

## SOME PARAMETERS

In [7]:
SL = 100             # the length of all sequences (number of words per sample)
EMBEDDING_SIZE = 100  # Using 100-Dimensional GloVe embedding vectors (global vectors for word representation.)


BATCH_SIZE = 64
EPOCHS = 10      # number of epochs

# labelling the dataset as ham as 0 and spam as 1
label2int = {"ham": 0, "spam": 1}
int2label = {0: "ham", 1: "spam"}

## PREPARING DATASET

In [8]:
# Text tokenization
# vectorizing text, turning each text into sequence of integers


tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)

# lets dump it to a file, so we can use it in testing
pickle.dump(tokenizer, open("tokenizer.pickle", "wb"))


# convert to sequence of integers
X = tokenizer.texts_to_sequences(X)

In [None]:
print(X[0])

[49, 472, 4436, 843, 756, 659, 64, 8, 1328, 87, 123, 352, 1329, 148, 2996, 1330, 67, 58, 4437, 144]


In [None]:
print(X[2])

[47, 490, 8, 19, 4, 798, 902, 2, 176, 1942, 1106, 660, 1943, 2331, 261, 2332, 71, 1942, 2, 1944, 2, 338, 490, 556, 961, 73, 392, 174, 661, 393, 2997]


In [10]:
# convert to numpy arrays
#X = np.array(X)
y = np.array(y)


# pad sequences at the beginning of each sequence with 0's
# for example if SEQUENCE_LENGTH=4:
# [[5, 3, 2], [5, 1, 2, 3], [3, 4]]
# will be transformed to:
# [[0, 5, 3, 2], [5, 1, 2, 3], [0, 0, 3, 4]]

X = pad_sequences(X, maxlen=SL)

In [None]:
print(X[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0   49  472 4436  843
  756  659   64    8 1328   87  123  352 1329  148 2996 1330   67   58
 4437  144]


In [11]:
#print(X[4])

In [None]:
# One Hot encoding labels
# [spam, ham, spam, ham, ham] will be converted to:
# [1, 0, 1, 0, 1] and then to:
# [[0, 1], [1, 0], [0, 1], [1, 0], [0, 1]]

y = [ label2int[label] for label in y ]
y = to_categorical(y)


In [None]:
print(y)


[[1. 0.]
 [1. 0.]
 [0. 1.]
 ...
 [1. 0.]
 [1. 0.]
 [1. 0.]]


## Test train split

In [None]:
# split and shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=7)

# print our data shapes
print("X_train.shape:", X_train.shape)
print("X_test.shape:", X_test.shape)
print("y_train.shape:", y_train.shape)
print("y_test.shape:", y_test.shape)

X_train.shape: (4180, 100)
X_test.shape: (1394, 100)
y_train.shape: (4180, 2)
y_test.shape: (1394, 2)


## APPLYING A MODEL

In [None]:
def get_embedding_vectors(tokenizer, dim=100):
    embedding_index = {}
    with open(f"glove.6B.{dim}d.txt", encoding='utf8') as f:
        for line in tqdm.tqdm(f, "Reading GloVe"):
            values = line.split()
            word = values[0]
            vectors = np.asarray(values[1:], dtype='float32')
            embedding_index[word] = vectors

    word_index = tokenizer.word_index
    embedding_matrix = np.zeros((len(word_index)+1, dim))
    for word, i in word_index.items():
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            # words not found will be 0s
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

In [None]:
def get_model(tokenizer, lstm_units):
    """
    Constructs the model,
    Embedding vectors => LSTM => 2 output Fully-Connected neurons with softmax activation
    """
    # get the GloVe embedding vectors
    embedding_matrix = get_embedding_vectors(tokenizer)
    model = Sequential()
    model.add(Embedding(len(tokenizer.word_index)+1,
              EMBEDDING_SIZE,
              weights=[embedding_matrix],
              trainable=False,
              input_length=SL))

    model.add(LSTM(lstm_units, recurrent_dropout=0.2))
    model.add(Dropout(0.3))
    model.add(Dense(2, activation="softmax"))
    # compile as rmsprop optimizer
    # aswell as with recall metric
    model.compile(optimizer="rmsprop", loss="categorical_crossentropy",
                  metrics=["accuracy", tf.keras.metrics.PrecisionAtRecall(
    recall=0.5, num_thresholds=200, class_id=None, name=None, dtype=None
),tf.keras.metrics.PrecisionAtRecall(
    recall=0.5, num_thresholds=200, class_id=None, name=None, dtype=None
)])
    model.summary()
    return model

In [None]:
# constructs the model with 128 LSTM units
model = get_model(tokenizer=tokenizer, lstm_units=128)



Reading GloVe: 400000it [00:07, 50948.94it/s]


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 100)          901300    
                                                                 
 lstm_1 (LSTM)               (None, 128)               117248    
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 2)                 258       
                                                                 
Total params: 1,018,806
Trainable params: 117,506
Non-trainable params: 901,300
_________________________________________________________________


# Training a model

In [None]:
from sklearn import metrics
import time

In [None]:
# initialize our ModelCheckpoint and TensorBoard callbacks
# model checkpoint for saving best weights
model_checkpoint = ModelCheckpoint("results/spam_classifier_{val_loss:.2f}.h5", save_best_only=True,
                                    verbose=1)
# train the model
model.fit(X_train, y_train, validation_data=(X_test, y_test),
          batch_size=BATCH_SIZE, epochs=EPOCHS,
          callbacks=[tensorboard, model_checkpoint],
          verbose=1)

Epoch 1/10
Epoch 1: val_loss improved from inf to 0.17644, saving model to results\spam_classifier_0.18.h5
Epoch 2/10
Epoch 2: val_loss improved from 0.17644 to 0.08186, saving model to results\spam_classifier_0.08.h5
Epoch 3/10
Epoch 3: val_loss improved from 0.08186 to 0.06741, saving model to results\spam_classifier_0.07.h5
Epoch 4/10
Epoch 4: val_loss did not improve from 0.06741
Epoch 5/10
Epoch 5: val_loss improved from 0.06741 to 0.06737, saving model to results\spam_classifier_0.07.h5
Epoch 6/10
Epoch 6: val_loss did not improve from 0.06737
Epoch 7/10
Epoch 7: val_loss improved from 0.06737 to 0.06736, saving model to results\spam_classifier_0.07.h5
Epoch 8/10
Epoch 8: val_loss did not improve from 0.06736
Epoch 9/10
Epoch 9: val_loss did not improve from 0.06736
Epoch 10/10
Epoch 10: val_loss improved from 0.06736 to 0.06213, saving model to results\spam_classifier_0.06.h5


<keras.callbacks.History at 0x24916129ca0>

# Evaluating a model

In [None]:
# get the loss and metrics
result = model.evaluate(X_test, y_test)

# extract those
loss = result[0]
accuracy = result[1]
precision = result[2]
recall = result[3]

print(f"[+] Accuracy: {accuracy*100:.2f}%")
print(f"[+] Precision:   {precision*100:.2f}%")
print(f"[+] Recall:   {recall*100:.2f}%")

[+] Accuracy: 98.35%
[+] Precision:   99.59%
[+] Recall:   99.59%


# Testing the model

In [None]:
def get_predictions(text):
    sequence = tokenizer.texts_to_sequences([text])
    # pad the sequence
    sequence = pad_sequences(sequence, maxlen=SL)
    # get the prediction
    prediction = model.predict(sequence)[0]
    # either 0 or 1
    # one-hot encoded vector, revert using np.argmax
    return int2label[np.argmax(prediction)]

In [None]:
text = "You won a prize of 1,000$, click here to claim!"
get_predictions(text)



'spam'

In [None]:
text = "Hi man, I was wondering if we can meet tomorrow."
print(get_predictions(text))

ham


In [None]:
text=input()
get_predictions(text)

You won a prize of 1,000$, click here to claim!


'spam'