In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

#### Reading in the data

In [2]:
# Load the labelled message corpus from a CSV in the working directory and
# report its (rows, columns) shape as a quick sanity check.
text_message = pd.read_csv('SPAM_text_message.csv')
print('Shape: ',text_message.shape)

Shape:  (5572, 2)


In [3]:
# Preview the first rows to confirm the column layout (Category, Message).
text_message.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Class balance: how many messages fall into each category.
text_message['Category'].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

#### Breaking the Input data into Text and Label

In [5]:
# Pull the raw message strings and encode the category as a binary target
# (ham -> 0, anything else -> 1).
# Working on whole columns avoids looking up `Message` with a positional
# counter, which only lines up while the frame keeps its default 0..n-1 index
# (it would raise or pick the wrong row after any filtering or re-indexing).
texts = np.asarray(text_message['Message'].tolist())
labels = np.where(text_message['Category'].eq('ham').to_numpy(), 0, 1)

print("number of texts :" , len(texts))
print("number of labels: ", len(labels))

number of texts : 5572
number of labels:  5572


In [6]:
# Spot-check that the first few labels line up with their messages.
print(labels[0:5])
print('------')
print(texts[0:5])

[0 0 1 0 0]
------
['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'
 'Ok lar... Joking wif u oni...'
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
 'U dun say so early hor... U c already then say...'
 "Nah I don't think he goes to usf, he lives around here though"]


In [28]:
# Record the framework versions this notebook was run with.
# TensorFlow is imported under the alias `tf`, so the version must be read
# through that alias; the bare name `tensorflow` is never bound here and
# would raise NameError on a fresh kernel.
import tensorflow as tf
print(tf.__version__)
import keras
print(keras.__version__)

2.16.1
3.2.1


#### Pre-Processing

In [29]:
from keras.layers import SimpleRNN, Embedding, Dense
from keras.models import Sequential

from keras.preprocessing.sequence import pad_sequences

##### Converting into sequence of Tokens

In [30]:
# Build a vocabulary from every message, then encode each message as a list
# of integer word ids.
# NOTE(review): the tokenizer is fitted on the full corpus, before the
# train/test split further down, so test-set words are part of the vocabulary.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# word -> integer id mapping learned by the tokenizer
word_index = tokenizer.word_index
print("Found {0} unique words: ".format(len(word_index)))

Found 9004 unique words: 


##### Padding the sequences

In [31]:
# Vocabulary size handed to the Embedding layer. It is NOT applied to the
# tokenizer (which was created without a word limit), so it must stay larger
# than the highest word id the tokenizer produced.
max_features = 10000
# Fixed length of every encoded message (in tokens); shorter messages are
# padded, longer ones are truncated.
# NOTE(review): pad_sequences is called with its default padding/truncating
# modes — per the Keras docs both act on the start of the sequence; confirm.
maxlen = 500

data = pad_sequences(sequences, maxlen=maxlen)

print("data shape: ", data.shape)

data shape:  (5572, 500)


#### Creating the Train and Test Set

In [32]:
# Shuffle samples and labels with one seeded permutation so every row of
# `data` stays paired with its label.
# Caveats: `texts` is not reordered here, and re-running this cell applies the
# same permutation again to the already-shuffled arrays.
np.random.seed(42)
indices = np.random.permutation(data.shape[0])
data, labels = data[indices], labels[indices]

In [33]:
# 80% of the rows are used for fitting; the remaining 20% are held out.
total_samples = text_message.shape[0]
training_samples = int(total_samples * .8)
validation_samples = int(total_samples - training_samples)
# sanity check: the two parts must add up to the whole corpus
print(len(texts) == (training_samples + validation_samples))
print("Training: {0},   Validation {1} ".format(training_samples, validation_samples))

True
Training: 4457,   Validation 1115 


In [34]:
# The first `training_samples` rows form the training set; everything after
# that is the held-out test set. Both are slices of the shuffled arrays.
texts_train, texts_test = data[:training_samples], data[training_samples:]
y_train, y_test = labels[:training_samples], labels[training_samples:]

#### Building a SimpleRNN model

In [35]:
# Binary spam classifier: embedding -> single recurrent layer -> sigmoid unit.
model = Sequential([
    Embedding(max_features, 32),      # word ids -> 32-dimensional vectors
    SimpleRNN(32),                    # reads the sequence, emits one 32-d state
    Dense(1, activation='sigmoid'),   # spam probability
])

model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)

# `model_rnn` holds the value returned by `fit`, not a model.
# 20% of the training rows are set aside by Keras for per-epoch validation.
model_rnn = model.fit(
    texts_train,
    y_train,
    batch_size=60,
    epochs=10,
    validation_split=0.2,
)

Epoch 1/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 140ms/step - acc: 0.8398 - loss: 0.4001 - val_acc: 0.9619 - val_loss: 0.1336
Epoch 2/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 117ms/step - acc: 0.9802 - loss: 0.0937 - val_acc: 0.9832 - val_loss: 0.0592
Epoch 3/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 124ms/step - acc: 0.9870 - loss: 0.0523 - val_acc: 0.9832 - val_loss: 0.0579
Epoch 4/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 120ms/step - acc: 0.9927 - loss: 0.0293 - val_acc: 0.9888 - val_loss: 0.0402
Epoch 5/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 133ms/step - acc: 0.9942 - loss: 0.0224 - val_acc: 0.9821 - val_loss: 0.0612
Epoch 6/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 120ms/step - acc: 0.9948 - loss: 0.0224 - val_acc: 0.9854 - val_loss: 0.0531
Epoch 7/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 122ms/step 

#### Testing the model with the Test data set

In [40]:
# Score the held-out test set: raw predictions first, then the compiled
# loss and metric values as returned by `evaluate` (loss at index 0, accuracy at 1).
pred = model.predict(texts_test)
acc = model.evaluate(texts_test, y_test)
print("Test loss is {0:.2f} accuracy is {1:.2f}  ".format(*acc))

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 26ms/step
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - acc: 0.9824 - loss: 0.0601
Test loss is 0.07 accuracy is 0.98  
