In [2]:
# !unzip -o 'Ham_Spam.zip'

Archive:  Ham_Spam.zip
  inflating: spam.csv                


### Import all the important libraries

In [1]:
from keras.layers import SimpleRNN, Embedding, Dense, LSTM, Dropout
from keras.models import Sequential
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Import the Data

In [2]:
data = pd.read_csv('spam.csv', encoding='latin-1')

In [3]:
texts = []
labels = []
for i, label in enumerate(data['v1']):
    texts.append(data['v2'][i])
    if label == 'ham':
        labels.append(0)
    else:
        labels.append(1)

In [4]:
texts = np.asarray(texts)
labels = np.asarray(labels)

In [5]:
max_features = 10000
maxlen = 50  #Truncate texts greater than length 50 & padding if less than 50

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=123)

In [7]:
print(X_train.shape)
print(X_test.shape)

print(y_train.shape)
print(y_test.shape)

(4457,)
(1115,)
(4457,)
(1115,)


In [8]:
X_train

array(['It took Mr owl 3 licks',
       "Well there's a pattern emerging of my friends telling me to drive up and come smoke with them and then telling me that I'm a weed fiend/make them smoke too much/impede their doing other things so you see how I'm hesitant",
       'Yes i thought so. Thanks.', ...,
       "Is it ok if I stay the night here? Xavier has a sleeping bag and I'm getting tired",
       'No need lar. Jus testing e phone card. Dunno network not gd i thk. Me waiting 4 my sis 2 finish bathing so i can bathe. Dun disturb u liao u cleaning ur room.',
       'I sent your maga that money yesterday oh.'], dtype='<U910')

In [9]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
sequences = tokenizer.texts_to_sequences(X_train)
sequences

[[14, 609, 800, 3875, 143, 2579],
 [125,
  413,
  4,
  3876,
  3877,
  15,
  11,
  245,
  941,
  10,
  2,
  576,
  42,
  7,
  57,
  577,
  38,
  192,
  7,
  56,
  941,
  10,
  21,
  31,
  4,
  1029,
  3878,
  134,
  192,
  577,
  128,
  126,
  3879,
  720,
  154,
  250,
  246,
  23,
  3,
  82,
  53,
  31,
  3880],
 [138, 1, 256, 23, 163],
 [178,
  13,
  97,
  144,
  179,
  4,
  578,
  639,
  863,
  142,
  20,
  414,
  1128,
  942,
  40,
  9,
  5,
  430,
  524,
  2,
  415,
  3,
  16,
  3881,
  943,
  337,
  3882,
  393],
 [1683, 79, 83, 1, 394, 352, 394, 442, 1, 395, 18, 322, 11, 120],
 [1129, 416, 377, 24, 49, 38, 3, 24, 282, 49, 38, 14],
 [12, 3883, 15, 3884, 38, 5, 15, 52, 21, 3885, 3, 32, 108, 1258, 17, 4, 3886],
 [3887, 40, 367, 29, 640, 47, 3888, 33, 864, 18, 57],
 [16,
  123,
  7,
  159,
  3,
  24,
  276,
  96,
  49,
  7,
  91,
  192,
  24,
  2,
  1684,
  10,
  55,
  40,
  49],
 [2580, 15, 2024],
 [72,
  18,
  34,
  1685,
  214,
  72,
  353,
  944,
  1686,
  193,
  945,
  18,
  3

In [10]:
tokenizer.word_index

{'i': 1,
 'to': 2,
 'you': 3,
 'a': 4,
 'the': 5,
 'u': 6,
 'and': 7,
 'in': 8,
 'is': 9,
 'me': 10,
 'my': 11,
 'for': 12,
 'your': 13,
 'it': 14,
 'of': 15,
 'call': 16,
 'have': 17,
 '2': 18,
 'now': 19,
 'on': 20,
 'that': 21,
 'are': 22,
 'so': 23,
 'not': 24,
 'can': 25,
 'but': 26,
 'or': 27,
 'do': 28,
 'at': 29,
 'if': 30,
 "i'm": 31,
 'just': 32,
 'ur': 33,
 'get': 34,
 'will': 35,
 'be': 36,
 'no': 37,
 'with': 38,
 'we': 39,
 'this': 40,
 '4': 41,
 'up': 42,
 'gt': 43,
 'lt': 44,
 'when': 45,
 'free': 46,
 'from': 47,
 'go': 48,
 'ok': 49,
 'know': 50,
 'what': 51,
 'all': 52,
 'how': 53,
 'out': 54,
 'like': 55,
 'then': 56,
 'come': 57,
 'got': 58,
 'good': 59,
 'its': 60,
 'was': 61,
 'time': 62,
 'am': 63,
 'only': 64,
 'send': 65,
 'day': 66,
 'there': 67,
 'love': 68,
 'text': 69,
 'he': 70,
 'txt': 71,
 'want': 72,
 'going': 73,
 'as': 74,
 'about': 75,
 'home': 76,
 'by': 77,
 'stop': 78,
 'sorry': 79,
 'one': 80,
 "i'll": 81,
 'see': 82,
 'lor': 83,
 'still': 84,
 

In [11]:
word_index = tokenizer.word_index
print("Found {0} unique words: ".format(len(word_index)))

Found 7908 unique words: 


### Padding the Sequences

In [12]:
X_train = pad_sequences(sequences, maxlen=maxlen)

In [13]:
X_train.shape

(4457, 50)

### Create layers

In [14]:
model = Sequential()
model.add(Embedding(max_features, 32))   #Word(Input) Embeddings - Group same contexual words, 32 neurons
model.add(SimpleRNN(64))  #64 neurons/chains
model.add(Dropout(0.2))
model.add(Dense(1, activation='softmax'))
model.summary()

#10000 * 32 neurons = 320000
#(32 * 64 neurons) + 64 bias = 6208 
#(64 * 1 neuron) + 1 bias = 65

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          320000    
                                                                 
 simple_rnn (SimpleRNN)      (None, 64)                6208      
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 326,273
Trainable params: 326,273
Non-trainable params: 0
_________________________________________________________________


### Compile the model

In [15]:
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

### Fit the model

In [16]:
hist_rnn = model.fit(X_train, y_train, batch_size=60, epochs=50, validation_split=0.2)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [17]:
model = Sequential()
model.add(Embedding(max_features, 32))   #Word(Input) Embeddings - Group same contexual words, 32 neurons
model.add(SimpleRNN(512))  #512 neurons/chains
model.add(Dropout(0.2))
model.add(Dense(1, activation='softmax'))
model.summary()

#10000 * 32 neurons = 320000
#(32 * 512 neurons) + 64 bias = 279040 
#(512 * 1 neuron) + 1 bias = 65

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 32)          320000    
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 512)               279040    
                                                                 
 dropout_1 (Dropout)         (None, 512)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 513       
                                                                 
Total params: 599,553
Trainable params: 599,553
Non-trainable params: 0
_________________________________________________________________


In [18]:
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

In [19]:
hist_rnn = model.fit(X_train, y_train, batch_size=60, epochs=50, validation_split=0.2)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [20]:
model = Sequential()
model.add(Embedding(max_features, 32))   #Word(Input) Embeddings - Group same contexual words, 32 neurons
model.add(SimpleRNN(512))  #512 neurons/chains
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.summary()

#10000 * 32 neurons = 320000
#(32 * 512 neurons) + 64 bias = 279040 
#(512 * 1 neuron) + 1 bias = 65

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 32)          320000    
                                                                 
 simple_rnn_2 (SimpleRNN)    (None, 512)               279040    
                                                                 
 dropout_2 (Dropout)         (None, 512)               0         
                                                                 
 dense_2 (Dense)             (None, 1)                 513       
                                                                 
Total params: 599,553
Trainable params: 599,553
Non-trainable params: 0
_________________________________________________________________


In [21]:
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

In [22]:
hist_rnn = model.fit(X_train, y_train, batch_size=60, epochs=50, validation_split=0.2)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [23]:
model = Sequential()
model.add(Embedding(max_features, 32))   #Word(Input) Embeddings - Group same contexual words, 32 neurons
model.add(SimpleRNN(512))  #512 neurons/chains
model.add(Dropout(0.4))
model.add(Dense(1, activation='sigmoid'))
model.summary()

#10000 * 32 neurons = 320000
#(32 * 512 neurons) + 64 bias = 279040 
#(512 * 1 neuron) + 1 bias = 65

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, None, 32)          320000    
                                                                 
 simple_rnn_3 (SimpleRNN)    (None, 512)               279040    
                                                                 
 dropout_3 (Dropout)         (None, 512)               0         
                                                                 
 dense_3 (Dense)             (None, 1)                 513       
                                                                 
Total params: 599,553
Trainable params: 599,553
Non-trainable params: 0
_________________________________________________________________


In [24]:
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

In [25]:
hist_rnn = model.fit(X_train, y_train, batch_size=60, epochs=50, validation_split=0.2)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [26]:
tokenizer.fit_on_texts(X_test)
sequences_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(sequences_test, maxlen=maxlen)

In [27]:
preds_rnn = model.predict(X_test)

In [28]:
preds_rnn

array([[1.2270044e-04],
       [9.9950528e-01],
       [1.4842620e-06],
       ...,
       [1.6630311e-08],
       [1.5487818e-09],
       [1.9976472e-09]], dtype=float32)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
acc_rnn = accuracy_score(y_test, preds_rnn)
confusion_rnn = confusion_matrix(y_test, preds_rnn)
clasrep_rnn = classification_report(y_test, preds_rnn)

print(acc_rnn)
print(confusion_rnn)
print(clasrep_rnn)