In [2]:
from keras.layers import SimpleRNN, LSTM, GRU, Embedding, Dense, Flatten
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.callbacks import TensorBoard
from keras.utils import plot_model

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import numpy as np

In [6]:
def main(rnn_model):
    """Train and evaluate a recurrent spam/ham classifier on ``spam.csv``.

    The function reads the ``v1`` (label) and ``v2`` (text) columns of
    ``spam.csv``, tokenizes the messages, trains an
    Embedding -> recurrent layer -> Dense(sigmoid) model, prints the test
    loss/accuracy and finally classifies a few hand-written messages.

    Parameters
    ----------
    rnn_model : str
        Name of the recurrent layer to use: ``'SimpleRNN'``, ``'LSTM'`` or
        ``'GRU'``.

    Raises
    ------
    ValueError
        If ``rnn_model`` is not one of the supported layer names. (Without
        this check an unknown name would build a model with no recurrent
        layer at all.)
    """
    rnn_layers = {'SimpleRNN': SimpleRNN, 'LSTM': LSTM, 'GRU': GRU}
    if rnn_model not in rnn_layers:
        raise ValueError(
            "Unknown rnn_model {!r}; expected one of {}".format(rnn_model, sorted(rnn_layers))
        )

    max_vocab = 10000            # tokenizer keeps only the most frequent words
    max_len = 500                # every message is padded/truncated to this length
    embedding_mat_columns = 50   # embedding width, also used as the RNN unit count

    def message_to_array(msg):
        """Encode one raw message as a ``(1, max_len)`` integer array.

        The fitted tokenizer is used so that the text receives the same
        lower-casing/filtering as the training data, unknown words are skipped
        instead of raising ``KeyError``, and indices stay below ``max_vocab``
        (the Embedding layer's ``input_dim``).
        """
        return pad_sequences(tokenizer.texts_to_sequences([msg]), maxlen=max_len)

    def predict_classes(padded):
        """Return 0 (ham) / 1 (spam) for each row of ``padded``.

        The network ends in a single sigmoid unit, so ``predict`` yields one
        probability per sample. ``argmax`` over that length-1 axis is always 0;
        the probability has to be thresholded instead.
        """
        probabilities = model.predict(padded)
        return (probabilities > 0.5).astype(int).ravel()

    sms = pd.read_csv("spam.csv", encoding="ISO-8859-1", usecols=['v1', 'v2'])
    sms = sms.rename(columns={'v1': 'label', 'v2': 'text'})
    sms.info()

    # Look the counts up by label: positional access on a label-indexed Series
    # depends on frequency order and is deprecated in pandas.
    label_counts = sms['label'].value_counts()
    print('The number of ham messages in the dataset is {}'.format(label_counts.get('ham', 0)))
    print('The number of spam messages in the dataset is {}'.format(label_counts.get('spam', 0)))

    messages = sms['text'].astype(str).tolist()
    # 'ham' -> 0, anything else -> 1
    labels = (sms['label'] != 'ham').astype(int).to_numpy()

    print("Number of messages: ", len(messages))
    print("Number of labels: ", len(labels))

    # Ignore all words except the most common `max_vocab` words
    tokenizer = Tokenizer(num_words=max_vocab)
    # Calculate the frequency of words
    tokenizer.fit_on_texts(messages)
    # Convert the messages to sequences of integers, then to a 2D array
    # (truncated if longer than max_len, padded if shorter)
    padded_messages = pad_sequences(tokenizer.texts_to_sequences(messages), maxlen=max_len)

    print("data shape: ", padded_messages.shape)

    # We will use 70% of data for training & validation (70% train, 30% validation)
    # and the remaining 30% for testing
    train_samples = int(len(messages) * 0.7)

    messages_train = padded_messages[:train_samples]
    labels_train = labels[:train_samples]

    # Keep every remaining sample for testing (no rows are dropped at the end).
    messages_test = padded_messages[train_samples:]
    labels_test = labels[train_samples:]

    # Construct the model
    model = Sequential()
    ## Add embedding layer to convert integer encoding to word embeddings (the model learns the
    ## embedding matrix during training), embedding matrix has max_vocab as no. of rows and chosen
    ## no. of columns
    model.add(Embedding(input_dim=max_vocab, output_dim=embedding_mat_columns, input_length=max_len))
    model.add(rnn_layers[rnn_model](units=embedding_mat_columns))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
    model.summary()

    # Training the model
    model.fit(messages_train, labels_train, epochs=8, batch_size=500, validation_split=0.3)

    # Testing the model
    acc = model.evaluate(messages_test, labels_test)
    print("Test loss is {0:.2f}, Accuracy is {1:.2f}  ".format(acc[0], acc[1]))

    # Constructing a custom message to check model
    custom_msg = 'Congratulations ur awarded 500 of CD vouchers or 125gift guaranteed Free entry for movies'
    print(predict_classes(message_to_array(custom_msg)))

    demo_messages = (
        'Hi Paul, would you come around tonight',
        'Free SMS service for anyone',
    )
    for number, text in enumerate(demo_messages, start=1):
        sms_test = [text]
        print("Test sms {}: ".format(number), sms_test)
        # Same (default, pre-) padding as the training data.
        predicted = predict_classes(message_to_array(text))
        if predicted[0] == 0:
            print("Ham")
        else:
            print("Spam")

In [7]:
if __name__ == '__main__':
    # Entry point: build, train and evaluate the SimpleRNN variant.
    # Pass 'LSTM' instead to exercise the LSTM branch handled by main().
    main(rnn_model='SimpleRNN')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5572 non-null   object
 1   text    5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
The number of ham messages in the dataset is 4825
The number of spam messages in the dataset is 747
Number of messages:  5572
Number of labels:  5572
data shape:  (5572, 500)
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 500, 50)           500000    
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 50)                5050      
                                                                 
 dense_1 (Dense)             (None, 1)                 51        
                                                  