In [1]:
import pandas as pd
import seaborn as sns
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, SimpleRNN, Embedding, LSTM, GRU, Dropout
from tensorflow.keras.models import Sequential
from sklearn.metrics import confusion_matrix
from utility import plot_model_history, execute_grid_search
from keras.api.callbacks import EarlyStopping

# Configuration parameters
# Vocabulary limit: handed to the Tokenizer and reused as the Embedding input_dim in the model builders.
num_words = 3000  # only the most frequent words, bounded by this limit, are considered


## Data preparation
- Import the data and examine it
- Remove inconsistencies and irrelevant information
- Correct inconsistent formatting

In [2]:
# Read the raw email dataset from the CSV file into a DataFrame.
email_datasets = pd.read_csv(filepath_or_buffer='test_data.csv')

# Keep the summary statistics as the cell's final expression so the notebook renders them.
email_datasets.describe()

In [3]:
# Quick sanity check: the first rows, followed by the count of missing values per column.
print(email_datasets.head(), email_datasets.isna().sum(), sep='\n')

In [4]:
# Rename Category -> category and Message -> message so column names are consistently lower-case.
# An explicit mapping is used instead of positional assignment: assigning a list to
# `.columns` would silently swap the labels if the CSV column order ever changed.
email_datasets = email_datasets.rename(columns={'Category': 'category', 'Message': 'message'})

# Fail fast (and with a readable message) if the file does not have the expected schema.
assert list(email_datasets.columns) == ['category', 'message'], (
    f"unexpected columns: {list(email_datasets.columns)}"
)

print(email_datasets.head())

In [5]:
# Encode the labels numerically (spam -> 1, anything else -> 0) and lower-case the
# message text so preprocessing treats differently-cased words the same way.

# Accepting both 'spam' and an already-encoded 1 keeps this cell idempotent:
# re-running it no longer collapses every label to 0.
email_datasets.category = email_datasets.category.map(lambda label: 1 if label in ('spam', 1) else 0)

# Vectorised string method instead of a Python-level lambda; missing values stay NaN
# here rather than raising inside the lambda.
email_datasets.message = email_datasets.message.str.lower()
print(email_datasets.head())

In [6]:
# Tokenisation: turn every message into a list of integer word indices,
# the numerical format the model consumes.
# NOTE(review): the tokenizer is fitted on the full dataset, i.e. before the
# train/test split performed later — confirm this is intended.

# Vocabulary is capped by num_words.
tokenizer = Tokenizer(num_words=num_words)

# Build the internal word index from the message corpus.
tokenizer.fit_on_texts(email_datasets['message'])

# Encode each message as its sequence of word indices.
dataset_sequence = tokenizer.texts_to_sequences(email_datasets['message'])


In [7]:
# Look at how long the tokenised messages are in order to pick one common input length;
# sequences shorter than the chosen length are padded up to it in the next step.
sequence_lengths = list(map(len, dataset_sequence))

sns.histplot(sequence_lengths, label='Message')

# The original note here read: most sequence lengths fall within 10, so sequences shorter than 10 get padded.
# NOTE(review): the padding step that follows uses maxlen=20, not 10 — confirm the intended length against this histogram.

In [8]:
# Bring every sequence to exactly 20 tokens. The library defaults are spelled out:
# shorter sequences are zero-padded at the front, longer ones lose their leading tokens.
padded_sequences = pad_sequences(dataset_sequence, maxlen=20, padding='pre', truncating='pre')


Training

The data is now in the right format, so we can split it into training and testing sets. Because we will be using RNNs, we will also create a function that tunes the model so that we can pick the best selection of hyperparameters. This is expensive in terms of time, but it gives us the combination of hyperparameters that leads to the best model for this problem.

In [9]:
# Split the padded sequences and their labels into training and testing sets (80:20).
# - random_state pins the shuffle so a Restart & Run All reproduces the same split;
#   change the seed explicitly when results are to be averaged over several different splits.
# - stratify keeps the spam / not-spam proportion the same in both partitions.
x_train, x_test, y_train, y_test = train_test_split(
    padded_sequences,
    email_datasets.category,
    test_size=0.2,
    random_state=42,
    stratify=email_datasets.category,
)

## Base Model selection

In [10]:
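# choosing combination of hyperparameters
# explanation of parameters
#
# num of words        - vocabulary size (num_words); also the Embedding input_dim
# output_dim          - length of the embedding vector learned for each token
# epochs              - number of passes over the training data
# batch_size          - number of samples processed per gradient update
# validation_split    - fraction of the training data held out for validation during fit
# activation_function - activation applied in the Dense layers (e.g. relu, sigmoid)
# optimizer           - optimisation algorithm passed to compile (e.g. rmsprop)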

# Recorded results as LaTeX table rows (presumably from earlier tuning runs):
# embedding\_dim & rnn\_units & rnn\_type & optimizer & activation & accuracy & loss & precision & recall \\
# 100 & 64 & SimpleRNN & rmsprop & relu & 0.987444 & 0.061306 & 0.992701 & 0.912752 \\
# 50 & 64 & LSTM & rmsprop & relu & 0.982960 & 0.080875 & 0.949367 & 0.931677 \\
# 50 & 32 & LSTM & rmsprop & relu & 0.989238 & 0.052878 & 0.987179 & 0.939024 \\

def lstm_model():
    """Build an uncompiled LSTM binary classifier.

    Layer stack: Embedding(num_words -> 50) -> LSTM(32) -> Dense(128, relu)
    -> Dropout(0.5) -> Dense(64, relu) -> Dense(1, sigmoid).

    Returns:
        The assembled Sequential model (not yet compiled).
    """
    layers = [
        Embedding(input_dim=num_words, output_dim=50),
        LSTM(32),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(64, activation='relu'),
        # single sigmoid unit: one probability-like output per input sequence
        Dense(1, activation='sigmoid'),
    ]
    return Sequential(layers)


def gru_model():
    """Build an uncompiled GRU binary classifier.

    Layer stack: Embedding(num_words -> 128) -> GRU(128)
    -> Dense(64, sigmoid) -> Dense(1, sigmoid).

    Returns:
        The assembled Sequential model (not yet compiled).
    """
    layers = [
        Embedding(input_dim=num_words, output_dim=128),
        GRU(128),
        Dense(64, activation='sigmoid'),
        # single sigmoid unit: one probability-like output per input sequence
        Dense(1, activation='sigmoid'),
    ]
    return Sequential(layers)



def simple_rnn_model():
    """Build an uncompiled SimpleRNN binary classifier.

    Layer stack: Embedding(num_words -> 50) -> SimpleRNN(32)
    -> Dense(64, relu) -> Dense(1, sigmoid).

    Returns:
        The assembled Sequential model (not yet compiled).
    """
    layers = [
        Embedding(input_dim=num_words, output_dim=50),
        SimpleRNN(32),
        Dense(64, activation='relu'),
        # single sigmoid unit: one probability-like output per input sequence
        Dense(1, activation='sigmoid'),
    ]
    return Sequential(layers)

# Build the GRU baseline and compile it for binary classification,
# tracking accuracy, precision and recall alongside the loss.
model = gru_model()
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy', 'precision', 'recall'])
# Display the model summary
model.summary()

# Stop when the validation loss has not improved for 3 epochs, and restore the weights
# of the best epoch so the evaluation that follows does not score a later, worse epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model; 20% of the training data is held out for validation each epoch.
history = model.fit(
    x_train,
    y_train,
    epochs=5,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping],
)



In [11]:
# Score the trained model on the held-out test split. evaluate() returns the loss
# followed by the metrics in the order they were given to compile().
test_scores = model.evaluate(x_test, y_test)
loss, accuracy, precision, recall = test_scores
print(f'Loss: {loss}, Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}')

In [12]:
# Visualise the training run captured in `history` (the object returned by model.fit).
# NOTE(review): plot_model_history comes from the local `utility` module; which curves it draws is not visible here.
plot_model_history(history)

In [13]:
# Two hand-written messages for a quick sanity check of the trained model.
new_emails = [
    "Congratulations! You've won a $1000 Walmart gift card. Go to http://bit.ly/123456 to claim now.",
    "Hey, are we still on for the meeting tomorrow?"
]

# Encode with the already-fitted tokenizer, then pad to the same length (20) used for the training data.
new_sequences = tokenizer.texts_to_sequences(new_emails)
new_padded = pad_sequences(new_sequences, maxlen=20)

# One sigmoid output per message.
predictions_prob = model.predict(new_padded)

# Flatten to one value per message and threshold at 0.5 to obtain the binary classes.
predictions = (predictions_prob.ravel() > 0.5).astype(int).tolist()

# Report each message next to its predicted class.
for email, prediction in zip(new_emails, predictions):
    print(f"Email: {email}")
    print(f"Prediction: {'Spam' if prediction == 1 else 'Not Spam'}\n")

In [14]:
# TODO: plot the confusion matrix


## Establishing benchmarks & comparing models

- Because the test and training data are selected at random, results vary from run to run, so we use the average of 5 runs to determine the benchmark for each model.

In [15]:

# Define the parameter grid for hyperparameter tuning.
# Each key maps one hyperparameter to the list of candidate values to try.
# If every combination is evaluated, that is 3 * 3 * 1 * 2 * 2 * 2 * 2 = 144 configurations.
# NOTE(review): presumably consumed by execute_grid_search (imported from the local
# `utility` module); how it interprets each key is not visible here — confirm.
param_grid = {
    'embedding_dim': [50, 100, 200],
    'rnn_units': [32, 64, 128],
    'rnn_type': ['GRU'],
    'dropout': [0.2, 0.4],
    'optimizer': ['rmsprop', 'sgd'],
    'batch_size': [32, 64],
    'activation': ['relu', 'tanh'],
}

# The grid search call below is currently disabled (commented out).
# NOTE(review): the result is named lstm_combinations although rnn_type above is 'GRU' — confirm the intended name.
# lstm_combinations = execute_grid_search(
#     param_grid, x_train, y_train, x_test, y_test, num_words
# )


In [16]:

# print(lstm_combinations['table'].T)

# print(lstm_combinations['best_accuracy'], lstm_combinations['best_model_accuracy'])

In [17]:
# results = lstm_combinations['table']

# results['accuracy'] = results['accuracy'].apply(lambda x: x[0] if isinstance(x, list) else x)


# print(results[results['accuracy'] > 0.99])

# filtered_df = results[results['accuracy'] > 0.987]