In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer

from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [4]:
# Load the dataset
data = pd.read_csv("./../Dataset/newData.csv")

# Split the dataset into training and testing sets
# Drop rows with missing text values
data = data.dropna(subset=['Transformer_text'])

# Split the dataset into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Tokenize and pad the text data
max_words = 10000
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(train_data['Transformer_text'])

# Convert text data to sequences and pad them
train_sequences = tokenizer.texts_to_sequences(train_data['Transformer_text'])
test_sequences = tokenizer.texts_to_sequences(test_data['Transformer_text'])

max_length = 100
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post', truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post', truncating='post')

# Encode labels
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_data['Label'])

test_labels = label_encoder.transform(test_data['Label'])


In [5]:
embedding_dim = 16  # Adjust based on your preference
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_length))
model.add(LSTM(64))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [6]:
epochs = 10  # Adjust based on your training preference
model.fit(train_padded, train_labels, epochs=epochs, validation_data=(test_padded, test_labels))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1bc814a6150>

In [7]:
loss, accuracy = model.evaluate(test_padded, test_labels)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

Test Loss: 0.3565
Test Accuracy: 0.8858


In [10]:
from tensorflow.keras.layers import Bidirectional

model2 = Sequential()
model2.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_length))
model2.add(Bidirectional(LSTM(64, return_sequences=True)))
model2.add(Bidirectional(LSTM(64)))
model2.add(Dense(1, activation='sigmoid'))


In [15]:
# Compile the model with Adam optimizer
from tensorflow.keras.optimizers import Adam
model2.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
epochs = 5
model2.fit(train_padded, train_labels, epochs=epochs, validation_data=(test_padded, test_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x1bc94639210>

In [16]:
# Evaluate the model
loss, accuracy = model2.evaluate(test_padded, test_labels)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

Test Loss: 0.1413
Test Accuracy: 0.9797


**STREAMLIT GUI**

In [None]:
import streamlit as st
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# # Load the trained model
# model = tf.keras.models.load_model('path/to/your/trained/model')

# Load the tokenizer
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(['your', 'list', 'of', 'common', 'words'])

# Define the maximum sequence length (adjust based on your model)
max_length = 100

# Streamlit App
def main():
    st.title("SPAM vs HAM Email Classification")

    # User input
    user_input = st.text_area("Enter the email text:")

    if st.button("Predict"):
        # Tokenize and pad the input text
        input_sequence = tokenizer.texts_to_sequences([user_input])
        padded_input = pad_sequences(input_sequence, maxlen=max_length, padding='post', truncating='post')

        # Make the prediction
        prediction = model2.predict(padded_input)

        # Display the result
        if prediction[0][0] > 0.5:
            st.success("Prediction: HAM (Legitimate Email)")
        else:
            st.error("Prediction: SPAM")

if __name__ == '__main__':
    main()
