In [10]:
# Define a list of predefined rules for classifying emails
spam_keywords = ['free', 'soon', 'discount', 'limited time', 'urgent', 'act now']
spam_threshold = 3  # Minimum number of spam keywords required for an email to be classified as spam

# Function to classify an email as "ham" or "spam" based on predefined rules
def classify_email(email_content):
    # Convert email content to lowercase for case-insensitive matching
    email_content = email_content.lower()
    
    # Check for the presence of spam keywords in the email content
    spam_count = sum(keyword in email_content for keyword in spam_keywords)
    
    # Apply the classification rule
    if spam_count >= spam_threshold:
        return 'spam'
    else:
        return 'ham'

# Sample data - emails to be classified
emails = [
    "Dear customer, congratulations! You've won a free trip to an exotic destination. it is urgent to respond soon",
    "Hello, I hope you're doing well. Let's catch up soon.",
    "Limited time offer: Get a 50% discount on our latest products! act now!",
    "Reminder: Your payment is due by tomorrow. Please urgent limited time, act now.",
    "Hi, just wanted to share an interesting article I came across.",
]

# Classify each email using the classify_email function
for email in emails:
    classification = classify_email(email)
    print(f"Email: '{email}' is classified as '{classification}'.")


Email: 'Dear customer, congratulations! You've won a free trip to an exotic destination. it is urgent to respond soon' is classified as 'spam'.
Email: 'Hello, I hope you're doing well. Let's catch up soon.' is classified as 'ham'.
Email: 'Limited time offer: Get a 50% discount on our latest products! act now!' is classified as 'spam'.
Email: 'Reminder: Your payment is due by tomorrow. Please urgent limited time, act now.' is classified as 'spam'.
Email: 'Hi, just wanted to share an interesting article I came across.' is classified as 'ham'.


In [11]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Sample data - emails and their corresponding labels
emails = [
    ("Dear customer, congratulations! You've won a free trip to an exotic destination. it is urgent to respond soon.", "spam"),
    ("Hello, I hope you're doing well. Let's catch up soon.", "ham"),
    ("Limited time offer: Get a 50% discount on our latest products! act now!", "spam"),
    ("Reminder: Your payment is due by tomorrow. Please urgent limited time, act now.", "spam"),
    ("Hi, just wanted to share an interesting article I came across.", "ham"),
]

# Convert the sample data to a pandas DataFrame
df = pd.DataFrame(emails, columns=["email", "label"])

# Extract features from the email text using CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df["email"])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, df["label"], test_size=0.2, random_state=42)

# Train a Naive Bayes classifier on the training data
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Classify the test emails using the trained classifier
y_pred = classifier.predict(X)

# Print the predicted labels for the test emails
for i, email in enumerate(df["email"]):
    print(f"Email: '{email}' is classified as '{y_pred[i]}'.")



Email: 'Dear customer, congratulations! You've won a free trip to an exotic destination. it is urgent to respond soon.' is classified as 'spam'.
Email: 'Hello, I hope you're doing well. Let's catch up soon.' is classified as 'ham'.
Email: 'Limited time offer: Get a 50% discount on our latest products! act now!' is classified as 'spam'.
Email: 'Reminder: Your payment is due by tomorrow. Please urgent limited time, act now.' is classified as 'spam'.
Email: 'Hi, just wanted to share an interesting article I came across.' is classified as 'ham'.


In [49]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional

# Sample data - emails and their corresponding labels
emails = [
    ("Dear customer, congratulations! You've won a free trip to an exotic destination. it is urgent to respond soon.", "spam"),
    ("Hello, I hope you're doing well. Let's catch up soon.", "ham"),
    ("Limited time offer: Get a 50% discount on our latest products! act now!", "spam"),
    ("Reminder: Your payment is due by tomorrow. Please urgent limited time, act now.", "spam"),
    ("Hi, just wanted to share an interesting article I came across.", "ham"),
]

# Convert the sample data to a pandas DataFrame
df = pd.DataFrame(emails, columns=["email", "label"])

# Preprocess the data
X = df["email"].values
y = df["label"].values

# Encode the labels
le = LabelEncoder()
y = le.fit_transform(y)

# Split the data into training and testing sets
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0, random_state=42)
X_train = X
y_train = y

# Tokenize and pad the email text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
#X_test = tokenizer.texts_to_sequences(X_test)
max_sequence_length = max(len(seq) for seq in X_train)
X_train = pad_sequences(X_train, maxlen=max_sequence_length)
#X_test = pad_sequences(X_test, maxlen=max_sequence_length)

# Build the RNN model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=64, input_length=max_sequence_length))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(1, activation="sigmoid"))

# Compile and train the model
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(X_train, y_train, validation_data=(X_train, y_train), epochs=20, batch_size=16)

Epoch 1/20
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch

<keras.callbacks.History at 0x20a53dd2d00>

In [53]:
import numpy as np
for i, email in enumerate(X_train):
    predicted_result= model.predict(np.expand_dims(email, axis=0))
    print(f'Email: {X[i]}  is classified as {"ham" if predicted_result < 0.5 else "spam"}.')

Email: Dear customer, congratulations! You've won a free trip to an exotic destination. it is urgent to respond soon.  is classified as spam.
Email: Hello, I hope you're doing well. Let's catch up soon.  is classified as ham.
Email: Limited time offer: Get a 50% discount on our latest products! act now!  is classified as spam.
Email: Reminder: Your payment is due by tomorrow. Please urgent limited time, act now.  is classified as spam.
Email: Hi, just wanted to share an interesting article I came across.  is classified as ham.


In [52]:
# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_train, y_train)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

Test Loss: 0.0108
Test Accuracy: 1.0000
