In [14]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Load the CSV data
data = pd.read_csv('spam_ham_dataset.csv')

# Extract messages and labels
messages = data['text'].tolist()
labels = data['label_num'].tolist()  # 0 for ham (not spam), 1 for spam

# Tokenize and preprocess the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(messages)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(messages)
padded_sequences = pad_sequences(sequences, padding='post')

# Split the data into training and validation sets
split_ratio = 0.8  # 80% for training, 20% for validation
split_index = int(len(messages) * split_ratio)

train_sequences = padded_sequences[:split_index]
train_labels = np.array(labels[:split_index])  # Convert labels to NumPy array

val_sequences = padded_sequences[split_index:]
val_labels = np.array(labels[split_index:])  # Convert labels to NumPy array

# Build the model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(word_index) + 1, output_dim=16, input_length=padded_sequences.shape[1]),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(32, activation='relu'),  # Adding an additional dense layer
    tf.keras.layers.Dropout(0.2),  # Adding a dropout layer for regularization
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])


# Define callbacks
callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3),  # Stop training if validation loss stops improving
    tf.keras.callbacks.ModelCheckpoint(filepath='best_model.h5', save_best_only=True),  # Save the best model checkpoint
    tf.keras.callbacks.TensorBoard(log_dir='./logs')  # Log data for TensorBoard visualization
]

# Train the model with callbacks
num_epochs = 100

# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(
    train_sequences, train_labels,
    epochs=num_epochs,
    validation_data=(val_sequences, val_labels),
    callbacks=callbacks
)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100


<keras.callbacks.History at 0x1cdca36ff10>

In [15]:
# Evaluate the model on the validation data
evaluation = model.evaluate(val_sequences, val_labels)

# Print the evaluation results
print("Validation Loss:", evaluation[0])
print("Validation Accuracy:", evaluation[1])

Validation Loss: 0.08127790689468384
Validation Accuracy: 0.9758453965187073


In [21]:
from dotenv import load_dotenv
import os
import imaplib
import email
from email.header import decode_header
from datetime import datetime, timedelta

load_dotenv()

username = os.environ.get("EMAIL")
password = os.environ.get("PASSWORD")

# Connect to Gmail IMAP server
mail = imaplib.IMAP4_SSL("imap.gmail.com")
mail.login(username, password)

mail.select("inbox")

target_date = datetime(2023, 8, 20)
since_date = target_date - timedelta(days=1)
imap_since_date = since_date.strftime("%d-%b-%Y").upper()

search_criteria = f'SINCE "{imap_since_date}"'

# Fetch the UIDs of emails matching the search criteria
status, email_data = mail.uid("search", None, search_criteria)
uids = email_data[0].split()
emails = []

# Loop through email IDs and retrieve email content
for i in range(len(uids)):
    uid = uids[i]  # Get the current email UID
    status, msg_data = mail.uid("fetch", uid, "(RFC822)")
    msg = email.message_from_bytes(msg_data[0][1])
    
    subject_bytes, encoding = decode_header(msg["Subject"])[0]
    if isinstance(subject_bytes, bytes):
        subject = subject_bytes.decode(encoding or "utf-8")
    else:
        subject = str(subject_bytes)
        
    email_body = ""
    for part in msg.walk():
        if part.get_content_type() == "text/plain":
            try:
                email_body = part.get_payload(decode=True).decode("utf-8")
            except UnicodeDecodeError:
                email_body = part.get_payload(decode=True).decode("latin1")
            break  # Stop at the first text/plain part

    email_body = ' '.join(email_body.split()) 
    emails.append(' '.join([subject, email_body]))


In [22]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the pre-trained model
loaded_model = tf.keras.models.load_model('best_model.h5')

# Define the sequence length expected by the model
max_sequence_length = 5916

# Tokenize and preprocess emails text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(emails)
email_text_seqs = tokenizer.texts_to_sequences(emails)
email_text_padded = pad_sequences(email_text_seqs, padding='post', maxlen=max_sequence_length)

# Define a spam threshold (adjust as needed)
spam_threshold = 0.1

# List to store spam classification results
email_classifications = []

# Predict using the loaded model for all emails
predicted_probs = loaded_model.predict(email_text_padded)


# Loop through the predicted probabilities and classify emails
for i, predicted_prob in enumerate(predicted_probs):
    subject = emails[i]  # Merged subject and body text
    email_date = msg["Date"]  # You need to fetch this from the email
    uid = uids[i]  # Get the UID of the email

    # Classify as spam or not spam
    is_spam = predicted_prob >= spam_threshold
    
    # Append the classification result to the list
    email_classifications.append({
        "subject": subject,
        "body": email_body,  # Use the correct variable name: email_body
        "date": email_date,
        "is_spam": is_spam,
        "predicted_prob": predicted_prob * 100,
        "msg_id": uid
    })

# Print the classification results
for email_classification in email_classifications:
    print("Date:", email_classification["date"])
    print("Subject:", email_classification["subject"])
    print("Is Spam:", email_classification["is_spam"])
    print("Spam %", email_classification["predicted_prob"])
    print("Msg Id", email_classification["msg_id"])
    print("\n---\n")

Date: Fri, 25 Aug 2023 21:23:47 +0000
Subject: GitHub Explore today Aug 19 Explore today on GitHub, Aug 18 - Aug 19 See more repositories based on your interests at https://github.com/explore Recommended topic ----------------- https://github.com/topics/vim9script Recommended spotlight --------------------- Maintaining Balance for Open Source Maintainers https://opensource.guide/maintaining-balance-for-open-source-maintainers/ <p>Tips for self-care and avoiding burnout as a maintainer.</p> Upcoming event recommended by GitHub ------------------------------------ js13kGames competition https://js13kgames.com/ August 13, 2023 - September 13, 2023 Do you have what it takes to create a game in ≤ 13kB of JS, CSS and HTML? Collection recommended by GitHub -------------------------------- Game Engines https://github.com/collections/game-engines Frameworks for building games across multiple platforms. That's everything we found for you, for now. Visit https://github.com/explore to see the late

In [25]:
import imaplib
import email
from email.header import decode_header
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Connect to the IMAP server using SSL
mail = imaplib.IMAP4_SSL("imap.gmail.com")

# Login with the provided credentials
mail.login(os.environ.get("EMAIL"), os.environ.get("PASSWORD"))

# Select the mailbox you want to access (e.g., "inbox")
mail.select("inbox")

# Define a spam folder name
spam_folder_name = "Spam"

# Check if the spam folder exists
result, data = mail.list()
if f"{spam_folder_name} " not in [item.decode() for item in data]:
    # Create the spam folder
    mail.create(spam_folder_name)

# Loop through the email classifications and move spam emails to the spam folder
for email_classification in email_classifications:
    if email_classification["is_spam"]:
        msg_id = email_classification["msg_id"]

        # Copy the email to the spam folder
        copy_result = mail.uid("COPY", msg_id, spam_folder_name)
        if copy_result[0] == "OK":
            print(f"Copied email with subject '{email_classification['subject']}' to '{spam_folder_name}' folder.")

# Close the mailbox
mail.close()

# Logout from the email server
mail.logout()


Copied email with subject '(Cardholder perk) Brian Song, you're eligible to activate these
 Chase Offers ' to 'Spam' folder.
Copied email with subject 'Your credit score has recently changed ' to 'Spam' folder.
Copied email with subject 'Review new activity on a dormant account ' to 'Spam' folder.
Copied email with subject '**Brian, this is your invitation to refer friends and earn up to
 $500 cash back ' to 'Spam' folder.
Copied email with subject 'Don't Wait: We'd like to reward you with a special auto financing
 offer, up to $35,000 ' to 'Spam' folder.
Copied email with subject 'You've been gone a while. See what's new to you! ' to 'Spam' folder.
Copied email with subject 'CIRCOR Career Site has 4 jobs you may be interested in ' to 'Spam' folder.
Copied email with subject 'SURPRISE DISCOUNT for YOU- Grab it before its gone ' to 'Spam' folder.
Copied email with subject 'Trending Posts on Levels.fyi ' to 'Spam' folder.
Copied email with subject 'Meetup Hi Brian, Please meet us at Time

('BYE', [b'LOGOUT Requested'])