In [1]:
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch
import numpy as np
from sklearn.metrics import accuracy_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
print(f"Using device: {device}")

# Function to compute accuracy
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=-1)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}

# Load and preprocess the AESLC dataset
def load_and_preprocess_dataset():
    dataset = load_dataset("aeslc", split='train')
    emails = [item['email_body'] for item in dataset]
    labels = [1 if item['subject_line'] else 0 for item in dataset]
    return emails, labels

emails, labels = load_and_preprocess_dataset()

# Splitting the dataset into training and validation sets
train_emails, val_emails, train_labels, val_labels = train_test_split(emails, labels, test_size=0.3)

# Tokenization
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

train_encodings = tokenizer(train_emails, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_emails, truncation=True, padding=True, max_length=512)

# Custom dataset class
class EmailDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = EmailDataset(train_encodings, train_labels)
val_dataset = EmailDataset(val_encodings, val_labels)

# Load pre-trained model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch"
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)



cuda
Using device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
# Train the model
trainer.train()

# Save the model
model.save_pretrained("./bert_email_classifier")


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0,1.9e-05,1.0


In [3]:
# Evaluate the model
evaluation_results = trainer.evaluate()

# Print evaluation results
print(evaluation_results)

{'eval_loss': 1.893995067803189e-05, 'eval_accuracy': 1.0, 'eval_runtime': 99.3346, 'eval_samples_per_second': 43.6, 'eval_steps_per_second': 5.456, 'epoch': 1.0}


In [1]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load the trained model
model_path = "./bert_email_classifier"  # Adjust path if necessary
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Prepare the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)




BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [9]:
from transformers import pipeline, BertTokenizer, BertForSequenceClassification

# Load the saved model and tokenizer
model_path = "./bert_email_classifier"  # Adjust path if necessary
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Create a text classification pipeline
text_classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Example email
example_email = "Dear customer, your order has been shipped. Thank you for shopping with us!"

# Use the pipeline for prediction
result = text_classifier(example_email)

# Display the result
print(result)


[{'label': 'LABEL_1', 'score': 0.9999772310256958}]


In [5]:
# Example email
example_email = "Ijasklfjklsda klsajlkdf"

# Tokenize the email
inputs = tokenizer(example_email, padding=True, truncation=True, max_length=512, return_tensors="pt")

# Move inputs to the same device as model
inputs = {k: v.to(device) for k, v in inputs.items()}

# Predict
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1)

# Interpret the prediction
predicted_class = predictions.item()
class_names = ["No Subject", "Has Subject"]  # Adjust based on your labeling
print(f"The email was classified as: {predicted_class}")

The email was classified as: 1


In [20]:
email1 = '''From: example_sender@example.com
To: example_receiver@example.com
Subject: Meeting Schedule Update
Content-Type: text/html; charset="UTF-8"

<html>
<body>
<p>Hello Team,</p>

<p>Please note that the <b>weekly meeting</b> has been rescheduled to <i>Friday, 3 PM</i>.</p>

<p>Best Regards,</p>
<p>John Doe<br>
Project Manager<br>
example_company@example.com<br>
"Excellence is our motto!"<br>
</p>

<p><small>This email may contain confidential information and is intended only for the use of the individual to whom it is addressed. If you are not the intended recipient, you are hereby notified that any dissemination of this communication is strictly prohibited.</small></p>
</body>
</html>
'''

In [21]:
email2 = '''From: notify@newsletters.com
To: example_receiver@example.com
Subject: Your Weekly Tech News Digest
Content-Type: text/plain

Hi there,

Here's your latest tech news:
- Tech Corp announces new innovation.
- The future of AI: An expert's perspective.

Subscribe for more updates at our website www.technews.com.

Cheers,
Tech News Team

----------------------------------------
To unsubscribe from these emails, click here.
----------------------------------------
Note: This email might contain advertisements.
'''

In [24]:
import email
from bs4 import BeautifulSoup
import re

# Define the example emails



def extract_text_from_email(raw_email):
    # Parse the email content
    msg = email.message_from_string(raw_email)

    
    # Extract content type
    content_type = msg.get_content_type()
    text = ""

    if content_type == "text/plain":
        text = msg.get_payload()
    elif content_type == "text/html":
        html = msg.get_payload()
        soup = BeautifulSoup(html, "html.parser")
        text = soup.get_text()

    print(text)
    # Remove unnecessary data like signatures, disclaimers, etc.
    text = re.sub(r'--[\s\S]*?--', '', text)  # Remove lines between dashes (common in signatures)
    text = re.sub(r'\n+', '\n', text)  # Remove multiple line breaks
    text = text.strip()  # Remove leading/trailing white spaces

    return text

# Extract text from the emails
extracted_text_1 = extract_text_from_email(email1)
extracted_text_2 = extract_text_from_email(email2)
print("################################################\n\n\n\n")
print("Extracted Text from Email 1:\n", extracted_text_1)
#print("\nExtracted Text from Email 2:\n", extracted_text_2)




Hello Team,
Please note that the weekly meeting has been rescheduled to Friday, 3 PM.
Best Regards,
John Doe
Project Manager
example_company@example.com
"Excellence is our motto!"

This email may contain confidential information and is intended only for the use of the individual to whom it is addressed. If you are not the intended recipient, you are hereby notified that any dissemination of this communication is strictly prohibited.



Hi there,

Here's your latest tech news:
- Tech Corp announces new innovation.
- The future of AI: An expert's perspective.

Subscribe for more updates at our website www.technews.com.

Cheers,
Tech News Team

----------------------------------------
To unsubscribe from these emails, click here.
----------------------------------------
Note: This email might contain advertisements.

################################################




Extracted Text from Email 1:
 Hello Team,
Please note that the weekly meeting has been rescheduled to Friday, 3 PM.
Bes