In [9]:
import zipfile
import os

# Upload your zip files (benign.zip and malign.zip) to Colab first

# Unzipping
with zipfile.ZipFile("benign.zip", 'r') as zip_ref:
    zip_ref.extractall("benign")
with zipfile.ZipFile("malign.zip", 'r') as zip_ref:
    zip_ref.extractall("malign")


In [13]:
import os
import csv

# Define the directories for benign and malign email text files
benign_dir = '/content/benign/benign'
malign_dir = '/content/malign/malign'

# Define the path for the new CSV file
output_csv = 'emails.csv'

# Define the headers for the CSV file
headers = ['Subject', 'Body', 'Label']

# Function to process a directory and write its contents to the CSV file
def process_directory(directory, label, writer):
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        if os.path.isfile(file_path):
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                content = file.read()

                # Split the content by lines and iterate through
                lines = content.splitlines()
                subject = ""
                body = []
                reading_body = False

                for line in lines:
                    if line.startswith('Subject:'):
                        subject = line[len('Subject:'):].strip()
                        reading_body = True
                    elif reading_body:
                        if line.startswith('Subject:'):
                            body_text = " ".join(body).strip()
                            writer.writerow({'Subject': subject, 'Body': body_text, 'Label': label})
                            subject = line[len('Subject:'):].strip()
                            body = []
                        else:
                            body.append(line.strip())

                if subject and body:
                    body_text = " ".join(body).strip()
                    writer.writerow({'Subject': subject, 'Body': body_text, 'Label': label})

# Open the CSV file for writing
with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=headers)
    writer.writeheader()

    # Process each directory
    process_directory(benign_dir, 'benign', writer)
    process_directory(malign_dir, 'malign', writer)

print(f"CSV file created at: {output_csv}")


CSV file created at: emails.csv


In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the CSV file
df = pd.read_csv('emails.csv')

# Combine subject and body
df['text'] = df['Subject'] + " " + df['Body']

# Convert labels to binary (0 or 1)
df['label'] = df['Label'].apply(lambda x: 1 if x == 'malign' else 0)

# Split data into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42
)

# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

train_sequences = tokenizer.texts_to_sequences(train_texts)
train_padded = pad_sequences(train_sequences, maxlen=200)

test_sequences = tokenizer.texts_to_sequences(test_texts)
test_padded = pad_sequences(test_sequences, maxlen=200)


In [15]:
import tensorflow as tf

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(10000, 16, input_length=200),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
history = model.fit(
    train_padded, train_labels,
    epochs=10,
    validation_data=(test_padded, test_labels)
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [16]:
# Evaluate on test data
results = model.evaluate(test_padded, test_labels, verbose=2)
print(f"Test Accuracy: {results[1] * 100:.2f}%")

6/6 - 0s - loss: 0.2979 - accuracy: 0.9704 - 220ms/epoch - 37ms/step
Test Accuracy: 97.04%
