In [None]:
from google.colab import files

# Upload the dataset files through Colab's upload dialog.
# The three SpamAssassin archives (easy_ham, hard_ham, spam; all .tar.bz2) are
# referenced by bare file name later on, so they must end up in the current
# working directory.
# NOTE(review): `uploaded` is not read again in the visible cells — presumably a
# mapping of the uploaded files; confirm against the Colab documentation.
uploaded = files.upload()

Saving 20021010_spam.tar.bz2 to 20021010_spam.tar.bz2
Saving 20021010_hard_ham.tar.bz2 to 20021010_hard_ham.tar.bz2
Saving 20021010_easy_ham.tar.bz2 to 20021010_easy_ham.tar.bz2


In [None]:
import os
import tarfile
import glob
import email
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Function to extract the regular files of a tar.bz2 archive beneath a target directory
def extract_files(file_path, extract_path):
    """Extract every regular file from a ``.tar.bz2`` archive into ``extract_path``.

    Each member keeps the relative path it has inside the archive, so files may
    end up in sub-folders of ``extract_path`` rather than directly in it.
    Directories, links and other special members are not extracted.

    Members whose resolved destination would fall outside ``extract_path``
    (absolute names or ``..`` components) are skipped with a printed warning
    instead of being written, because the archive is external input.

    Args:
        file_path: Path to the bzip2-compressed tar archive.
        extract_path: Directory to extract into; created on demand.
    """
    dest_root = os.path.realpath(extract_path)
    # Interpreters that ship the extraction-filter feature expose
    # tarfile.data_filter; only pass the keyword when it is understood.
    filter_kwargs = {'filter': 'data'} if hasattr(tarfile, 'data_filter') else {}

    with tarfile.open(file_path, 'r:bz2') as tar:
        for member in tar.getmembers():
            if not member.isfile():
                continue

            # Resolve where this member would land and refuse anything that
            # escapes the destination directory.
            target = os.path.realpath(os.path.join(dest_root, member.name))
            try:
                inside = os.path.commonpath([dest_root, target]) == dest_root
            except ValueError:  # e.g. paths on different drives
                inside = False
            if not inside:
                print(f"Skipping unsafe archive member: {member.name}")
                continue

            tar.extract(member, path=extract_path, **filter_kwargs)

# Extract datasets
# Each archive is looked up in the current working directory; adjust the paths if needed.
# Pairs are (archive file, destination folder).
for archive_name, target_dir in (
    ('20021010_easy_ham.tar.bz2', 'easy_ham'),
    ('20021010_hard_ham.tar.bz2', 'hard_ham'),
    ('20021010_spam.tar.bz2', 'spam'),
):
    extract_files(archive_name, target_dir)

# Function to load emails from a directory
def load_emails_from_directory(directory):
    """Return the first ``text/plain`` body of every e-mail file under ``directory``.

    The directory is searched recursively and files are visited in sorted path
    order, so the returned list does not depend on filesystem enumeration order.
    Messages that contain no ``text/plain`` part contribute nothing to the
    result. Files that cannot be read or parsed are reported with ``print`` and
    skipped.

    Args:
        directory: Root folder containing one raw e-mail message per file.

    Returns:
        list[str]: One decoded body per message that has a ``text/plain`` part.
    """
    emails = []
    pattern = os.path.join(directory, '**', '*')  # '**' also matches files directly in `directory`
    for filepath in sorted(glob.glob(pattern, recursive=True)):
        if not os.path.isfile(filepath):  # skip the directories that '**' also yields
            continue
        try:
            # Parse the raw bytes so each part can be decoded with the charset it
            # declares. Binary mode also means line endings are kept as stored.
            with open(filepath, 'rb') as file:
                msg = email.message_from_binary_file(file)

            for part in msg.walk():
                if part.get_content_type() != 'text/plain':
                    continue
                raw_body = part.get_payload(decode=True)  # undoes base64 / quoted-printable
                charset = part.get_content_charset() or 'latin-1'
                try:
                    body = raw_body.decode(charset, errors='ignore')
                except LookupError:
                    # Unknown or bogus charset label: latin-1 can decode any byte.
                    body = raw_body.decode('latin-1', errors='ignore')
                emails.append(body)
                break  # only the first text/plain part of a message is used
        except Exception as e:
            print(f"Error reading file {filepath}: {e}")  # Print errors for debugging
    return emails

# Load emails: one list of plain-text bodies per extracted corpus
easy_ham_emails = load_emails_from_directory('easy_ham')
hard_ham_emails = load_emails_from_directory('hard_ham')
spam_emails = load_emails_from_directory('spam')

# Sanity check: report how many messages were loaded from each corpus
for corpus_label, corpus in (
    ('easy ham', easy_ham_emails),
    ('hard ham', hard_ham_emails),
    ('spam', spam_emails),
):
    print(f"Number of {corpus_label} emails:", len(corpus))

# Combine both ham corpora and label every message: 0 = ham, 1 = spam
ham_emails = easy_ham_emails + hard_ham_emails
labels = [0] * len(ham_emails) + [1] * len(spam_emails)
emails = ham_emails + spam_emails

data = pd.DataFrame({'email': emails, 'label': labels})

# Split data into train and test sets.
# Spam is a small minority of the loaded messages, so stratify on the label to
# keep the same ham/spam ratio in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    data['email'],
    data['label'],
    test_size=0.2,
    random_state=42,
    stratify=data['label'],
)

# Vectorize emails using TF-IDF.
# The vocabulary is fitted on the training split only and then reused for the
# test split, so no test-set information leaks into the features.
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.95, min_df=2)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train a Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

# Predict on the test set
y_pred = model.predict(X_test_tfidf)

Number of easy ham emails: 2551
Number of hard ham emails: 128
Number of spam emails: 292


In [None]:
# Evaluate the model: score the test-set predictions against the true labels
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Scalar metrics share one format; the confusion matrix is printed on its own lines
for metric_name, metric_value in (
    ('Accuracy', accuracy),
    ('Precision', precision),
    ('Recall', recall),
    ('F1 Score', f1),
):
    print(f'{metric_name}: {metric_value:.4f}')
print(f'Confusion Matrix:\n{conf_matrix}')


Accuracy: 0.9092
Precision: 1.0000
Recall: 0.1429
F1 Score: 0.2500
Confusion Matrix:
[[532   0]
 [ 54   9]]


In [None]:
# Print detailed evaluation metrics
def print_evaluation_metrics(y_test, y_pred):
    """Print accuracy, precision, recall, F1 and the confusion matrix.

    All scores are computed with the scikit-learn metric functions using their
    default settings and shown with four decimal places.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, aligned with ``y_test``.
    """
    # Compute everything first so that a failing metric prints no partial report
    scores = [
        ("Accuracy:  ", accuracy_score(y_test, y_pred)),
        ("Precision: ", precision_score(y_test, y_pred)),
        ("Recall:    ", recall_score(y_test, y_pred)),
        ("F1 Score:  ", f1_score(y_test, y_pred)),
    ]
    matrix = confusion_matrix(y_test, y_pred)

    print("Evaluation Metrics:")
    print("===================")
    for prefix, score in scores:
        print(f"{prefix}{score:.4f}")
    print("\nConfusion Matrix:")
    print(matrix)

# Report the metrics for the test-set predictions produced by the trained model
print_evaluation_metrics(y_test, y_pred)


Evaluation Metrics:
Accuracy:  0.9092
Precision: 1.0000
Recall:    0.1429
F1 Score:  0.2500

Confusion Matrix:
[[532   0]
 [ 54   9]]


In [None]:
# Print sample predictions
def print_sample_predictions(X_test, y_test, y_pred, n=5, random_state=None):
    """Print a random sample of test e-mails with their actual and predicted labels.

    Args:
        X_test: pandas Series of e-mail texts (any index; it is reset internally).
        y_test: pandas Series of true labels aligned positionally with ``X_test``;
            a label equal to 1 is shown as Spam, anything else as Ham.
        y_pred: Positionally indexable predictions aligned with ``X_test``.
        n: Number of e-mails to show. Capped at the number of available e-mails,
            so asking for more than exist prints all of them instead of raising.
        random_state: Optional seed forwarded to ``Series.sample`` to make the
            selection reproducible. The default ``None`` keeps it random.
    """
    # Positions 0..len-1 so the sampled index values can address y_test / y_pred
    test_data = X_test.reset_index(drop=True)
    # Series.sample raises ValueError when asked for more rows than exist
    sample_size = min(n, len(test_data))
    sample_indices = test_data.sample(sample_size, random_state=random_state).index

    print("\nSample Predictions:")
    print("===================")
    for idx in sample_indices:
        # Only the first 500 characters of each e-mail are shown
        print(f"\nEmail: {test_data.iloc[idx][:500]}...")
        print(f"Actual Label: {'Spam' if y_test.iloc[idx] == 1 else 'Ham'}")
        print(f"Predicted Label: {'Spam' if y_pred[idx] == 1 else 'Ham'}")

# Show a few randomly chosen test e-mails next to their true and predicted labels
print_sample_predictions(X_test, y_test, y_pred)



Sample Predictions:

Email: This article from NYTimes.com 
has been sent to you by khare@alumni.caltech.edu.


Sure does explain FoRK :-)

not yet abandoned,
  Rohit

khare@alumni.caltech.edu


Some Friends, Indeed, Do More Harm Than Good

September 10, 2002
By MARY DUENWALD 




 

Friends are supposed to be good for you. In recent years,
scientific research has suggested that people who have
strong friendships experience less stress, they recover
more quickly from heart attacks and they are likely to live
longer than the...
Actual Label: Ham
Predicted Label: Ham

Email: Inn Share <shareinnn@yahoo.com> writes:

> Hi,all:
> 
> Does anyone know how to list the biggest file in my
> root directory?or the second biggest ..etc...
> 
> Because I want to find out what is the reason cause my
> root all most full.

find / -xdev -type f -exec du -sk {} \; | sort -rn | head -5

        -xdev will stop find recursing into other filesystems.

Cheers
Tiarnan


-- 
Tiarnán Ó Corráin
Consultant / Sys