In [81]:
# Libraries
import urllib
import tarfile
# import quopri

urllib and tarfile are pre-installed in python > urllib used to for files/website related operations & tarfile is used for zip file related operations

In [82]:
import urllib.request
import os
import tarfile

def fetch_data(base_url, files,download_path):
    for file in files:
        # Construct the full URL
        file_url = f"{base_url}{file}"
        file_download_path = os.path.join(download_path, file)
        print(f"Downloading from: {file_url}")
        print(f"Saving to: {file_download_path}")

        # Download and save the file
        # try:
        #     # Download the file
        #     urllib.request.urlretrieve(file_url, file_download_path)
        #     print(f"File successfully downloaded and saved as {file_download_path}")

        #     # Verify the file is a valid .tar.bz2 and extract it
        #     with tarfile.open(file_download_path, "r:bz2") as tar:
        #         tar.extractall(path=download_path)
        #         print(f"Files successfully extracted to {download_path}")
        # except tarfile.TarError as e:
        #     print(f"TarError while extracting {file}: {e}")
        # except Exception as e:
        #     print(f"An error occurred: {e}")

    return [os.path.join(download_path,dir_name) for dir_name in ("easy_ham", "spam")] 

# Define the base URL, file names, and download path
base_url = "https://spamassassin.apache.org/old/publiccorpus/"
download_path = "/home/t460/Documents/ollama/datasets/spam/" # (absoulte path) Instead can use !from pathlib import Path
files = ["20021010_easy_ham.tar.bz2", "20021010_spam.tar.bz2"]

# Ensure the download directory exists
# os.makedirs(download_path, exist_ok=True)

# Fetch and extract the data
ham_dir , spam_dir = fetch_data(base_url, files, download_path)
print(ham_dir)
print(spam_dir)

Downloading from: https://spamassassin.apache.org/old/publiccorpus/20021010_easy_ham.tar.bz2
Saving to: /home/t460/Documents/ollama/datasets/spam/20021010_easy_ham.tar.bz2
Downloading from: https://spamassassin.apache.org/old/publiccorpus/20021010_spam.tar.bz2
Saving to: /home/t460/Documents/ollama/datasets/spam/20021010_spam.tar.bz2
/home/t460/Documents/ollama/datasets/spam/easy_ham
/home/t460/Documents/ollama/datasets/spam/spam


Analysing the structure of the email. Creating dataset which consist of filtered hams&spams to feed to the model.<br>
The dataset should consist of 4 :
- sender's email and other important fields
- subject
- content of the email
- and a column stating is it spam or ham

___

Email contains HTML content or is a plain-text email, you can inspect the MIME type of its body parts. This can be done using Python's email module.
Emails often have MIME types like:
- text/plain for plain-text emails.
- text/html for HTML emails.

If an email has both text/plain and text/html, it's a multipart email where one part is plain text (for compatibility) and another is HTML (for richer formatting).

In [83]:
from email.policy import default
from email.parser import BytesParser
from pathlib import Path
from bs4 import BeautifulSoup

# Function to extract email content (plain text or fallback to HTML)
def get_email_content(email):
    for part in email.walk():
        html = None
        ctype = part.get_content_type()
        if not ctype in ("text/plain", "text/html"): # if content_type is other than plain text or html than ignore
            continue
        try:
            # get the character dataset for emails
            charset = part.get_content_charset() or "utf-8"  # Default to UTF-8
            # extract the content with respect to that charaset else throws error of "string argument should contain only ASCII characters"
            content = part.get_payload(decode=True).decode(charset, errors="replace")
        except Exception as e:
            content = part.get_payload(decode=True).decode("utf-8", errors="replace")  # Fallback
        if ctype == "text/plain":
            return content.strip()
        else:
            html = content
    if html:
        soup = BeautifulSoup(html, 'html.parser') #convert to beautifulsoup object
        decoded_html_content = soup.get_text(separator="\n", strip=True) # extract the content from html
        return decoded_html_content
    
# Function to parse email and extract fields
def parse_email(file_path):
    try:
        with open(file_path, 'rb') as f:
            email = BytesParser(policy=default).parse(f)
        
        # Extract fields
        email_data = {
            #"Receiver": msg.get("Delivered-To"),
            "From": email.get("From"),
            #"To": msg.get("To"),
            "Subject": email.get("Subject"),
            "Content": get_email_content(email),
        }
        return email_data
    except Exception as e:
        print(f"Failed to parse {file_path}: {e}")
        return None


In [84]:
from pathlib import Path
import pandas as pd

# Load emails and extract fields
def process_email_directory(directory):
    emails = []
    for file_path in directory.iterdir():
        if file_path.is_file():
            email_data = parse_email(file_path)
            if email_data:
                emails.append(email_data)
    return emails

# Path to email directories
ham_dir = Path(ham_dir)
spam_dir = Path(spam_dir)

# Process ham and spam directories
ham_emails = process_email_directory(ham_dir)
spam_emails = process_email_directory(spam_dir)

#______________________________________________________XXXXXX_________________________________________________________

# Combine ham and spam emails into a single dataset
email_data = pd.DataFrame(ham_emails + spam_emails)


# Add labels for classification
email_data["Label"] = ["ham"] * len(ham_emails) + ["spam"] * len(spam_emails)

# Save to CSV for model training
# email_data.to_csv("email_dataset.csv", index=False)

print(email_data.head())


                                          From  \
0        Chris Kloiber <ckloiber@ckloiber.com>   
1      Dermot Daly <dermot.daly@itsmobile.com>   
2             Owen Byrne <owen@permafrost.net>   
3              Glen Gray <glen@netnoteinc.com>   
4  Eirikur Hallgrimsson <eh@mad.scientist.com>   

                                             Subject  \
0                      Re: RH 8 no DMA for DVD drive   
1                 [ILUG] What HOWTOs for SOHO system   
2                              Re: The case for spam   
3  [ILUG] Retrieving read mail from webmail.eirco...   
4                              process music: Mekons   

                                             Content Label  
0  On Mon, 2002-10-07 at 13:28, Matthias Saou wro...   ham  
1  Hi All,\nI'm trying to set up the following:\n...   ham  
2  Bill Stoddard wrote:\n\n>>No one likes commerc...   ham  
3  Is there a way to get my read email downloaded...   ham  
4  http://reuters.com/news_article.jhtml?type=ent...   ha

Fill missing Values with the most frequent values of each columns 

In [None]:
# check the most frequent values of each columns 
# for column in email_data.columns:
#     print(email_data[column].mode()[0])

# Replacing missing values with the most common values of each column
for column in email_data.columns:
    email_data[column].fillna(email_data[column].mode()[0], inplace=True)  

# # Saving the updated DataFrame to a CSV file
email_data.to_csv("email_dataset2.csv", index=False)


Tim Chapman <timc@2ubh.com>
Re: Java is for kiddies
CONSANTLY
being
bombarded by so-called FREE money-making systems that teases you with limited
information, and when its all said and done, blind-sides you by demanding your
money/credit card information upfront in some slick way,
after-the-fact
!
Yes, I too was as skeptical about such offers and the Internet in general with
all its hype, as you probably are. Fortunate for me, my main business
slowed-down (
I have been self-employed all my life
), so I looked for
something to fit my lifestyle and some other way to assist me in paying my
bills, without working myself to death or loosing more money; then, this
proposal to try something new without any upfront investment (
great! because
I had none
) interested me to click on the link provided. And I dont regret
at all that I did! I am very happy, and happy enough to recommend it to you as
a system that is true to its word. I mean absolutely no upfront money. You join
only if (
when
)

Split the the data

In [88]:
X = email_data.drop('Label',axis=1)
y = email_data['Label'] 

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test =  train_test_split(X,y,test_size=0.2,random_state=42,shuffle=True)
print(X_train.shape)


(804, 3)


Preprocessing step :
- <span style="color:orange"> Tokenization: </span> Split text into words or subwords.
- <span style="color:orange"> Normalization: </span> Lowercase, remove punctuation, etc.
- <span style="color:orange"> word2vec: </span> convert the text to numerical representation 

In [None]:
# from sklearn.base import BaseEstimator , TransformerMixin

# class CustomTransformer(BaseEstimator , TransformerMixin):
#     pass