In [8]:
# Libraries
import urllib
import tarfile
import numpy as np
# import quopri

urllib and tarfile are pre-installed in python > urllib used to for files/website related operations & tarfile is used for zip file related operations

In [1]:
import urllib.request
import os
import tarfile

def fetch_data(base_url, files,download_path):
    for file in files:
        # Construct the full URL
        file_url = f"{base_url}{file}"
        file_download_path = os.path.join(download_path, file)
        print(f"Downloading from: {file_url}")
        print(f"Saving to: {file_download_path}")

        # Download and save the file
        # try:
        #     # Download the file
        #     urllib.request.urlretrieve(file_url, file_download_path)
        #     print(f"File successfully downloaded and saved as {file_download_path}")

        #     # Verify the file is a valid .tar.bz2 and extract it
        #     with tarfile.open(file_download_path, "r:bz2") as tar:
        #         tar.extractall(path=download_path)
        #         print(f"Files successfully extracted to {download_path}")
        # except tarfile.TarError as e:
        #     print(f"TarError while extracting {file}: {e}")
        # except Exception as e:
        #     print(f"An error occurred: {e}")

    return [os.path.join(download_path,dir_name) for dir_name in ("easy_ham", "spam")] 

# Define the base URL, file names, and download path
base_url = "https://spamassassin.apache.org/old/publiccorpus/"
download_path = "/home/t460/Documents/ollama/datasets/spam/" # (absoulte path) Instead can use !from pathlib import Path
files = ["20021010_easy_ham.tar.bz2", "20021010_spam.tar.bz2"]

# Ensure the download directory exists
# os.makedirs(download_path, exist_ok=True)

# Fetch and extract the data
ham_dir , spam_dir = fetch_data(base_url, files, download_path)
print(ham_dir)
print(spam_dir)

Downloading from: https://spamassassin.apache.org/old/publiccorpus/20021010_easy_ham.tar.bz2
Saving to: /home/t460/Documents/ollama/datasets/spam/20021010_easy_ham.tar.bz2
Downloading from: https://spamassassin.apache.org/old/publiccorpus/20021010_spam.tar.bz2
Saving to: /home/t460/Documents/ollama/datasets/spam/20021010_spam.tar.bz2
/home/t460/Documents/ollama/datasets/spam/easy_ham
/home/t460/Documents/ollama/datasets/spam/spam


Analysing the structure of the email. Creating dataset which consist of filtered hams&spams to feed to the model.<br>
The dataset should consist of 4 :
- sender's email and other important fields
- subject
- content of the email : Email contains HTML content or is a plain-text email, you can inspect the MIME type of its body parts. This can be done using Python's email module.
- and a column stating is it spam or ham

In [2]:
from email.policy import default
from email.parser import BytesParser
from pathlib import Path
from bs4 import BeautifulSoup

# Function to extract email content (plain text or fallback to HTML)
def get_email_content(email):
    for part in email.walk():
        html = None
        ctype = part.get_content_type()
        if not ctype in ("text/plain", "text/html"): # if content_type is other than plain text or html than ignore
            continue
        try:
            # get the character dataset for emails
            charset = part.get_content_charset() or "utf-8"  # Default to UTF-8
            # extract the content with respect to that charaset else throws error of "string argument should contain only ASCII characters"
            content = part.get_payload(decode=True).decode(charset, errors="replace")
        except Exception as e:
            content = part.get_payload(decode=True).decode("utf-8", errors="replace")  # Fallback
        if ctype == "text/plain":
            return content.strip()
        else:
            html = content
    if html:
        soup = BeautifulSoup(html, 'html.parser') #convert to beautifulsoup object
        decoded_html_content = soup.get_text(separator="\n", strip=True) # extract the content from html
        return decoded_html_content
    
# Function to parse email and extract fields
def parse_email(file_path):
    try:
        with open(file_path, 'rb') as f:
            email = BytesParser(policy=default).parse(f)
        
        # Extract fields
        email_data = {
            #"Receiver": msg.get("Delivered-To"),
            "From": email.get("From"),
            #"To": msg.get("To"),
            "Subject": email.get("Subject"),
            "Content": get_email_content(email),
        }
        return email_data
    except Exception as e:
        print(f"Failed to parse {file_path}: {e}")
        return None


In [3]:
from pathlib import Path
import pandas as pd

# Load emails and extract fields
def process_email_directory(directory):
    emails = []
    for file_path in directory.iterdir():
        if file_path.is_file():
            email_data = parse_email(file_path)
            if email_data:
                emails.append(email_data)
    return emails

# Path to email directories
ham_dir = Path(ham_dir)
spam_dir = Path(spam_dir)

# Process ham and spam directories
ham_emails = process_email_directory(ham_dir)
spam_emails = process_email_directory(spam_dir)

#______________________________________________________XXXXXX_________________________________________________________

# Combine ham and spam emails into a single dataset
email_data = pd.DataFrame(ham_emails + spam_emails)


# Add labels for classification
email_data["Label"] = ["ham"] * len(ham_emails) + ["spam"] * len(spam_emails)

# Save to CSV for model training
# email_data.to_csv("email_dataset.csv", index=False)

print(email_data.head())


                                          From  \
0        Chris Kloiber <ckloiber@ckloiber.com>   
1      Dermot Daly <dermot.daly@itsmobile.com>   
2             Owen Byrne <owen@permafrost.net>   
3              Glen Gray <glen@netnoteinc.com>   
4  Eirikur Hallgrimsson <eh@mad.scientist.com>   

                                             Subject  \
0                      Re: RH 8 no DMA for DVD drive   
1                 [ILUG] What HOWTOs for SOHO system   
2                              Re: The case for spam   
3  [ILUG] Retrieving read mail from webmail.eirco...   
4                              process music: Mekons   

                                             Content Label  
0  On Mon, 2002-10-07 at 13:28, Matthias Saou wro...   ham  
1  Hi All,\nI'm trying to set up the following:\n...   ham  
2  Bill Stoddard wrote:\n\n>>No one likes commerc...   ham  
3  Is there a way to get my read email downloaded...   ham  
4  http://reuters.com/news_article.jhtml?type=ent...   ha

Fill missing Values with the most frequent values of each columns 

In [4]:
# check the most frequent values of each columns 
# for column in email_data.columns:
#     print(email_data[column].mode()[0])

# Replacing missing values with the most common values of each column
for column in email_data.columns:
    email_data[column].fillna(email_data[column].mode()[0], inplace=True)  

# # Saving the updated DataFrame to a CSV file
# email_data.to_csv("email_dataset2.csv", index=False)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  email_data[column].fillna(email_data[column].mode()[0], inplace=True)


Split the the data

In [5]:
email_data = pd.read_csv('email_dataset.csv')
X = email_data.drop('Label',axis=1)
y = email_data['Label'] 

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test =  train_test_split(X,y,test_size=0.2,random_state=42,shuffle=True)

Preprocessing step :
- <span style="color:orange"> Tokenization: </span> Split text into words or subwords.
- <span style="color:orange"> Normalization: </span> Lowercase, remove punctuation, etc.


In [None]:
import spacy
from urlextract import URLExtract  # Ensure the package is installed
from collections import Counter

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Initialize URL extractor
url_extractor = URLExtract()
"""
def email_transform(sent):
    # Process the email content

    doc = nlp(sent)
    filtered_words = []

    for token in doc:
        # Extract and replace URLs with "URL"
        if url_extractor.has_urls(token.text):  # Checks if the token contains a URL
            filtered_words.append("url")
        elif token.like_num:
            filtered_words.append("number")
        elif token.is_alpha and not token.is_stop:  # Remove stopwords and keep only text
            filtered_words.append(token.lemma_.lower())

    return filtered_words
"""

def email_transform(sent):
    for email in X:
            if self.replace_urls and url_extractor is not None:
                urls = list(set(url_extractor.find_urls(text)))
                urls.sort(key=lambda url: len(url), reverse=True)
                for url in urls:
                    text = text.replace(url, " URL ")
            if self.replace_numbers:
                text = re.sub(r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?', 'NUMBER', text)
            if self.remove_punctuation:
                text = re.sub(r'\W+', ' ', text, flags=re.M)
            word_counts = Counter(text.split())
            if self.stemming and stemmer is not None:
                stemmed_word_counts = Counter()
                for word, count in word_counts.items():
                    stemmed_word = stemmer.stem(word)
                    stemmed_word_counts[stemmed_word] += count
                word_counts = stemmed_word_counts
            X_transformed.append(word_counts)
        return np.array(X_transformed)

# Transform the email content
sample_train_email = X_train['Content'][:3]
print(sample_train_email)

# for i in sample_train_email:
#     print(i)
#     break
    # processed_email = email_transform(i)
    # print(f'Processed Email:{processed_email}')
    # word_counter = Counter()
    # for email in processed_email:
    #     word_counter.update(email.split())
    # print("Word Frequencies:")
    # print(f'{word_counter}\n')


78     ive just gotton myself a modem (no its not a w...
29     > So now Osama bin Laden is Hitler. And Saddam...
280    Eirikur Hallgrimsson wrote:\n> It's official, ...
Name: Content, dtype: object


In [9]:
import spacy
from urlextract import URLExtract  # Ensure the package is installed
from collections import Counter
from sklearn.base import BaseEstimator , TransformerMixin

class CustomEmailTransformer(BaseEstimator , TransformerMixin):  # for subject and content

    def __init__(self):
        self.nlp = spacy.load("en_core_web_sm")
        self.url_extractor = URLExtract()

    def fit(self,X,y=None):
        return self
    
    def transform(self,X,y=None):
        
        X_transformed_word = []
        for i in X:
            transformed_word = [] # if want array's of counter
            doc = self.nlp(i)
            for token in doc:
                # Extract and replace URLs with "URL"
                if self.url_extractor.has_urls(token.text):  # Checks if the token contains a URL
                    transformed_word.append("url")
                elif token.like_num:
                    transformed_word.append("number")
                elif token.is_alpha and not token.is_stop:  # Remove stopwords and keep only text
                    transformed_word.append(token.lemma_.lower())

                word_counter = Counter()
                for email in transformed_word: # Count word frequencies
                    word_counter.update(email.split())
            X_transformed_word.append(word_counter)

        return np.array(X_transformed_word)

In [10]:
sample_train_content = X_train['Subject'][:3]

word_count = CustomEmailTransformer().fit_transform(sample_train_content)

# for i in sample_train_content:
#     word_count = transformer .transform(i)
print("Word Frequencies:")
print(f'{word_count}')

Word Frequencies:
[Counter({'ilug': 1, 'modem': 1, 'problems': 1})
 Counter({'zzzzteana': 1, 'coming': 1, 'firestorm': 1})
 Counter({'holiday': 1, 'season': 1, 'number': 1, 'begin': 1})]


Tranform the "From" feature column : extract the domain name

In [163]:
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder # for email domain
import re

# # Function to extract domain using regex
def extract_domain(email):
    match = re.search(r'@([\w.-]+)', email)  # Matches domain after @
    return match.group(1).lower() if match else 'unknown'

# Apply function to emails
sample_train_domain = X_train['From'] # print(sample_train_domain)
domains = [extract_domain(email) for email in sample_train_domain] #print(domains) 
domain_reshaped = np.array(domains).reshape(-1, 1) # Requires 2D dimension 


# Encode the domain using OrdinalEncoder
OneHot_Encoder = OneHotEncoder()
sample_train_domain_encoded = OneHot_Encoder.fit_transform(domain_reshaped)

# Print the encoded values
print("Original Domains:")
print(domains)
print("\nEncoded Domains:")
print(sample_train_domain_encoded.toarray())


Original Domains:
['redpie.com', 'ee.ed.ac.uk', 'barrera.org', 'mail.com', 'none.com', 'yahoo.com', 'example.com', 'example.com', 'indiatimes.com', 'docserver.cac.washington.edu', 'qu.to', 'wanadoo.fr', 'silcom.com', 'iol.ie', 'arabia.com', 'insiq.us', 'punkass.com', 'ig.com.br', 'noskillz.com', 'email.com', 'barrera.org', 'iol.ie', 'yahoo.com', 'svanstrom.com', 'example.com', 'hotmail.com', 'permafrost.net', 'sunglasses.com', 'mithral.com', 'insiq.us', 'subdimension.com', 'slack.net', 'caramail.com', 'evergo.net', 'insiq.us', 'shipwright.com', 'cursor-system.com', 'techmonkeys.net', 'framesetup.com', 'example.com', 'reset.jp', 'netnoteinc.com', 'eircom.net', 'yelsew.com', 'etang.com', 'interszkola.pl', 'shipwright.com', 's3.serveimage.com', 'infinetivity.com', 'iol.ie', 'att.net', 'deepeddy.com', 'eecs.berkeley.edu', 'hotmail.com', 'slack.net', 'alltel.net', 'mx03.readyserve21.com', 'btamail.net.cn', 'corvil.com', 'argote.ch', 'frogstone.net', 'tuatha.org', 'srv0.ems.ed.ac.uk', 'acces

In [11]:
import re
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder

class EmailDomainEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        # Initialize OneHotEncoder
        self.encoder = OneHotEncoder()
        self.domains = None  # Placeholder for domains

    @staticmethod
    def extract_domain(email):
        """
        Extract the domain from an email address using regex.
        If no domain is found, return 'unknown'.
        """
        match = re.search(r'@([\w.-]+)', email)  # Matches domain after @
        return match.group(1).lower() if match else 'unknown'

    def fit(self, X, y=None):
        """
        Extract domains from the email addresses in X and fit the OneHotEncoder.
        """
        # Extract domains from email addresses
        self.domains = [self.extract_domain(email) for email in X]
        # Reshape domains for encoding
        domain_reshaped = np.array(self.domains).reshape(-1, 1)
        # Fit the encoder
        self.encoder.fit(domain_reshaped)
        return self

    def transform(self, X):
        """
        Transform the email addresses in X into encoded domain vectors.
        """
        # Extract domains from email addresses
        domains = [self.extract_domain(email) for email in X]
        # Reshape domains for encoding
        domain_reshaped = np.array(domains).reshape(-1, 1)
        # Transform using the fitted encoder
        return self.encoder.transform(domain_reshaped).toarray()

    def fit_transform(self, X, y=None):
        """
        Fit the encoder and transform the email addresses in one step.
        """
        return super().fit_transform(X, y)



In [22]:
# Example Usage
# Sample data (replace X_train['From'] with actual data)
sample_train_domain = X_train['From']

# Instantiate the encoder
domain_encoder = EmailDomainEncoder()

# Fit and transform the sample data
encoded_domains = domain_encoder.fit_transform(sample_train_domain)

# Print the results
print("Original Domains:")
print(domain_encoder.domains)
print("\nEncoded Domains:")
print(encoded_domains)

Original Domains:
['redpie.com', 'ee.ed.ac.uk', 'barrera.org', 'mail.com', 'none.com', 'yahoo.com', 'example.com', 'example.com', 'indiatimes.com', 'docserver.cac.washington.edu', 'qu.to', 'wanadoo.fr', 'silcom.com', 'iol.ie', 'arabia.com', 'insiq.us', 'punkass.com', 'ig.com.br', 'noskillz.com', 'email.com', 'barrera.org', 'iol.ie', 'yahoo.com', 'svanstrom.com', 'example.com', 'hotmail.com', 'permafrost.net', 'sunglasses.com', 'mithral.com', 'insiq.us', 'subdimension.com', 'slack.net', 'caramail.com', 'evergo.net', 'insiq.us', 'shipwright.com', 'cursor-system.com', 'techmonkeys.net', 'framesetup.com', 'example.com', 'reset.jp', 'netnoteinc.com', 'eircom.net', 'yelsew.com', 'etang.com', 'interszkola.pl', 'shipwright.com', 's3.serveimage.com', 'infinetivity.com', 'iol.ie', 'att.net', 'deepeddy.com', 'eecs.berkeley.edu', 'hotmail.com', 'slack.net', 'alltel.net', 'mx03.readyserve21.com', 'btamail.net.cn', 'corvil.com', 'argote.ch', 'frogstone.net', 'tuatha.org', 'srv0.ems.ed.ac.uk', 'acces

- <span style="color:orange"> word2vec: </span> convert the text to numerical representation 

In [13]:
from scipy.sparse import csr_matrix

class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size
    def fit(self, X, y=None):
        total_count = Counter()
        for word_count in X:
            for word, count in word_count.items():
                total_count[word] += min(count, 10)
        most_common = total_count.most_common()[:self.vocabulary_size]
        self.vocabulary_ = {word: index + 1
                            for index, (word, count) in enumerate(most_common)}
        return self
    def transform(self, X, y=None):
        rows = []
        cols = []
        data = []
        for row, word_count in enumerate(X):
            for word, count in word_count.items():
                rows.append(row)
                cols.append(self.vocabulary_.get(word, 0))
                data.append(count)
        return csr_matrix((data, (rows, cols)),shape=(len(X), self.vocabulary_size + 1))

In [14]:
vocab_transformer = WordCounterToVectorTransformer(vocabulary_size=10)
X_few_vectors = vocab_transformer.fit_transform(word_count)
X_few_vectors

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 10 stored elements and shape (3, 11)>

In [15]:
X_few_vectors.toarray()

array([[0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1]])

In [16]:
vocab_transformer.vocabulary_

{'ilug': 1,
 'modem': 2,
 'problems': 3,
 'zzzzteana': 4,
 'coming': 5,
 'firestorm': 6,
 'holiday': 7,
 'season': 8,
 'number': 9,
 'begin': 10}

In [20]:
from sklearn.pipeline import Pipeline
preprocessing_pipeline = Pipeline([
    ("email_to_wordcount", CustomEmailTransformer()),
    ("wordcount_to_vector", WordCounterToVectorTransformer()),
])
# preprocessing_pipeline.fit_transform(X_train_content)

In [29]:
from sklearn.compose import ColumnTransformer

content_attrib = ['Subject','Content']
domain_attrib = ['From']

final_preprocessing_pipeline = ColumnTransformer([
    ('content',preprocessing_pipeline,content_attrib),
    ('domain',EmailDomainEncoder(),domain_attrib)
])

final_data = final_preprocessing_pipeline.fit_transform(X_train)
print(final_data)

ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,1].shape[0] == 1, expected 2.

In [27]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

log_clf = LogisticRegression(max_iter=1000, random_state=42)
score = cross_val_score(log_clf, final_data, y_train, cv=3)
score.mean()

ValueError: Found input variables with inconsistent numbers of samples: [2, 804]