In [1]:
import os
import pandas as pd
from pathlib import Path
import logging
from typing import List, Dict, Any, Optional

from pandas import DataFrame
from sympy.stats.rv import sample_iter_lambdify

# Configure notebook-wide logging: INFO and above, prefixed with timestamp and level
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)
# Module-level logger shared by the helper functions defined in later cells
logger = logging.getLogger(__name__)

# For LlamaIndex
from llama_index.core import Document, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.schema import MetadataMode, TextNode
from llama_index.readers.file import CSVReader

In [6]:
def load_sample(
    file_path: Path,
    sample_size: Optional[int] = 5,
    random_state: Optional[int] = None,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Load a labelled email CSV and draw a random sample from each class.

    Rows whose "label" column equals 1 are returned as the spam sample and
    rows whose "label" equals 0 as the ham sample.

    Args:
        file_path: Path to the CSV file
        sample_size: Maximum number of rows to draw per class. If a class has
            fewer rows, all of them are returned. ``None`` means "every row of
            each class" (returned in shuffled order).
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            draw can be reproduced. ``None`` keeps the sampling unseeded.

    Returns:
        Tuple of (spam_sample, ham_sample) DataFrames. A class with no rows
        (or a non-positive ``sample_size``) yields an empty, column-less DataFrame.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        KeyError: If the CSV has no "label" column.
        Exception: Any error raised while reading or sampling is logged and re-raised.
    """

    if not file_path.exists():
        raise FileNotFoundError(f"CSV file not found at: {file_path}")

    try:
        # First read with pandas for validation and preprocessing
        df = pd.read_csv(file_path)
        logger.info(f"Successfully loaded CSV with {len(df)} rows and {len(df.columns)} columns")

        if "label" not in df.columns:
            raise KeyError(f"Required column 'label' not found in {file_path}")

        # Split once so the boolean masks are not recomputed for counting and sampling
        spam_rows = df[df["label"] == 1]
        ham_rows = df[df["label"] == 0]

        def _draw(rows: pd.DataFrame) -> pd.DataFrame:
            # None means "no cap"; otherwise never ask for more rows than exist
            wanted = len(rows) if sample_size is None else min(sample_size, len(rows))
            if wanted <= 0:
                return pd.DataFrame()
            return rows.sample(int(wanted), random_state=random_state)

        return _draw(spam_rows), _draw(ham_rows)

    except Exception as e:
        logger.error(f"Error loading CSV file: {str(e)}")
        raise

def preprocess_email_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Preprocess email data for better indexing in LlamaIndex

    Missing values are replaced with empty strings and a ``combined_text``
    column is added that renders each row as a small email: From / To / Date /
    Subject header lines, a blank line, then the body.

    Args:
        df: DataFrame with raw email data. The columns "sender", "receiver",
            "date", "subject" and "body" are used; any of them that is absent
            is created as an all-empty column so the result always has them.
            A frame with zero rows is accepted.

    Returns:
        Preprocessed DataFrame (a new object; the input frame is not modified)
    """
    logger.info("Preprocessing email data...")

    # Make a copy to avoid modifying the original
    processed_df = df.copy()

    # Fill empty fields with empty strings to avoid NaN issues
    processed_df = processed_df.fillna("")

    # Guarantee every field used in the combined text exists, so datasets that
    # lack some header columns do not fail with a KeyError
    text_fields = ("sender", "receiver", "date", "subject", "body")
    for field in text_fields:
        if field not in processed_df.columns:
            processed_df[field] = ""

    # Build the combined text by zipping the columns rather than DataFrame.apply(axis=1):
    # apply returns a DataFrame (not a Series) for zero-row input, which cannot be
    # assigned to a single column.
    processed_df['combined_text'] = [
        (
            f"From: {sender}\n"
            f"To: {receiver}\n"
            f"Date: {date}\n"
            f"Subject: {subject}\n\n"
            f"{body}"
        )
        for sender, receiver, date, subject, body in zip(
            *(processed_df[field] for field in text_fields)
        )
    ]

    logger.info("Preprocessing complete")
    return processed_df

def create_documents_from_dataframe(df: pd.DataFrame) -> List[Document]:
    """
    Build one LlamaIndex Document per DataFrame row.

    The document text is the row's ``combined_text`` value. Metadata holds
    length-capped string versions of sender, receiver, date, subject and label
    (label defaults to an empty string when the column is absent), plus an
    ``email_id`` of the form ``email_<index label>`` which is also used as the
    document id.

    Args:
        df: Preprocessed DataFrame (must contain ``combined_text`` and the
            sender / receiver / date / subject columns)

    Returns:
        List of LlamaIndex Document objects, in row order
    """
    logger.info("Creating LlamaIndex documents from DataFrame...")

    def _to_document(row_label, record) -> Document:
        # The same identifier is stored in the metadata and used as the Document id
        doc_id = f"email_{row_label}"
        # Cap each field so oversized headers cannot bloat the metadata
        trimmed = {
            'sender': str(record['sender'])[:100],
            'receiver': str(record['receiver'])[:100],
            'date': str(record['date'])[:30],
            'subject': str(record['subject'])[:200],
            'label': str(record.get('label', ''))[:50],
            'email_id': doc_id,
        }
        return Document(text=record['combined_text'], metadata=trimmed, id_=doc_id)

    documents = [_to_document(row_label, record) for row_label, record in df.iterrows()]

    logger.info(f"Created {len(documents)} documents")
    return documents

def chunk_documents(documents: List[Document], chunk_size: int = 8192, chunk_overlap: int = 20) -> List[TextNode]:
    """
    Chunk documents into smaller nodes for better indexing

    NOTE: this function mutates its input. Each document's ``metadata`` is
    replaced in place by a trimmed dictionary holding only sender, date,
    subject and label (so e.g. ``receiver`` and ``email_id`` are dropped)
    before the documents are handed to the node parser.

    Args:
        documents: List of documents to chunk (their metadata is overwritten)
        chunk_size: Size of each chunk (increased to handle large metadata);
            passed straight to ``SimpleNodeParser.from_defaults``
        chunk_overlap: Overlap between chunks; passed straight to
            ``SimpleNodeParser.from_defaults``

    Returns:
        List of nodes produced by the parser (annotated as TextNode to match
        how callers in this notebook use the result — confirm against the
        installed LlamaIndex version)
    """
    logger.info(f"Chunking documents with size={chunk_size}, overlap={chunk_overlap}")

    # Maximum number of characters kept for each essential metadata field
    field_limits = {'sender': 100, 'date': 30, 'subject': 200, 'label': 50}

    for doc in documents:
        essential_metadata = {}
        for field, limit in field_limits.items():
            value = doc.metadata.get(field, '')
            # Coerce to text before slicing: metadata values are not guaranteed to be
            # strings, and slicing None or a number would raise TypeError
            essential_metadata[field] = ('' if value is None else str(value))[:limit]
        # Replace the original metadata with the trimmed version
        doc.metadata = essential_metadata

    parser = SimpleNodeParser.from_defaults(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )

    nodes = parser.get_nodes_from_documents(documents)
    logger.info(f"Created {len(nodes)} nodes from {len(documents)} documents")

    return nodes

def create_index_from_documents(nodes: List[TextNode]) -> VectorStoreIndex:
    """
    Wrap a list of nodes in a new VectorStoreIndex.

    The nodes are passed directly to the ``VectorStoreIndex`` constructor;
    progress is reported through the module logger before and after.

    Args:
        nodes: List of nodes to index

    Returns:
        Initialized VectorStoreIndex
    """
    logger.info("Creating vector index...")
    vector_index: VectorStoreIndex = VectorStoreIndex(nodes)
    logger.info("Vector index created successfully")
    return vector_index

In [3]:
# The project root is taken to be the parent of the current working directory;
# the datasets live in <root>/data/datasets.
main_dir: str = os.path.dirname(os.getcwd())
datasets_dir: str = os.path.join(main_dir, "data", "datasets")

# Raise explicitly rather than assert, so the check also runs under `python -O`
if not os.path.isdir(datasets_dir):
    raise FileNotFoundError(f"Datasets directory not found: {datasets_dir}")

# Keep only regular CSV files (skips sub-directories, checkpoints and other stray
# entries that pd.read_csv could not parse) and sort them, because directory
# listing order is not guaranteed.
dataset_paths: list[Path] = sorted(
    entry
    for entry in Path(datasets_dir).iterdir()
    if entry.is_file() and entry.suffix.lower() == ".csv"
)

dataset_paths

[WindowsPath('C:/Users/ignat/PycharmProjects/phishing_email_detector_sandbox/data/datasets/CEAS_08.csv'),
 WindowsPath('C:/Users/ignat/PycharmProjects/phishing_email_detector_sandbox/data/datasets/Ling.csv'),
 WindowsPath('C:/Users/ignat/PycharmProjects/phishing_email_detector_sandbox/data/datasets/Nazario.csv'),
 WindowsPath('C:/Users/ignat/PycharmProjects/phishing_email_detector_sandbox/data/datasets/Nazario_5.csv'),
 WindowsPath('C:/Users/ignat/PycharmProjects/phishing_email_detector_sandbox/data/datasets/Nigerian_5.csv'),
 WindowsPath('C:/Users/ignat/PycharmProjects/phishing_email_detector_sandbox/data/datasets/Nigerian_Fraud.csv'),
 WindowsPath('C:/Users/ignat/PycharmProjects/phishing_email_detector_sandbox/data/datasets/SpamAssasin.csv'),
 WindowsPath('C:/Users/ignat/PycharmProjects/phishing_email_detector_sandbox/data/datasets/TREC_07.csv')]

In [8]:
# Draw up to `sample_size` spam rows and `sample_size` ham rows from every dataset file
sample_size: int = 5
samples: list[tuple[DataFrame, DataFrame]] = []
for path in dataset_paths:
    samples.append(load_sample(path, sample_size=sample_size))

# Turn the list of (spam, ham) pairs into one tuple of spam frames and one of ham frames,
# then stack each class into a single frame with a fresh 0..n-1 index
spam_dfs, ham_dfs = zip(*samples)
spam_df: DataFrame = pd.concat(spam_dfs, ignore_index=True)
ham_df: DataFrame = pd.concat(ham_dfs, ignore_index=True)

# Fill missing values and add the `combined_text` column used for indexing
processed_spam = preprocess_email_data(df=spam_df)
processed_ham = preprocess_email_data(df=ham_df)

2025-04-22 14:35:42,863 - INFO - Successfully loaded CSV with 39154 rows and 7 columns
2025-04-22 14:35:42,991 - INFO - Successfully loaded CSV with 2859 rows and 3 columns
2025-04-22 14:35:43,164 - INFO - Successfully loaded CSV with 1565 rows and 7 columns
2025-04-22 14:35:43,330 - INFO - Successfully loaded CSV with 3065 rows and 7 columns
2025-04-22 14:35:43,597 - INFO - Successfully loaded CSV with 6331 rows and 7 columns
2025-04-22 14:35:43,751 - INFO - Successfully loaded CSV with 3332 rows and 7 columns
2025-04-22 14:35:43,945 - INFO - Successfully loaded CSV with 5809 rows and 7 columns
2025-04-22 14:35:45,499 - INFO - Successfully loaded CSV with 53757 rows and 7 columns
2025-04-22 14:35:45,543 - INFO - Preprocessing email data...
2025-04-22 14:35:45,547 - INFO - Preprocessing complete
2025-04-22 14:35:45,548 - INFO - Preprocessing email data...
2025-04-22 14:35:45,552 - INFO - Preprocessing complete


In [11]:
# Preview the first five preprocessed spam rows
processed_spam.head(n=5)

Unnamed: 0,sender,receiver,date,subject,body,label,urls,combined_text
0,Debra Puckett <Debra@tea-cegos.es>,user8.2-ext2@gvc.ceas-challenge.cc,"Wed, 06 Aug 2008 17:21:25 +0300",Luxuriate in beauty and style!,"If you buy watch in our store, nobody will bel...",1,1,From: Debra Puckett <Debra@tea-cegos.es>\nTo: ...
1,Czibolya <>,user7-ext3@gvc.ceas-challenge.cc,"Fri, 08 Aug 2008 06:05:09 -0400",Paris Hilton likes them big,A perfect score for our organ enhancement prod...,1,1,From: Czibolya <>\nTo: user7-ext3@gvc.ceas-cha...
2,kalvin tudor <long@baerenreiter.com>,user3@gvc.ceas-challenge.cc,"Wed, 06 Aug 2008 09:54:39 +0000",helpful for your health.,\nDear cbd19add6be5f8429c9de27201b0c756\n\nSum...,1,0,From: kalvin tudor <long@baerenreiter.com>\nTo...
3,Patty Crocker <Patty@gxis.de>,user8.2@gvc.ceas-challenge.cc,"Thu, 07 Aug 2008 13:03:01 +0300",Increase its length and stamina,Our researchers have made something that will ...,1,1,From: Patty Crocker <Patty@gxis.de>\nTo: user8...
4,Donna Rasmussen <Donna@anafi.it>,user8.2-ext1@gvc.ceas-challenge.cc,"Fri, 08 Aug 2008 13:31:35 +0700",She will dream of you every night,You can now easily bring your problem of small...,1,1,From: Donna Rasmussen <Donna@anafi.it>\nTo: us...


In [12]:
# Preview the first five preprocessed ham rows
processed_ham.head(n=5)

Unnamed: 0,sender,receiver,date,subject,body,label,urls,combined_text
0,Alexander Belopolsky <ynarxoxga.kwehevgyna@gma...,zvllln-eum@python.org,"Wed, 06 Aug 2008 12:32:11 +0000",Re: [Python-Dev] New/Old class exception pitfall,Oleg Broytmann phd.pp.ru> writes:\n\n> \n> On...,0,1,From: Alexander Belopolsky <ynarxoxga.kwehevgy...
1,Brian Evans <uxjqjubd@scent-team.com>,rljdsng-xyqoh@postfix.org,"Thu, 07 Aug 2008 21:27:26 -0500",Re: connection limit overrides,J.D. Bronson wrote:\n> I am at a loss here. I ...,0,1,From: Brian Evans <uxjqjubd@scent-team.com>\nT...
2,Joe Sloan <njk@tmsusa.com>,wkilxloc@opensuse.org,"Thu, 07 Aug 2008 00:46:09 -0800",Re: [opensuse] Netflix Queue Manager,Rick Friedman wrote:\n> Does anyone know of an...,0,0,From: Joe Sloan <njk@tmsusa.com>\nTo: wkilxloc...
3,Thomas Wouters <fgxflg@python.org>,Guido van Rossum <hoauf@python.org>,"Wed, 06 Aug 2008 02:03:00 -0700",Re: [Python-3000] Using *a for packing in list...,"On Sat, Mar 15, 2008 at 3:21 PM, Guido van Ros...",0,1,From: Thomas Wouters <fgxflg@python.org>\nTo: ...
4,Kevin Cosgrove <mwrlcg@doink.com>,Discussion list for EXMH users <ndqq-lmbvh@red...,"Wed, 06 Aug 2008 08:04:12 -0700",Re: replying to text/plain base64,"\nOn 17 October 2007 at 1:43, eorcsj.ciiddwmoy...",0,1,From: Kevin Cosgrove <mwrlcg@doink.com>\nTo: D...


In [13]:
# Wrap every preprocessed row in a LlamaIndex Document, keeping spam and ham separate
documents_spam: list[Document] = create_documents_from_dataframe(df=processed_spam)
documents_ham: list[Document] = create_documents_from_dataframe(df=processed_ham)

2025-04-22 14:40:46,809 - INFO - Creating LlamaIndex documents from DataFrame...
2025-04-22 14:40:46,817 - INFO - Created 40 documents
2025-04-22 14:40:46,818 - INFO - Creating LlamaIndex documents from DataFrame...
2025-04-22 14:40:46,824 - INFO - Created 30 documents


In [14]:
# Split the documents into nodes using the default chunk size and overlap.
# Note: chunk_documents also overwrites each document's metadata with a trimmed copy.
nodes_spam: list[TextNode] = chunk_documents(documents=documents_spam)
nodes_ham: list[TextNode] = chunk_documents(documents=documents_ham)

2025-04-22 14:41:04,057 - INFO - Chunking documents with size=8192, overlap=20
2025-04-22 14:41:04,415 - INFO - Created 40 nodes from 40 documents
2025-04-22 14:41:04,418 - INFO - Chunking documents with size=8192, overlap=20
2025-04-22 14:41:04,470 - INFO - Created 30 nodes from 30 documents


In [15]:
# Set a HuggingFace embedding model as the global LlamaIndex embedding model
embed_model: HuggingFaceEmbedding = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5",
)
Settings.embed_model = embed_model

# Build one vector index per class
index_spam: VectorStoreIndex = create_index_from_documents(nodes=nodes_spam)
index_ham: VectorStoreIndex = create_index_from_documents(nodes=nodes_ham)

2025-04-22 14:41:25,317 - INFO - Load pretrained SentenceTransformer: BAAI/bge-small-en-v1.5
2025-04-22 14:41:27,620 - INFO - 2 prompts are loaded, with the keys: ['query', 'text']
2025-04-22 14:41:27,625 - INFO - Creating vector index...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-04-22 14:41:35,083 - INFO - Vector index created successfully
2025-04-22 14:41:35,084 - INFO - Creating vector index...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-04-22 14:41:39,916 - INFO - Vector index created successfully


In [17]:
# Write both indexes to disk; the directories are relative to the current working directory
spam_index_dir = "../data/index_spam/"
ham_index_dir = "../data/index_ham/"
index_spam.storage_context.persist(persist_dir=spam_index_dir)
index_ham.storage_context.persist(persist_dir=ham_index_dir)