In [1]:
import logging
import os
from pathlib import Path
from typing import List, Dict, Any, Optional

import numpy as np
import pandas as pd
from pandas import DataFrame
# NOTE(review): sample_iter_lambdify is not referenced in the visible cells —
# presumably an accidental auto-import; confirm before removing.
from sympy.stats.rv import sample_iter_lambdify

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# For LlamaIndex
from llama_index.core import Document, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.schema import MetadataMode, TextNode
from llama_index.readers.file import CSVReader

In [2]:
def load_sample(
    file_path: Path,
    sample_size: Optional[int] = 5,
    random_state: Optional[int] = None,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Load a labelled email CSV and draw a random sample from each class.

    Rows whose ``label`` column equals 1 are returned as the spam sample and
    rows whose ``label`` equals 0 as the ham sample.

    Args:
        file_path: Path to the CSV file
        sample_size: Maximum number of rows to draw per class. If a class has
            fewer rows, all of them are drawn. ``None`` draws every row of
            each class (in random order). Zero or negative values yield
            empty samples.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            draw can be reproduced. ``None`` keeps pandas' default behaviour.

    Returns:
        Tuple of (spam_sample, ham_sample) DataFrames. A class with nothing
        to draw is returned as an empty, column-less DataFrame.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        KeyError: If the CSV has no ``label`` column.
    """

    if not file_path.exists():
        raise FileNotFoundError(f"CSV file not found at: {file_path}")

    try:
        df = pd.read_csv(file_path)
        logger.info(f"Successfully loaded CSV with {len(df)} rows and {len(df.columns)} columns")

        # Fail with an explicit message instead of the bare KeyError('label')
        if "label" not in df.columns:
            raise KeyError(f"CSV file {file_path} has no 'label' column")

        def _draw(label_value: int) -> pd.DataFrame:
            """Sample up to `sample_size` rows whose label equals `label_value`."""
            subset = df[df["label"] == label_value]
            available = len(subset)
            # None means "take everything"; otherwise cap at what is available
            n = available if sample_size is None else min(sample_size, available)
            if n <= 0:
                return pd.DataFrame()
            return subset.sample(n, random_state=random_state)

        # Spam is drawn before ham, matching the original draw order
        return _draw(1), _draw(0)

    except Exception as e:
        # Log for the notebook reader, then let the caller see the original error
        logger.error(f"Error loading CSV file: {str(e)}")
        raise

def preprocess_email_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add a ``combined_text`` column that renders each email as header lines plus body.

    The input frame is not modified. Missing values are replaced with empty
    strings, and the combined text has the form::

        From: <sender>
        To: <receiver>
        Date: <date>
        Subject: <subject>

        <body>

    A column that is absent from ``df`` is rendered as an empty string, and
    an empty frame yields an empty ``combined_text`` column.

    Args:
        df: DataFrame with raw email data

    Returns:
        Copy of ``df`` with NaNs filled and a ``combined_text`` column added
    """
    logger.info("Preprocessing email data...")

    # Work on a copy so the caller's frame is left untouched; blank out NaNs
    processed_df = df.copy().fillna("")

    def _column_as_text(name: str) -> pd.Series:
        """Return column `name` as strings, or all-empty strings if it is absent."""
        if name in processed_df.columns:
            return processed_df[name].astype(str)
        return pd.Series("", index=processed_df.index, dtype=object)

    # Vectorized concatenation: unlike row-wise apply(axis=1) it also works on
    # an empty frame, where apply would return a DataFrame instead of a Series
    processed_df["combined_text"] = (
        "From: " + _column_as_text("sender")
        + "\nTo: " + _column_as_text("receiver")
        + "\nDate: " + _column_as_text("date")
        + "\nSubject: " + _column_as_text("subject")
        + "\n\n" + _column_as_text("body")
    )

    logger.info("Preprocessing complete")
    return processed_df

def create_documents_from_dataframe(df: pd.DataFrame) -> List[Document]:
    """
    Build one LlamaIndex Document per DataFrame row.

    Each document's text is the row's ``combined_text`` value. Its metadata
    holds length-capped string versions of ``sender``, ``receiver``, ``date``,
    ``subject`` and ``label`` plus an ``email_id`` of the form
    ``email_<row index>``, which is also used as the document id.

    Args:
        df: Preprocessed DataFrame (must contain ``combined_text``)

    Returns:
        List of LlamaIndex Document objects, in row order
    """
    logger.info("Creating LlamaIndex documents from DataFrame...")

    def _row_to_document(row_label, record) -> Document:
        """Convert a single (index label, row) pair into a Document."""
        email_id = f"email_{row_label}"
        # Keep metadata small: every value is stringified and truncated
        trimmed_fields = {
            'sender': str(record['sender'])[:100],
            'receiver': str(record['receiver'])[:100],
            'date': str(record['date'])[:30],
            'subject': str(record['subject'])[:200],
            'label': str(record.get('label', ''))[:50],
            'email_id': email_id
        }
        return Document(text=record['combined_text'], metadata=trimmed_fields, id_=email_id)

    documents = [_row_to_document(row_label, record) for row_label, record in df.iterrows()]

    logger.info(f"Created {len(documents)} documents")
    return documents

def chunk_documents(documents: List[Document], chunk_size: int = 8192, chunk_overlap: int = 20) -> List[TextNode]:
    """
    Split documents into nodes using SimpleNodeParser.

    Before parsing, each document's metadata is reduced to the ``sender``,
    ``date``, ``subject`` and ``label`` keys, with every value stringified and
    truncated. The reduction is applied to shallow copies, so the documents
    passed in by the caller keep their full metadata.

    Args:
        documents: List of documents to chunk
        chunk_size: Chunk size passed to the node parser
        chunk_overlap: Overlap between chunks passed to the node parser

    Returns:
        List of nodes produced by the parser (annotated as TextNode to match
        how the calling cells and create_index_from_documents type them)
    """
    import copy  # local import: only this function needs it

    logger.info(f"Chunking documents with size={chunk_size}, overlap={chunk_overlap}")

    # Metadata keys to keep and the maximum length allowed for each value
    max_lengths = {'sender': 100, 'date': 30, 'subject': 200, 'label': 50}

    def _as_capped_text(value, limit: int) -> str:
        """Stringify a metadata value (None -> '') and truncate it to `limit`."""
        return "" if value is None else str(value)[:limit]

    trimmed_documents = []
    for doc in documents:
        # Shallow copy: text and id are shared, only metadata is replaced,
        # so the caller's document is never mutated
        trimmed = copy.copy(doc)
        trimmed.metadata = {
            key: _as_capped_text(doc.metadata.get(key, ''), limit)
            for key, limit in max_lengths.items()
        }
        trimmed_documents.append(trimmed)

    parser = SimpleNodeParser.from_defaults(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )

    nodes = parser.get_nodes_from_documents(trimmed_documents)
    logger.info(f"Created {len(nodes)} nodes from {len(documents)} documents")

    return nodes

def create_index_from_documents(nodes: List[TextNode]) -> VectorStoreIndex:
    """
    Build a VectorStoreIndex over the given nodes, logging before and after.

    Args:
        nodes: Nodes handed to the VectorStoreIndex constructor

    Returns:
        The newly constructed VectorStoreIndex
    """
    logger.info("Creating vector index...")
    built_index = VectorStoreIndex(nodes)
    logger.info("Vector index created successfully")
    return built_index

In [3]:
from pathlib import Path
from data_paths import DATASETS_PATH

# Datasets directory, resolved two levels above the current working directory
datasets_dir: Path = Path("../..") / DATASETS_PATH

assert datasets_dir.is_dir(), "Datasets directory not found"

# Keep only CSV files and sort them: os.listdir would also return non-CSV
# entries (which load_sample cannot read) and guarantees no particular order
dataset_paths: list[Path] = sorted(
    entry
    for entry in datasets_dir.iterdir()
    if entry.is_file() and entry.suffix.lower() == ".csv"
)

dataset_paths

[WindowsPath('../../data/CEAS_08.csv'),
 WindowsPath('../../data/Ling.csv'),
 WindowsPath('../../data/Nazario.csv'),
 WindowsPath('../../data/Nazario_5.csv'),
 WindowsPath('../../data/Nigerian_5.csv'),
 WindowsPath('../../data/Nigerian_Fraud.csv'),
 WindowsPath('../../data/SpamAssasin.csv'),
 WindowsPath('../../data/TREC_07.csv')]

In [10]:
sample_size: int = 100  # maximum rows drawn per class from each dataset
SAMPLE_SEED: int = 42

# load_sample draws rows with DataFrame.sample; when no random_state is given
# pandas uses NumPy's global RNG, so seeding it here makes the drawn rows
# repeatable across Restart & Run All
np.random.seed(SAMPLE_SEED)

samples: list[tuple[DataFrame, DataFrame]] = [load_sample(path, sample_size=sample_size) for path in dataset_paths]

# zip(*[]) would fail below with an opaque unpacking error
assert samples, "No datasets were loaded - dataset_paths is empty"

# Regroup per-dataset (spam, ham) pairs into one spam frame and one ham frame
spam_dfs, ham_dfs = zip(*samples)
spam_df: DataFrame = pd.concat(spam_dfs, ignore_index=True)
ham_df: DataFrame = pd.concat(ham_dfs, ignore_index=True)

processed_spam  = preprocess_email_data(spam_df)
processed_ham  = preprocess_email_data(ham_df)

2025-04-26 12:28:28,176 - INFO - Successfully loaded CSV with 39154 rows and 7 columns
2025-04-26 12:28:28,475 - INFO - Successfully loaded CSV with 2859 rows and 3 columns
2025-04-26 12:28:28,642 - INFO - Successfully loaded CSV with 1565 rows and 7 columns
2025-04-26 12:28:28,815 - INFO - Successfully loaded CSV with 3065 rows and 7 columns
2025-04-26 12:28:29,107 - INFO - Successfully loaded CSV with 6331 rows and 7 columns
2025-04-26 12:28:29,253 - INFO - Successfully loaded CSV with 3332 rows and 7 columns
2025-04-26 12:28:29,455 - INFO - Successfully loaded CSV with 5809 rows and 7 columns
2025-04-26 12:28:31,000 - INFO - Successfully loaded CSV with 53757 rows and 7 columns
2025-04-26 12:28:31,042 - INFO - Preprocessing email data...
2025-04-26 12:28:31,062 - INFO - Preprocessing complete
2025-04-26 12:28:31,062 - INFO - Preprocessing email data...
2025-04-26 12:28:31,082 - INFO - Preprocessing complete


In [11]:
# Preview the first five rows of the preprocessed spam sample
processed_spam.head(n=5)

Unnamed: 0,sender,receiver,date,subject,body,label,urls,combined_text
0,CNN Alerts <sumatra11960@quirant.com>,email622@gvc.ceas-challenge.cc,"Fri, 08 Aug 2008 05:20:09 -0400",CNN Alerts: My Custom Alert,\n\nCNN Alerts: My Custom Alert\n\n\n\n\n\n\n ...,1,0,From: CNN Alerts <sumatra11960@quirant.com>\nT...
1,perron <>,user8.2@gvc.ceas-challenge.cc,"Wed, 06 Aug 2008 08:53:03 +0200",Karma sutra techniques for G-spots,\nGrow longer and harder with our all natural ...,1,1,From: perron <>\nTo: user8.2@gvc.ceas-challeng...
2,Mariana Sampson <fsbsgoixpsbf@boyslieband.com>,user7-ext2@gvc.ceas-challenge.cc,"Fri, 08 Aug 2008 02:31:44 -0600",From Mariana Sampson,\n\n\n\n\n\n\nInst tao ant Online Payday Loans...,1,1,From: Mariana Sampson <fsbsgoixpsbf@boyslieban...
3,CNN Alerts <rivmeced@thefloridayachtclub.org>,user8.3@gvc.ceas-challenge.cc,"Fri, 08 Aug 2008 08:34:20 -0400",CNN Alerts: My Custom Alert,\n\n\nCNN Alerts: My Custom Alert\n\n\n\n\n\n\...,1,0,From: CNN Alerts <rivmeced@thefloridayachtclub...
4,Tommy Coffman <moroccox4@vdaalst.com>,user7-ext3@gvc.ceas-challenge.cc,"Thu, 07 Aug 2008 10:12:13 -0300",ToAllCountriesThankYouForOurCustomers,\nCanadianInternationalPharmacyFriendlySupport...,1,1,From: Tommy Coffman <moroccox4@vdaalst.com>\nT...


In [12]:
# Preview the first five rows of the preprocessed ham sample
processed_ham.head(n=5)

Unnamed: 0,sender,receiver,date,subject,body,label,urls,combined_text
0,jl@apache.org,niuxjhu@spamassassin.apache.org,"Wed, 06 Aug 2008 00:05:55 +0000",svn commit: r581582 - /spamassassin/trunk/rule...,Author: jm\nDate: Wed Oct 3 05:33:08 2007\nNe...,0,1,From: jl@apache.org\nTo: niuxjhu@spamassassin....
1,"""ICG Campbell, Engineering Mathematics"" <c.dzm...","rgkwpdahmhmail@cs.cmu.edu, rgrmu-dlqu@stat.cmu...","Thu, 07 Aug 2008 17:38:16 +0000",[UAI] Lectureship/Readership Available: Bristo...,Lecturership or Readership available in the:\n...,0,1,"From: ""ICG Campbell, Engineering Mathematics"" ..."
2,vvrpgs@porcupine.org,don johnson <httlfleq@gmail.com>,"Thu, 07 Aug 2008 06:47:17 -0500",Re: queue file write error for some domains,don johnson: > Feb 19 22:09:23 mail postfix/sm...,0,0,From: vvrpgs@porcupine.org\nTo: don johnson <h...
3,Jeff Hodsdon <adyvurq@gmail.com>,Leah Culver <pvyk.khuxqi@gmail.com>,"Wed, 06 Aug 2008 03:48:38 -0700",[PownceAPI] Re: OAuth Bug in Pownce API,"Ah, figured that is what is up!(expect it was ...",0,1,From: Jeff Hodsdon <adyvurq@gmail.com>\nTo: Le...
4,gabi dinu <nu@prutul.ro>,rljdsng-xyqoh@postfix.org,"Fri, 08 Aug 2008 09:51:03 +0200",Problem with forwarding email,this is my configuration:\n\n/etc/postfix/main...,0,0,From: gabi dinu <nu@prutul.ro>\nTo: rljdsng-xy...


In [13]:
# One Document per preprocessed row, built separately for spam and ham
documents_spam: list[Document] = create_documents_from_dataframe(df=processed_spam)
documents_ham: list[Document] = create_documents_from_dataframe(df=processed_ham)

2025-04-26 12:28:42,561 - INFO - Creating LlamaIndex documents from DataFrame...
2025-04-26 12:28:42,650 - INFO - Created 800 documents
2025-04-26 12:28:42,654 - INFO - Creating LlamaIndex documents from DataFrame...
2025-04-26 12:28:42,925 - INFO - Created 600 documents


In [14]:
# Chunk each document set into nodes using chunk_documents' default sizes
nodes_spam: list[TextNode] = chunk_documents(documents=documents_spam)
nodes_ham: list[TextNode] = chunk_documents(documents=documents_ham)

2025-04-26 12:28:44,918 - INFO - Chunking documents with size=8192, overlap=20
2025-04-26 12:28:46,051 - INFO - Created 802 nodes from 800 documents
2025-04-26 12:28:46,052 - INFO - Chunking documents with size=8192, overlap=20
2025-04-26 12:28:46,729 - INFO - Created 605 nodes from 600 documents


In [15]:
# Load the HuggingFace embedding model and register it on the global Settings
embed_model: HuggingFaceEmbedding = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5",
)
Settings.embed_model = embed_model

# Build one vector index per class; kept as separate statements so the spam
# index stays bound even if building the ham index fails
index_spam: VectorStoreIndex = create_index_from_documents(nodes=nodes_spam)
index_ham: VectorStoreIndex = create_index_from_documents(nodes=nodes_ham)

2025-04-26 12:28:49,253 - INFO - Load pretrained SentenceTransformer: BAAI/bge-small-en-v1.5
2025-04-26 12:28:51,042 - INFO - 2 prompts are loaded, with the keys: ['query', 'text']
2025-04-26 12:28:51,045 - INFO - Creating vector index...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-04-26 12:30:56,209 - INFO - Vector index created successfully
2025-04-26 12:30:56,212 - INFO - Creating vector index...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-04-26 12:32:42,443 - INFO - Vector index created successfully


In [16]:



# NOTE(review): INDEX_SPAM_PATH / INDEX_HAM_PATH are not defined in any visible
# cell, so this cell raises NameError on a fresh kernel. They are presumably
# exported by data_paths alongside DATASETS_PATH — confirm the module defines them.
from data_paths import INDEX_SPAM_PATH, INDEX_HAM_PATH

# Persist each index's storage context to its own directory
index_spam.storage_context.persist(persist_dir=Path("../..") / INDEX_SPAM_PATH)
index_ham.storage_context.persist(persist_dir=Path("../..") / INDEX_HAM_PATH)