In [1]:
import os
import pandas as pd
from pathlib import Path
import logging

# Configure root logging for the notebook: INFO and above, with a timestamp.
# force=True replaces any handlers already attached to the root logger;
# without it basicConfig() silently does nothing when a handler already
# exists (e.g. if the kernel or an imported library configured logging first),
# and the format/level below would be ignored.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)
# Logger for messages emitted directly by this notebook.
logger = logging.getLogger(__name__)

from llama_index.core import Document, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core.schema import TextNode

In [2]:
from rag_model.core.index import *
import config

In [3]:
# config.DATASETS_PATH is resolved relative to two directories above the
# working directory (presumably the project root — confirm if the notebook moves).
datasets_dir: Path = Path("../..") / config.DATASETS_PATH

# Fail fast with a clear message. is_dir() also rejects a regular file at that
# path, which would otherwise only surface later as an error while listing it.
assert datasets_dir.is_dir(), "Datasets directory not found"

# Keep CSV files only, so stray entries (sub-directories, hidden/OS files) are
# never handed to the loader, and sort them: directory listing order is
# arbitrary and would make the downstream sample/document order vary per run.
dataset_paths: list[Path] = sorted(
    entry
    for entry in datasets_dir.iterdir()
    if entry.is_file() and entry.suffix.lower() == ".csv"
)

dataset_paths

[WindowsPath('../../data/datasets/CEAS_08.csv'),
 WindowsPath('../../data/datasets/Ling.csv'),
 WindowsPath('../../data/datasets/Nazario.csv'),
 WindowsPath('../../data/datasets/Nazario_5.csv'),
 WindowsPath('../../data/datasets/Nigerian_5.csv'),
 WindowsPath('../../data/datasets/Nigerian_Fraud.csv'),
 WindowsPath('../../data/datasets/SpamAssasin.csv'),
 WindowsPath('../../data/datasets/TREC_07.csv')]

In [4]:
# Forwarded to load_sample as sample_size for every dataset file.
# NOTE(review): if load_sample samples randomly, results differ between runs —
# confirm whether it accepts a seed.
sample_size: int = 100

# One (spam, ham) pair of frames per dataset, in dataset_paths order.
samples: list[tuple[pd.DataFrame, pd.DataFrame]] = [
    load_sample(path, sample_size=sample_size)
    for path in dataset_paths
]

# Transpose the list of pairs into a tuple of spam frames and a tuple of ham frames.
spam_dfs, ham_dfs = zip(*samples)

# Stack each class into a single frame with a fresh 0..n-1 index.
spam_df: pd.DataFrame
ham_df: pd.DataFrame
spam_df, ham_df = (
    pd.concat(frames, ignore_index=True) for frames in (spam_dfs, ham_dfs)
)

# Spam is preprocessed first, then ham.
processed_spam, processed_ham = map(preprocess_email_data, (spam_df, ham_df))

2025-04-26 13:23:11,801 - INFO - Successfully loaded CSV with 39154 rows and 7 columns
2025-04-26 13:23:12,070 - INFO - Successfully loaded CSV with 2859 rows and 3 columns
2025-04-26 13:23:12,231 - INFO - Successfully loaded CSV with 1565 rows and 7 columns
2025-04-26 13:23:12,396 - INFO - Successfully loaded CSV with 3065 rows and 7 columns
2025-04-26 13:23:12,667 - INFO - Successfully loaded CSV with 6331 rows and 7 columns
2025-04-26 13:23:12,822 - INFO - Successfully loaded CSV with 3332 rows and 7 columns
2025-04-26 13:23:13,030 - INFO - Successfully loaded CSV with 5809 rows and 7 columns
2025-04-26 13:23:14,556 - INFO - Successfully loaded CSV with 53757 rows and 7 columns
2025-04-26 13:23:14,600 - INFO - Preprocessing email data...
2025-04-26 13:23:14,621 - INFO - Preprocessing complete
2025-04-26 13:23:14,622 - INFO - Preprocessing email data...
2025-04-26 13:23:14,639 - INFO - Preprocessing complete


In [5]:
# Preview the first five preprocessed spam rows as a sanity check.
processed_spam.head(n=5)

Unnamed: 0,sender,receiver,date,subject,body,label,urls,combined_text
0,Connie Kinney <Connie@hkusua.hku.hk>,user2.11@gvc.ceas-challenge.cc,"Thu, 07 Aug 2008 00:58:38 -2200",More massive love luger,The best way to please your lassie like a real...,1,1,From: Connie Kinney <Connie@hkusua.hku.hk>\nTo...
1,Sylvester Malone <SalvatoreobsoleteMalone@hilt...,user2.10@gvc.ceas-challenge.cc,"Thu, 07 Aug 2008 13:33:22 +0500",No Pumps! No Surgery! No Exercises!,\nMany surveys have shown that ladies prefer t...,1,1,From: Sylvester Malone <SalvatoreobsoleteMalon...
2,Cyrus Haley <Cyrus@woodwardps.net>,user7-ext4@gvc.ceas-challenge.cc,"Wed, 06 Aug 2008 02:27:13 +0200",What will lead to your super satisfaction?,Taking this remedy for a few months will preve...,1,1,From: Cyrus Haley <Cyrus@woodwardps.net>\nTo: ...
3,Scottie Mercer <RandellcolloidalSykes@stuff4re...,user2.4@gvc.ceas-challenge.cc,"Thu, 07 Aug 2008 08:01:17 -0300",Christmas Replica Watches,\nWe offer a free gift box with every VIP watc...,1,1,From: Scottie Mercer <RandellcolloidalSykes@st...
4,CNN Alerts <martac-hcelbhco@networkingcolorado...,catchall@gvc.ceas-challenge.cc,"Fri, 08 Aug 2008 07:50:02 -0400",CNN Alerts: My Custom Alert,\n\nCNN Alerts: My Custom Alert\n\n\n\n\n\n\n ...,1,0,From: CNN Alerts <martac-hcelbhco@networkingco...


In [6]:
# Preview the first five preprocessed ham rows as a sanity check.
processed_ham.head(n=5)

Unnamed: 0,sender,receiver,date,subject,body,label,urls,combined_text
0,dan_markx <hkh_wavda@yahoo.com>,bxcbhgwfhe-gfonxob@yahoogroups.com,"Fri, 08 Aug 2008 06:41:04 +0000",[domainkeys-interop] Re: Dk php implementation...,"--- In bxcbhgwfhe-gfonxob@yahoogroups.com, ""Ja...",0,0,From: dan_markx <hkh_wavda@yahoo.com>\nTo: bxc...
1,Klaas-Jan Stol <parrotcode@gmail.com>,Jonathan Worthington <jonathan@jnthn.net>,"Thu, 07 Aug 2008 09:57:49 +0100",Re: [perl #57636] [TODO][PDD19] Document the r...,"On Wed, Aug 6, 2008 at 8:30 PM, Jonathan Worth...",0,0,From: Klaas-Jan Stol <parrotcode@gmail.com>\nT...
2,Jan Peters <cggc@jan-peters.net>,,"Fri, 08 Aug 2008 06:18:56 +0200",[UAI] NIPS 2007 WORKSHOP: Robotics Challenges ...,*** Apologies for Multipl...,0,1,From: Jan Peters <cggc@jan-peters.net>\nTo: \n...
3,Rahul Garg <qldb3@ualberta.ca>,"pxgdk-tmoguskofw@scipy.org, zvllln-eum@python.org","Thu, 07 Aug 2008 03:26:01 -0600",[Python-Dev] New project : Spyke python-to-C c...,Note this message has been posted to numpy-dis...,0,1,From: Rahul Garg <qldb3@ualberta.ca>\nTo: pxgd...
4,Neal Norwitz <vapkwvjq@gmail.com>,Lennart Regebro <hyiffbi@gmail.com>,"Wed, 06 Aug 2008 19:03:02 -0700",Re: [Python-3000] Python 3.0 Porting Strategies,"On Thu, Mar 27, 2008 at 3:52 AM, Lennart Regeb...",0,1,From: Neal Norwitz <vapkwvjq@gmail.com>\nTo: L...


In [7]:
# Convert each preprocessed frame into LlamaIndex documents; spam first, then ham.
documents_spam: list[Document]
documents_ham: list[Document]
documents_spam, documents_ham = map(
    create_documents_from_dataframe, (processed_spam, processed_ham)
)

2025-04-26 13:25:18,896 - INFO - Creating LlamaIndex documents from DataFrame...
2025-04-26 13:25:18,991 - INFO - Created 800 documents
2025-04-26 13:25:18,991 - INFO - Creating LlamaIndex documents from DataFrame...
2025-04-26 13:25:19,060 - INFO - Created 600 documents


In [8]:
# Split each class's documents into text nodes; spam first, then ham.
nodes_spam: list[TextNode]
nodes_ham: list[TextNode]
nodes_spam, nodes_ham = (
    chunk_documents(docs) for docs in (documents_spam, documents_ham)
)

2025-04-26 13:25:20,727 - INFO - Chunking documents with size=8192, overlap=20
2025-04-26 13:25:22,520 - INFO - Created 817 nodes from 800 documents
2025-04-26 13:25:22,521 - INFO - Chunking documents with size=8192, overlap=20
2025-04-26 13:25:23,459 - INFO - Created 611 nodes from 600 documents


In [9]:
# Load the BAAI/bge-small-en-v1.5 embedding model and assign it to the
# LlamaIndex Settings object before any index is built.
# NOTE(review): presumably create_index_from_documents embeds with
# Settings.embed_model — confirm in rag_model.core.index.
embed_model: HuggingFaceEmbedding = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
Settings.embed_model = embed_model

# Build one vector index per class from the chunked nodes (text nodes are
# passed here despite the helper's "from_documents" name). This is the slow
# step: the recorded run of this cell was stopped with a KeyboardInterrupt.
# The two builds are separate statements, so index_spam stays bound if the
# ham build is interrupted.
index_spam: VectorStoreIndex= create_index_from_documents(nodes_spam)
index_ham: VectorStoreIndex= create_index_from_documents(nodes_ham)

2025-04-26 13:25:28,609 - INFO - Load pretrained SentenceTransformer: BAAI/bge-small-en-v1.5
2025-04-26 13:25:31,001 - INFO - 2 prompts are loaded, with the keys: ['query', 'text']
2025-04-26 13:25:31,004 - INFO - Creating vector index...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [13]:
# Persist each index's storage context to its configured directory, resolved
# relative to two directories above the working directory. Spam is written
# first, then ham.
# NOTE(review): both indexes must have been fully built in this kernel session;
# the recorded index-building cell ended with a KeyboardInterrupt.
spam_index_dir: Path = Path("../..") / config.INDEX_SPAM_PATH
index_spam.storage_context.persist(persist_dir=spam_index_dir)

ham_index_dir: Path = Path("../..") / config.INDEX_HAM_PATH
index_ham.storage_context.persist(persist_dir=ham_index_dir)