In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split

In [2]:
def load_data(file_path):
    """
    Loads SMS Spam dataset from a file path.
    """
    df = pd.read_csv(
        file_path,
        sep="\t",
        header=None,
        names=["label", "text"]
    )
    return df

In [3]:
def preprocess_text(text):
    """
    Basic text cleaning.
    """
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text


In [4]:
def preprocess_data(df):
    df = df.copy()
    df["text"] = df["text"].apply(preprocess_text)
    df["label"] = df["label"].map({"ham": 0, "spam": 1})
    return df

In [5]:
def split_data(df, train_size=0.7, val_size=0.15, test_size=0.15, random_state=42):
    assert train_size + val_size + test_size == 1.0
    
    train_df, temp_df = train_test_split(
        df,
        train_size=train_size,
        stratify=df["label"],
        random_state=random_state
    )

    val_df, test_df = train_test_split(
        temp_df,
        train_size=val_size / (val_size + test_size),
        stratify=temp_df["label"],
        random_state=random_state
    )

    return train_df, val_df, test_df


In [6]:
def save_splits(train_df, val_df, test_df):
    train_df.to_csv("train.csv", index=False)
    val_df.to_csv("validation.csv", index=False)
    test_df.to_csv("test.csv", index=False)


In [7]:
df = load_data("data/SMSSpamCollection")
df = preprocess_data(df)

train_df, val_df, test_df = split_data(df)
save_splits(train_df, val_df, test_df)

print("Train:", train_df.shape)
print("Validation:", val_df.shape)
print("Test:", test_df.shape)


Train: (3900, 2)
Validation: (836, 2)
Test: (836, 2)
