In [4]:
import pandas as pd

# Adjust the file path to where the CSV is stored locally
df = pd.read_csv("spam_and_ham_classification.csv")

print(df.shape)            # e.g., (some_number_of_rows, 3)
print(df.columns)          # Expecting columns such as ['label', 'text', ...]
print(df.head(5))          # Preview the first few rows
print(df["label"].value_counts())  # Confirm it’s balanced: equal counts of 'spam' and 'ham'


(9989, 2)
Index(['label', 'text'], dtype='object')
  label                                               text
0   ham  into the kingdom of god and those that are ent...
1  spam  there was flow at hpl meter 1505 on april firs...
2   ham  take a look at this one campaign for bvyhprice...
3  spam  somu wrote actually thats what i was looking f...
4  spam  fathi boudra wrote i fixed the issue in the sv...
label
ham     5294
spam    4695
Name: count, dtype: int64


In [5]:
import re

def clean_text(text):
    # 1. Convert to lowercase
    text = text.lower()
    # 2. Remove URLs
    text = re.sub(r"http\S+", "", text)
    # 3. Remove non-alphanumeric characters (keep spaces)
    text = re.sub(r"[^a-z0-9\s]", " ", text)
    # 4. Collapse multiple spaces into one
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Apply the cleaning function to the entire DataFrame
df["clean_text"] = df["text"].apply(clean_text)

# Quick check of the new column
df[["clean_text", "label"]].head()


Unnamed: 0,clean_text,label
0,into the kingdom of god and those that are ent...,ham
1,there was flow at hpl meter 1505 on april firs...,spam
2,take a look at this one campaign for bvyhprice...,ham
3,somu wrote actually thats what i was looking f...,spam
4,fathi boudra wrote i fixed the issue in the sv...,spam


In [6]:
from sklearn.model_selection import train_test_split

# First split: 80% train, 20% temp (val + test)
train_df, temp_df = train_test_split(
    df,
    test_size=0.20,
    stratify=df["label"],
    random_state=13
)

# Second split: 10% validation, 10% test (each 50% of temp_df)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,
    stratify=temp_df["label"],
    random_state=13
)

# Confirm shapes and label distributions
print("Train shape:", train_df.shape)
print(train_df["label"].value_counts(normalize=True))
print("Validation shape:", val_df.shape)
print(val_df["label"].value_counts(normalize=True))
print("Test shape:", test_df.shape)
print(test_df["label"].value_counts(normalize=True))


Train shape: (7991, 3)
label
ham     0.529971
spam    0.470029
Name: proportion, dtype: float64
Validation shape: (999, 3)
label
ham     0.52953
spam    0.47047
Name: proportion, dtype: float64
Test shape: (999, 3)
label
ham     0.530531
spam    0.469469
Name: proportion, dtype: float64


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    min_df=3,
    max_df=0.85,
    ngram_range=(1, 2)
)

# Fit on the training set, then transform all three
X_train = vectorizer.fit_transform(train_df["clean_text"])
X_val   = vectorizer.transform(val_df["clean_text"])
X_test  = vectorizer.transform(test_df["clean_text"])

# Encode labels as binary (ham=0, spam=1)
y_train = train_df["label"].map({"ham": 0, "spam": 1}).values
y_val   = val_df["label"].map({"ham": 0, "spam": 1}).values
y_test  = test_df["label"].map({"ham": 0, "spam": 1}).values

# Print shapes to confirm
print("X_train shape:", X_train.shape)
print("X_val shape:  ", X_val.shape)
print("X_test shape: ", X_test.shape)
print("y_train length:", len(y_train))
print("y_val length:  ", len(y_val))
print("y_test length: ", len(y_test))


X_train shape: (7991, 105579)
X_val shape:   (999, 105579)
X_test shape:  (999, 105579)
y_train length: 7991
y_val length:   999
y_test length:  999
