In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import re

In [19]:
df = pd.read_csv('sms+spam+collection/SMSSpamCollection', sep ='\t', header = None)

In [20]:
df

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [21]:
def preprocess_text(text):
    """
    Preprocess text data by:
    - Converting to lowercase
    - Removing special characters
    - Removing extra whitespace
    """
    # Convert to lowercase
    text = text.lower()
    # Remove special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [22]:
def prepare_data(df, text_column=1, label_column=0):
    """
    Prepare the data for modeling:
    - Preprocess text by calling preprocess_text function
    - Convert labels to binary
    - Split data into train/validation/test
    - Save splits to CSV files
    """
    # Create a copy of the dataframe
    df = df.copy()
    
    # Preprocess text
    df.iloc[:, text_column] = df.iloc[:, text_column].apply(preprocess_text)
    
    # Convert labels to binary (0 for ham, 1 for spam)
    df.iloc[:, label_column] = (df.iloc[:, label_column] == 'spam').astype(int)
    df.iloc[:, text_column] = df.iloc[:, text_column].fillna('')
        
    # Split into train+validation and test (80-20 split)
    train_val, test = train_test_split(df, test_size=0.2, random_state=42, stratify=df.iloc[:, label_column])
    
    # Split train+validation into train and validation (75-25 split of the 80%)
    train, validation = train_test_split(train_val, test_size=0.25, random_state=42, 
                                       stratify=train_val.iloc[:, label_column])
    
    # Save splits to CSV
    train.to_csv('train.csv', index=False)
    validation.to_csv('validation.csv', index=False)
    test.to_csv('test.csv', index=False)
    
    print(f"Data split sizes:")
    print(f"Train: {len(train)} samples")
    print(f"Validation: {len(validation)} samples")
    print(f"Test: {len(test)} samples")
    
    return train, validation, test

In [23]:
train, validation, test = prepare_data(df)

Data split sizes:
Train: 3342 samples
Validation: 1115 samples
Test: 1115 samples
