### Feature Engineering Best Practices: Handling Text Data
**Question**: Load a dataset with text data (e.g., SMS Spam Collection), perform text
preprocessing, and extract numerical features using TF-IDF.

In [1]:
# write your code from here
# Ques_7.ipynb - Updated with granular error handling and inline tests

import pandas as pd
import string
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.exceptions import NotFittedError

def preprocess_text(text):
    """Normalise a single text message before vectorisation.

    The steps run in this order: lowercase, delete every character in
    ``string.punctuation``, delete digit runs, collapse each run of
    whitespace to a single space, and trim both ends.

    Args:
        text: The raw message. Any value that is not a ``str`` (``None``,
            numbers, ...) is treated as a missing message.

    Returns:
        str: The cleaned message, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        # Missing or malformed cells become an empty document instead of raising.
        return ""
    text = text.lower()
    # Punctuation is deleted, not replaced by a space: "don't" -> "dont".
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    # Deleting tokens that stood between words leaves gaps such as
    # "call  now"; squeeze all whitespace runs (including tabs and newlines)
    # so the cleaned text is canonical.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def load_sms_spam_data(url):
    """Read the tab-separated SMS spam corpus into a two-column DataFrame.

    The source is parsed as a header-less TSV and its two fields are named
    ``label`` and ``message``. Rows containing any null value are dropped
    after a warning is printed.

    Args:
        url: Anything ``pandas.read_csv`` accepts as its first argument
            (URL, local path, or file-like object).

    Returns:
        pandas.DataFrame: Columns ``label`` and ``message`` with no nulls.

    Raises:
        RuntimeError: If ``pandas.read_csv`` fails for any reason. The
            underlying exception is attached as ``__cause__``.
    """
    try:
        df = pd.read_csv(url, sep='\t', header=None, names=['label', 'message'])
    except Exception as e:
        # Chain explicitly so the traceback shows the pandas/network error as
        # the direct cause and callers can inspect it via ``__cause__``.
        raise RuntimeError(f"Failed to load dataset from URL: {e}") from e

    # Check for null values
    if df.isnull().values.any():
        print("Warning: Null values found. Dropping...")
        # ``df`` was created inside this function, so mutating it in place
        # cannot affect any caller-owned object.
        df.dropna(inplace=True)

    return df

def prepare_data(df, test_size=0.2, random_state=42):
    """Clean the messages, encode the labels and split into train/test sets.

    Note:
        ``df`` is modified in place: a ``clean_text`` column (the result of
        ``preprocess_text`` on ``message``) and a ``label_encoded`` column
        (integer codes from ``LabelEncoder``) are added to it.

    Args:
        df: DataFrame with a ``message`` column and a ``label`` column.
        test_size: Passed to ``train_test_split``. Defaults to ``0.2``.
        random_state: Seed passed to ``train_test_split`` so the split is
            reproducible. Defaults to ``42``.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)`` where the ``X`` parts
        are splits of ``df['clean_text']`` and the ``y`` parts are the
        matching splits of ``df['label_encoded']``.
    """
    df['clean_text'] = df['message'].apply(preprocess_text)

    # Encode labels as integers. LabelEncoder numbers classes in sorted
    # order, so for the labels 'ham' and 'spam' this gives ham=0, spam=1.
    label_encoder = LabelEncoder()
    df['label_encoded'] = label_encoder.fit_transform(df['label'])

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        df['clean_text'],
        df['label_encoded'],
        test_size=test_size,
        random_state=random_state,
    )

    return X_train, X_test, y_train, y_test

def tfidf_vectorize(X_train, X_test, stop_words='english', max_df=0.9, min_df=5):
    """Fit a TF-IDF vectoriser on the training text and transform both splits.

    The vectoriser is fitted on ``X_train`` only, and ``X_test`` is projected
    onto that fitted vocabulary.

    Args:
        X_train: Iterable of training documents (strings).
        X_test: Iterable of test documents (strings).
        stop_words: Forwarded to ``TfidfVectorizer``. Defaults to
            ``'english'``.
        max_df: Forwarded to ``TfidfVectorizer``. Defaults to ``0.9``.
        min_df: Forwarded to ``TfidfVectorizer``. Defaults to ``5``.
            NOTE(review): on a very small corpus this threshold may prune
            every term; lower it for toy inputs.

    Returns:
        tuple: ``(X_train_vec, X_test_vec, tfidf)``, the two transformed
        matrices followed by the fitted ``TfidfVectorizer``.

    Raises:
        RuntimeError: If scikit-learn reports ``NotFittedError`` while
            transforming. The original error is attached as ``__cause__``.
    """
    tfidf = TfidfVectorizer(stop_words=stop_words, max_df=max_df, min_df=min_df)

    try:
        X_train_vec = tfidf.fit_transform(X_train)
        X_test_vec = tfidf.transform(X_test)
    except NotFittedError as err:
        raise RuntimeError("TF-IDF Vectorizer not fitted properly.") from err

    return X_train_vec, X_test_vec, tfidf

# ================== RUN PIPELINE ==================
# Runs the steps end to end: load -> clean/encode/split -> TF-IDF.
# Every name assigned here is module-level notebook state; the inline tests
# further down read ``df_sms``.

# Location of the tab-separated corpus handed to load_sms_spam_data.
URL = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"

# Load and prepare
# prepare_data also adds 'clean_text' and 'label_encoded' columns to df_sms.
df_sms = load_sms_spam_data(URL)
X_train, X_test, y_train, y_test = prepare_data(df_sms)

# Vectorize
# The third return value is the fitted vectoriser, kept for later reuse.
X_train_tfidf, X_test_tfidf, tfidf_model = tfidf_vectorize(X_train, X_test)

# Output TF-IDF shape info
# Prints the .shape of each transformed matrix.
print("TF-IDF training features shape:", X_train_tfidf.shape)
print("TF-IDF test features shape:", X_test_tfidf.shape)


# ================== INLINE TESTS ==================

# preprocess_text checks, one row per case:
# (raw input, expected cleaned text, message shown if the check fails)
preprocess_cases = [
    ("Hello, WORLD! 123", "hello world", "Basic cleaning failed"),
    ("  Spaces  \n", "spaces", "Whitespace trimming failed"),
    (None, "", "None input handling failed"),
    (12345, "", "Non-string input handling failed"),
]
for raw_value, expected_clean, failure_note in preprocess_cases:
    assert preprocess_text(raw_value) == expected_clean, failure_note

# The loaded frame must contain exactly the two known classes, and the
# encoded column must contain exactly the two integer codes.
observed_labels = set(df_sms['label'].unique())
assert observed_labels == {'ham', 'spam'}, "Unexpected labels"

observed_codes = set(df_sms['label_encoded'].unique())
assert observed_codes == {0, 1}, "Label encoding failed"

print("All inline tests passed!")

TF-IDF training features shape: (4457, 1247)
TF-IDF test features shape: (1115, 1247)
All inline tests passed!
