In [413]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Constants
data_file_path = "./data/train.csv"
test_size = 0.2   # fraction of all rows held out as the test set
val_size = 0.2    # fraction of the remaining (non-test) rows held out for validation
random_state = 0

# Load data
df = pd.read_csv(data_file_path)

# Target and features
target = "SalePrice"
y = df[target]

# Every column except the target is a feature (numeric and non-numeric alike).
# Iterating over df.columns keeps the file's column order; a set difference would
# yield an arbitrary order that can change between kernel sessions (string hashing
# is randomized per process), which makes downstream results hard to reproduce.
features = [col for col in df.columns if col != target]
X = df[features]

# Splitting: first carve out the test set, then split what is left into
# training and validation sets.
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=test_size, random_state=random_state)
train_X, val_X, train_y, val_y = train_test_split(train_X, train_y, test_size=val_size, random_state=random_state)

In [414]:
# Writing source code that applies the same several preprocessing steps to different
# datasets can quickly become messy.

# Suppose we want to do the following preprocessing on any one dataset:
# - Track missing values
# - Impute categorical features with most frequent
# - One-hot encode all categorical features with 10 or fewer unique values
# - Ordinal encode all other categorical features
# - Impute missing values of numerical features with mean
# - Standard-scale all numerical variables

# Here are some custom transformers.
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn import set_config

# Make scikit-learn transformers return pandas DataFrames, so that column
# names are available to later steps that select columns by name.
set_config(transform_output="pandas")

In [415]:
class TrackingImputer(BaseEstimator, TransformerMixin):
    """
    Add a 0/1 "<column>_missing" indicator column for every feature that
    contained at least one missing value in the data passed to ``fit``.

    Despite its name this transformer does not impute anything: the original
    columns are returned untouched, so missing values still have to be filled
    by a later step.

    Attributes
    ----------
    columns_ : list
        Labels of the columns that had missing values during ``fit``. The
        trailing underscore follows the scikit-learn convention for learned
        attributes.
    columns : list
        Alias of ``columns_``, kept for backward compatibility.
    """

    @staticmethod
    def columns_with_missing_values(df):
        """Return the labels of the columns of ``df`` that contain missing values."""
        has_missing = df.isnull().any()
        return list(has_missing[has_missing].index)

    def fit(self, X, y=None):
        """
        Record which columns of ``X`` contain missing values.

        ``y`` is ignored; it is accepted only for scikit-learn API compatibility.
        Returns ``self``.
        """
        self.columns_ = self.columns_with_missing_values(X)
        # Backward-compatible alias for code that reads the old attribute name
        self.columns = self.columns_
        return self

    def transform(self, X):
        """
        Return a copy of ``X`` with one extra integer column per tracked feature.

        The new column is named "<column>_missing" and holds 1 where the
        feature is missing and 0 otherwise. Only the columns recorded by
        ``fit`` are tracked. The input frame is not modified.
        """
        X = X.copy()
        for c in self.columns_:
            # f-string instead of `c + "_missing"` so non-string labels also work
            X[f"{c}_missing"] = X[c].isnull().astype(int)
        return X

In [416]:
def get_non_numeric_columns(df):
    """Return, as a list, the labels of the columns of ``df`` whose dtype is not numeric."""
    non_numeric_part = df.select_dtypes(exclude=["number"])
    return non_numeric_part.columns.tolist()

def get_low_cardinality_columns(df, max_unique=10):
    """
    Return the non-numeric columns of ``df`` with at most ``max_unique`` unique values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    max_unique : int, default 10
        Largest number of unique values a column may have and still count as
        low cardinality.

    Returns
    -------
    list
        Column labels, in the order they appear in ``df``.

    Notes
    -----
    Unique values are counted with ``DataFrame.nunique()`` using its defaults,
    so missing values are not counted as a separate value.
    """
    cols = get_non_numeric_columns(df)
    unique_counts = df[cols].nunique()
    return list(unique_counts[unique_counts <= max_unique].index)

In [417]:
# Get categorical and numerical columns.
# Both lists follow train_X's column order. Building cat_cols from a set
# difference would give an arbitrary order that can change between kernel
# sessions, so the existing helper is used instead.
num_cols = list(train_X.select_dtypes(include=["number"]).columns)
cat_cols = get_non_numeric_columns(train_X)

In [418]:
# Define transformations for categorical features
low_card_cols = get_low_cardinality_columns(train_X)

# Categorical columns that are not low cardinality get ordinal-encoded.
# The list is built in train_X's column order (rather than from a bare set
# difference) so that the order of the encoded columns is the same on every run.
high_card_set = set(cat_cols) - set(low_card_cols)
cols_to_oe = [col for col in train_X.columns if col in high_card_set]

# 1. fill missing categories with the most frequent value of each column
# 2. one-hot encode the low-cardinality columns and ordinal-encode the rest;
#    categories not seen during fit become all-zero one-hot rows / the code -1
cat_pipeline = Pipeline(steps=[
    ("impute_most_frequent", SimpleImputer(strategy="most_frequent")),
    ("encode", ColumnTransformer(
        transformers=[
            ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False), low_card_cols),
            ("ordinal_encoder", OrdinalEncoder(dtype=np.int64,
                                               handle_unknown="use_encoded_value",
                                               unknown_value=-1), cols_to_oe),
        ],
        remainder="passthrough",
        verbose_feature_names_out=False,
    )),
])

In [419]:
# Numerical features: fill missing values with the column mean, then standardize
num_pipeline = Pipeline([
    ("impute_mean", SimpleImputer(strategy="mean")),
    ("standard_scale", StandardScaler()),
])

In [421]:
# Assemble preprocessing steps:
# 1. add "<column>_missing" indicator columns
# 2. send categorical and numerical columns through their own pipelines; every
#    other column (such as the indicators added in step 1) passes through unchanged
preprocessing = Pipeline(steps=[
    ("track_missing", TrackingImputer()),
    ("numerical_categorical_transformer", ColumnTransformer(
        transformers=[
            ("transform_categorical", cat_pipeline, cat_cols),
            ("transform_numerical", num_pipeline, num_cols),
        ],
        remainder="passthrough",
        verbose_feature_names_out=False,
    )),
])

# Assemble the final pipeline (it is fitted in a later cell).
# The model reuses the shared random_state constant so that its seed stays in
# sync with the one used for the train/validation/test splits.
mdl = Pipeline(steps=[
    ("preprocessing", preprocessing),
    ("model", RandomForestRegressor(random_state=random_state)),
])

In [424]:
# Fit on the training split, then report the mean absolute error on the validation split
mdl.fit(train_X, train_y)
val_predictions = mdl.predict(val_X)
val_mae = mean_absolute_error(val_y, val_predictions)
print(val_mae)

17223.706111111114


In [None]:
# Verify the transformation

# Transform the training data with the fitted preprocessing step of `mdl`.
# This requires the cell that calls mdl.fit(...) to have been run first.
train_X_transformed = mdl.named_steps["preprocessing"].transform(train_X)

# Compare shape of original vs transformed
print("Original training shape = " + str(train_X.shape))
print("Transformed training shape = " + str(train_X_transformed.shape))

# Check that there are no missing values
missing = train_X_transformed.isnull().any()
cols_with_missing = missing[missing].index
assert len(cols_with_missing) == 0, f"Some columns have missing values: {cols_with_missing}"

# Check that there are now no categorical types
cols = train_X_transformed.select_dtypes(exclude=["number"]).columns
assert len(cols) == 0, f"Some columns are not properly encoded: {cols}"

# Check that there are _missing columns
idx = train_X_transformed.columns.str.endswith("_missing")
assert idx.any(), "Track missing columns _missing not found"

# Check that the original low cardinality columns are now missing using the intersection of sets
assert len(set(train_X_transformed.columns) & set(low_card_cols)) == 0, \
    "Some low cardinality columns are still found in the final set. They should have been one-hot encoded"

# Collect the one-hot columns generated for each low cardinality column.
# One-hot columns are named "<column>_<category>", so match on "<column>_" to avoid
# picking up a different column that merely shares a prefix, and skip the
# "<column>_missing" indicator so it cannot satisfy the check on its own.
list_of_lists = [
    [name for name in train_X_transformed.columns
     if name.startswith(col + "_") and not name.endswith("_missing")]
    for col in low_card_cols
]

# Check that every low cardinality column produced at least one one-hot column
assert all(len(ls) > 0 for ls in list_of_lists), \
    "Some low cardinality columns were not one-hot encoded properly"

# Check that all these columns are either 0 or 1
low_card_cols_transformed = [name for ls in list_of_lists for name in ls]
assert train_X_transformed[low_card_cols_transformed].isin([0, 1]).all().all(), \
    "Some a one-hot encoded low cardinality columns have a value different from 0 or 1"

# Check that all ordinal-encoded features are ints
assert (train_X_transformed[cols_to_oe].dtypes == np.int64).all(), \
    "Some ordinal encoded columns are not ints"

# Check that all ordinal-encoded features have more than 1 unique value
assert (train_X_transformed[cols_to_oe].nunique() > 1).all(), \
    "Some ordinal encoded columns have fewer than 2 unique values"

# Check that all numerical columns have mean of about 0
assert train_X_transformed[num_cols].mean().between(-1e-6, 1e-6).all(), \
    "Some numerical features were not mean-centered"