This notebook contains a set of transformations on the training set without the use of pipelines.

In [246]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Constants
data_file_path = "./data/home-data-for-ml-course/train.csv"
test_size = 0.2  # fraction of all rows held out as the test set
val_size = 0.2   # fraction of the remaining (non-test) rows held out for validation
random_state = 0

# Load data
df = pd.read_csv(data_file_path)

# Target and features
y = df.SalePrice

# Use every column except the target as a feature (numerical and categorical,
# with or without missing values). Index.drop keeps the file's column order, so
# the feature order is identical on every run; a set difference would yield an
# arbitrary order that changes between interpreter sessions.
features = list(df.columns.drop("SalePrice"))
X = df[features]

# Splitting: first carve out the test set, then split what is left into
# training and validation sets.
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=test_size, random_state=random_state
)
train_X, val_X, train_y, val_y = train_test_split(
    train_X, train_y, test_size=val_size, random_state=random_state
)

In [247]:
# Writing source code that applies the same several preprocessing steps to different
# datasets can quickly become messy.

# Suppose we want to do the following preprocessing on any one dataset:
# - Track missing values
# - Impute categorical features with most frequent
# - One-hot encode all categorical features with 10 or fewer unique values
# - Ordinal encode all other categorical features
# - Impute missing values of numerical features with mean
# - Standard-scale all numerical variables

# Define some utility functions
def columns_with_missing_values(df):
    """Return the labels of the columns of `df` that hold at least one missing value.

    The labels are returned as a list, in the same order as the columns of `df`.
    """
    has_missing = df.isnull().any()
    return [label for label, flagged in has_missing.items() if flagged]

In [248]:
# Get categorical and numerical column names.
# cat_cols is built by filtering the columns in order (not with a set difference)
# so that its order, and therefore the order of the encoded columns produced
# later, is the same on every run.
num_cols = list(train_X.select_dtypes(include=["number"]).columns)
num_col_set = set(num_cols)
cat_cols = [col for col in train_X.columns if col not in num_col_set]

# Keep an untouched copy of the training features for the sanity checks below
train_X_orig = train_X.copy()

# Track missing values: one 0/1 "<column>_missing" indicator per column that has
# missing values. The indicators are built as a single frame and concatenated
# once, which yields a new DataFrame instead of inserting columns one by one
# into the frame returned by train_test_split.
missing_cols = columns_with_missing_values(train_X)
missing_flags = train_X[missing_cols].isnull().astype(int).add_suffix("_missing")
train_X = pd.concat([train_X, missing_flags], axis=1)

In [249]:
# Preprocess numerical features

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Keep the fitted transformers in named variables: the means and scales learned
# on the training set are what must later be applied (with .transform) to the
# validation and test sets.
num_imputer = SimpleImputer()  # default strategy is "mean"
num_scaler = StandardScaler()

# Impute numerical columns with mean
num_feat = train_X[num_cols]
num_feat_imputed = pd.DataFrame(data=num_imputer.fit_transform(num_feat),
                                columns=num_feat.columns,
                                index=num_feat.index)

# Standard-scale the imputed numerical features
num_feat_scaled = pd.DataFrame(data=num_scaler.fit_transform(num_feat_imputed),
                               columns=num_feat_imputed.columns,
                               index=num_feat_imputed.index)

# Swap the raw numerical columns for their imputed and scaled versions
# (they end up after the remaining columns)
train_X = pd.concat([train_X.drop(labels=num_cols, axis=1), num_feat_scaled], axis=1)

In [250]:
# Preprocess categorical features

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Work on a private copy of the categorical columns. Every later step (unique
# counts, one-hot encoding, ordinal encoding) reads from this frame, so the
# imputed values are the ones that actually get encoded.
cat_feat = train_X[cat_cols].copy()

# Impute missing values with the most frequent value of each column
missing = list(cat_feat.columns[cat_feat.isnull().any()])
if missing:  # the imputer cannot be fitted on zero columns
    cat_imputer = SimpleImputer(strategy="most_frequent")
    cat_feat_imputed = pd.DataFrame(data=cat_imputer.fit_transform(cat_feat[missing]),
                                    columns=missing,
                                    index=cat_feat.index)
    cat_feat[missing] = cat_feat_imputed

# One-hot encode only categorical variables with 10 or fewer distinct values
unique_counts = cat_feat.nunique()
cols_to_ohe = list(unique_counts[unique_counts <= 10].index)
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
cat_feat_ohe = pd.DataFrame(data=ohe.fit_transform(cat_feat[cols_to_ohe]),
                            columns=ohe.get_feature_names_out(),
                            index=cat_feat.index)

# Ordinal encode the rest of the columns (kept in cat_cols order so the result
# is the same on every run)
ohe_col_set = set(cols_to_ohe)
cols_to_oe = [col for col in cat_cols if col not in ohe_col_set]
oe = OrdinalEncoder()
cat_feat_oe = pd.DataFrame(data=oe.fit_transform(cat_feat[cols_to_oe]),
                           columns=oe.get_feature_names_out(),
                           index=cat_feat.index)

# Replace the raw categorical columns with their encoded versions
train_X = pd.concat([train_X.drop(labels=cat_cols, axis=1), cat_feat_ohe, cat_feat_oe],
                    axis=1)


In [251]:
# Check for columns that are not in the transformed training set but that are in the original
cols_diff = [col for col in train_X_orig.columns if col not in train_X.columns]

# For every column not found in the transformed training set, check if there are columns
# in the transformed training set named "<column>_<something>". They could have been
# one-hot encoded. The trailing underscore keeps one column name from matching another
# that merely starts with it (e.g. "Heating" vs "HeatingQC_..."), and the
# "<column>_missing" indicator is excluded because it does not carry the column's values.
cols_not_found = []
for col in cols_diff:
    prefix = f"{col}_"
    indicator = f"{col}_missing"
    if not any(name.startswith(prefix) and name != indicator for name in train_X.columns):
        # No encoded column derived from this original column was found
        cols_not_found.append(col)
assert len(cols_not_found) == 0, f"Columns in the original training data not found. {cols_not_found}"

# All columns in the original training set are found

In [252]:
# Check that no column of the transformed training set still has missing values
still_missing = columns_with_missing_values(train_X)
assert len(still_missing) == 0, "Transformed training set still has missing columns"

# Check that every column with missing data in the original set has a matching
# "<column>_missing" column in the transformed training set
prob_cols = [
    col
    for col in columns_with_missing_values(train_X_orig)
    if f"{col}_missing" not in train_X.columns
]
assert len(prob_cols) == 0, f"Some columns with missing values were not tracked. {prob_cols}"

In [260]:
# Every categorical variable should have been encoded, so no non-numeric
# column may remain in the transformed training set
non_numeric_cols = train_X.select_dtypes(exclude=["number"]).columns
assert non_numeric_cols.empty, \
    "Transformed training set still has non-numeric types"

In [267]:
# Standard-scaled numerical features must each have a mean of (almost) zero
abs_means = train_X[num_cols].mean().abs()
assert (abs_means <= 1e-6).all(), \
    "Some numerical features do not have a mean close to 0"

In [269]:
# Now all this needs to be repeated again for the validation set.
# Pipelines are a better way of packaging these transformations so that they can be applied
# to multiple sets of data properly