In [4]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Configuration: dataset location, hold-out fractions, and the seed shared by both splits
data_file_path = "./data/home-data-for-ml-course/train.csv"
test_size = 0.2
val_size = 0.2
random_state = 0

# Read the training CSV into a DataFrame
df = pd.read_csv(data_file_path)

# Prediction target
y = df["SalePrice"]

# Numeric columns without missing values
features_no_missing = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]
# ...followed by the numeric columns that do contain missing values
features = [*features_no_missing, 'LotFrontage', 'MasVnrArea', 'GarageYrBlt']
X = df[features]

# Two-stage split: hold out the test set first, then carve the validation set
# out of what remains (so val_size is a fraction of the remainder, not of the full data).
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=test_size, random_state=random_state
)
train_X, val_X, train_y, val_y = train_test_split(
    train_X, train_y, test_size=val_size, random_state=random_state
)

In [44]:
# Suppose we want to create a Transformer with our own custom logic.
# Below is a transformer that imputes with the mean and creates additional columns that
# track whether an instance has missing values.
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.impute import SimpleImputer
class TrackingImputer(BaseEstimator, TransformerMixin):
    """Mean-impute missing values and add ``<column>_missing`` indicator columns.

    ``fit`` learns, from the data it is given, which columns contain missing
    values and the per-column means used for imputation. ``transform`` then
    applies that learned state, so validation/test data is imputed with the
    training means and always receives the same set of indicator columns as
    the training data.

    Attributes set by ``fit``:
        missing_columns_: list of column names that had missing values.
        imputer_: the fitted ``SimpleImputer`` (default strategy, i.e. mean).
    """
    def __init__(self):
        pass
    @staticmethod
    def columns_with_missing_values(df):
        """Get list of columns with missing values"""
        missing_value_counts = df.isnull().sum()
        return list(missing_value_counts[missing_value_counts > 0].index)
    @staticmethod
    def _learn(X):
        """Return (columns of X with missing values, SimpleImputer fitted on X)."""
        return TrackingImputer.columns_with_missing_values(X), SimpleImputer().fit(X)
    def fit(self, X, y=None):
        """Learn the columns to track and the imputation statistics from ``X``.

        Parameters:
            X: DataFrame of numeric features.
            y: ignored; accepted for scikit-learn API compatibility.

        Returns:
            self
        """
        self.missing_columns_, self.imputer_ = TrackingImputer._learn(X)
        return self
    def transform(self, X):
        """Return an imputed copy of ``X`` with missing-value indicator columns appended.

        Parameters:
            X: DataFrame with the same columns that were passed to ``fit``.

        Returns:
            A new all-float DataFrame holding the imputed original columns
            followed by one ``<column>_missing`` column (1.0 where the value
            was null, else 0.0) per tracked column. The row index of ``X`` is
            preserved so the result stays aligned with the target. ``X``
            itself is not modified.
        """
        if hasattr(self, "imputer_"):
            missing_columns, imputer = self.missing_columns_, self.imputer_
        else:
            # Backward compatibility: an unfitted instance keeps the previous
            # stateless behavior and derives everything from X itself,
            # without storing any state on the instance.
            missing_columns, imputer = TrackingImputer._learn(X)
        imputed = pd.DataFrame(imputer.transform(X), columns=X.columns, index=X.index)
        for c in missing_columns:
            # Indicators are read from the un-imputed input; cast to float so
            # the whole frame has a single numeric dtype.
            imputed[c + "_missing"] = X[c].isnull().astype(float)
        return imputed

# Demonstrate the imputer on the training split: report the shape and the
# columns with missing values before and after the transformation.
print(f"Shape before transformation: {train_X.shape}")
print(f"Columns with missing values before transformation: {TrackingImputer.columns_with_missing_values(train_X)}")
ti = TrackingImputer()
# fit_transform (inherited from TransformerMixin) runs fit() and then transform() on the same data
imputed_train_X = ti.fit_transform(train_X)
print(f"Shape after transformation: {imputed_train_X.shape}")
print(f"Columns with missing values after transformation: {TrackingImputer.columns_with_missing_values(imputed_train_X)}")

Shape before transformation: (934, 10)
Columns with missing values before transformation: ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']
Shape after transformation: (934, 13)
Columns with missing values after transformation: []
