In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression

In [2]:
# Fetch the Ames Housing regression dataset from OpenML as pandas objects.
housing = fetch_openml(name="house_prices", as_frame=True, parser='auto')
X_full, y = housing.data, housing.target

# Keep the numeric predictors only, and discard the two other numeric columns
# that contain missing values, so that 'LotFrontage' is the only incomplete
# feature left to study.
# NOTE(review): every numeric column survives this selection; if the frame
# carries a row identifier (e.g. an 'Id' column) it is used as a feature too —
# confirm that this is intended.
columns_with_other_gaps = ['MasVnrArea', 'GarageYrBlt']
X = (
    X_full
    .select_dtypes(include=np.number)
    .drop(columns=columns_with_other_gaps)
)

lot_frontage_missing = X['LotFrontage'].isna().sum()
print(f"Remaining missing values in 'LotFrontage': {lot_frontage_missing}")

Remaining missing values in 'LotFrontage': 259


In [10]:
# a. Regressors under comparison, keyed by the label shown in the results table.
#    The tree ensembles are seeded so repeated runs give the same scores.
models = dict([
    ("Linear Regression", LinearRegression()),
    ("Random Forest", RandomForestRegressor(random_state=42)),
    ("Gradient Boosting", GradientBoostingRegressor(random_state=42)),
])

# b. Preprocessing variants to compare.
#    Every variant median-imputes first and then applies exactly one
#    scaling/transforming step; only that second step differs between them.
transformation_pipelines = {
    label: Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        (step_name, transformer),
    ])
    for label, step_name, transformer in (
        ('Standard Scaler', 'scaler', StandardScaler()),
        ('MinMax Scaler', 'scaler', MinMaxScaler()),
        ('Power Transformer (Yeo-Johnson)', 'transformer',
         PowerTransformer(method='yeo-johnson')),
    )
}

# c. Scorers handed to cross_validate; the short keys become the suffixes of
#    the 'train_<key>' / 'test_<key>' entries in its result.
#    The error-based scorers are the negated ("neg_") variants, so callers
#    have to flip the sign to recover the actual error.
scoring_metrics = dict(
    r2='r2',
    mse='neg_mean_squared_error',
    rmse='neg_root_mean_squared_error',
)

# d. Five shuffled folds with a fixed seed, shared by every evaluation so all
#    model/preprocessing combinations are scored on identical splits.
cv_strategy = KFold(n_splits=5, random_state=42, shuffle=True)

In [11]:
def _summarize_cv_scores(cv_scores):
    """Average the per-fold arrays returned by ``cross_validate``.

    Parameters
    ----------
    cv_scores : dict
        Result of ``cross_validate`` run with ``return_train_score=True`` and
        a scoring mapping that contains the keys ``'r2'`` and ``'rmse'``.

    Returns
    -------
    dict
        Mean train/CV R2 and mean train/CV RMSE, keyed by the column names
        used in the results table.
    """
    return {
        'Train R2 Score': cv_scores['train_r2'].mean(),
        'CV R2 Score': cv_scores['test_r2'].mean(),
        # The RMSE scorer is the negated variant, so flip the sign back.
        'Train RMSE': -cv_scores['train_rmse'].mean(),
        'CV RMSE': -cv_scores['test_rmse'].mean(),
    }


def _evaluate(estimator, features):
    """Cross-validate ``estimator`` on ``features`` and return mean metrics.

    Uses the notebook-level ``y``, ``cv_strategy`` and ``scoring_metrics`` so
    that every candidate is scored on the same target, folds and metrics.
    """
    cv_scores = cross_validate(
        estimator, features, y, cv=cv_strategy,
        scoring=scoring_metrics, return_train_score=True
    )
    return _summarize_cv_scores(cv_scores)


# The "feature dropped" design matrix does not depend on the model, so build
# it once instead of once per loop iteration.
X_dropped = X.drop(columns=['LotFrontage'])

# One results frame per model; they are concatenated once after the loop
# rather than growing a DataFrame inside it (repeated concat copies all earlier
# rows each time, and concatenating onto an empty frame is deprecated in
# recent pandas).
per_model_frames = []

# --- Main Loop ---
for model_name, model in models.items():
    print(f"--- Evaluating Model: {model_name} ---")

    # Candidates in evaluation order: (estimator, feature matrix).
    candidates = {
        # a. 'LotFrontage' removed entirely; no imputation needed.
        'Feature Dropped': (model, X_dropped),
        # b. Median imputation only, no scaling/transforming.
        'Baseline (Imputation Only)': (
            Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('model', model)
            ]),
            X,
        ),
    }
    # c. Imputation followed by each scaling/transforming variant.
    for tech_name, preprocessor in transformation_pipelines.items():
        candidates[tech_name] = (
            Pipeline(steps=[
                ('preprocessor', preprocessor),
                ('model', model)
            ]),
            X,
        )

    model_results = {
        label: _evaluate(estimator, features)
        for label, (estimator, features) in candidates.items()
    }

    # d. One row per preprocessing technique, tagged with the model name.
    temp_df = pd.DataFrame.from_dict(model_results, orient='index')
    temp_df['Model'] = model_name
    per_model_frames.append(temp_df)

# e. Final processing for the results table.
#    The technique labels live in the index until reset_index() turns them
#    into a column (named 'index' by default), which is then renamed.
all_results = (
    pd.concat(per_model_frames)
    .reset_index()
    .rename(columns={'index': 'Preprocessing Technique'})
)
# Ratio of CV to train R2: values near 1 mean the CV score is close to the
# training score; lower values indicate a larger train/CV gap.
all_results['Generalization'] = all_results['CV R2 Score'] / all_results['Train R2 Score']
all_results = all_results.sort_values(by='CV R2 Score', ascending=False)

--- Evaluating Model: Linear Regression ---


  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)
  x = um.multiply(x, x, out=x)


--- Evaluating Model: Random Forest ---


  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)
  x = um.multiply(x, x, out=x)


--- Evaluating Model: Gradient Boosting ---


  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)
  x = um.multiply(x, x, out=x)


In [13]:
# Show floats with four decimal places in every DataFrame rendered from here on.
pd.options.display.float_format = '{:.4f}'.format

# Presentation order: identifiers first, then R2 (CV, train, their ratio),
# then RMSE (CV, train).
final_columns_order = [
    'Model', 'Preprocessing Technique',
    'CV R2 Score', 'Train R2 Score', 'Generalization',
    'CV RMSE', 'Train RMSE',
]
all_results = all_results.loc[:, final_columns_order]

all_results

Unnamed: 0,Model,Preprocessing Technique,CV R2 Score,Train R2 Score,Generalization,CV RMSE,Train RMSE
10,Gradient Boosting,Feature Dropped,0.843,0.9637,0.8747,29249.2788,15119.7029
5,Random Forest,Feature Dropped,0.8378,0.9794,0.8554,30310.1601,11350.4549
14,Gradient Boosting,Power Transformer (Yeo-Johnson),0.8368,0.9645,0.8676,29628.0641,14932.9035
6,Random Forest,Baseline (Imputation Only),0.8363,0.9793,0.854,30552.3105,11396.5658
8,Random Forest,MinMax Scaler,0.8362,0.9793,0.8539,30562.7564,11401.4999
7,Random Forest,Standard Scaler,0.8361,0.9792,0.8538,30577.0345,11409.8973
12,Gradient Boosting,Standard Scaler,0.8359,0.9638,0.8673,29744.1335,15083.531
13,Gradient Boosting,MinMax Scaler,0.8359,0.9638,0.8673,29746.2559,15083.531
11,Gradient Boosting,Baseline (Imputation Only),0.8359,0.9638,0.8673,29747.4698,15083.531
9,Random Forest,Power Transformer (Yeo-Johnson),0.8341,0.9796,0.8514,30556.3097,11304.7667
