In [35]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

In [36]:
# Import data set
df = pd.read_csv('../data/artDataset_preprocessed.csv')

# Let's transform the data set into a numpy array
data_array = df.to_numpy()

# Predictors
X = data_array[:,1:]

# Target
y = data_array[:,0]

# **1. Baseline Model: Mean Predictor**

Let's apply a linear regression model with no features, i.e. it computes the mean of y on the training data, and use this value to predict y on the test data

In [37]:
# Implement a baseline model (mean predictor)
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_absolute_error, make_scorer

# Set up K-Fold Cross-Validation
k_folds = 5
cv_strategy = KFold(n_splits=k_folds, shuffle=True, random_state=17)

# Initialize the baseline model (predicts the mean of y on the training set and uses it to predict y on the test set)
baseline_model = DummyRegressor(strategy='mean')

# Perform cross-validation on the baseline model
baseline_cv_scores_r2 = cross_val_score(baseline_model, X, y, cv=cv_strategy, scoring='r2')
baseline_cv_scores_neg_mse = cross_val_score(baseline_model, X, y, cv=cv_strategy, scoring='neg_mean_squared_error')

# Convert negative MSE to positive MSE and then to RMSE
baseline_cv_scores_mse = -baseline_cv_scores_neg_mse
baseline_cv_scores_rmse = np.sqrt(baseline_cv_scores_mse)

# Define MAE scorer
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
baseline_cv_scores_mae = -cross_val_score(baseline_model, X, y, cv=cv_strategy, scoring=mae_scorer)

# Print baseline cross-validation results
print("\nBaseline Model (Mean Predictor) Cross-Validation Results:\n")
print(f"R² scores for each fold: {np.round(baseline_cv_scores_r2,4)}")
print(f"Mean R²: {baseline_cv_scores_r2.mean():.4f} (±{baseline_cv_scores_r2.std():.4f})\n")
print(f"RMSE for each fold: {np.round(baseline_cv_scores_rmse,2)}")
print(f"Mean RMSE: {baseline_cv_scores_rmse.mean():.4f} (±{baseline_cv_scores_rmse.std():.4f})\n")
print(f"MAE for each fold: {np.round(baseline_cv_scores_mae,2)}")
print(f"Mean MAE: {baseline_cv_scores_mae.mean():.4f} (±{baseline_cv_scores_mae.std():.4f})")



Baseline Model (Mean Predictor) Cross-Validation Results:

R² scores for each fold: [-0.0033 -0.001  -0.0008 -0.0035 -0.0001]
Mean R²: -0.0017 (±0.0014)

RMSE for each fold: [14133.18 15068.01 11191.55  9548.71 13583.5 ]
Mean RMSE: 12704.9895 (±2032.1394)

MAE for each fold: [7348.57 6851.1  6573.82 6568.15 6568.87]
Mean MAE: 6782.1019 (±303.4041)


We observe that this models r2 is lightly below zero, indicating the mean‐only predictor performs worse than using the sample mean of the full dataset

## **2. Baseline Model: OLS**

Both the scaled and non-scaled models should have the same (similar) performance.

### Non-scaled

In [38]:
# Implement OLS baseline model
from sklearn.linear_model import LinearRegression
import numpy as np

# Initialize the OLS model
ols_model = LinearRegression()

# Use the same K-Fold Cross-Validation
k_folds = 5
cv_strategy = KFold(n_splits=k_folds, shuffle=True, random_state=17)

# Perform cross-validation on the OLS model
ols_cv_scores_r2 = cross_val_score(ols_model, X, y, cv=cv_strategy, scoring='r2')
ols_cv_scores_neg_mse = cross_val_score(ols_model, X, y, cv=cv_strategy, scoring='neg_mean_squared_error')

# Convert negative MSE to positive MSE and then to RMSE
ols_cv_scores_mse = -ols_cv_scores_neg_mse
ols_cv_scores_rmse = np.sqrt(ols_cv_scores_mse)

# Define MAE scorer
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
ols_cv_scores_mae = -cross_val_score(ols_model, X, y, cv=cv_strategy, scoring=mae_scorer)

# Print OLS cross-validation results
print("\nOLS Model Cross-Validation Results:\n")
print(f"R² scores for each fold: {np.round(ols_cv_scores_r2,4)}")
print(f"Mean R²: {ols_cv_scores_r2.mean():.4f} (±{ols_cv_scores_r2.std():.4f})\n")
print(f"RMSE for each fold: {np.round(ols_cv_scores_rmse,2)}")
print(f"Mean RMSE: {ols_cv_scores_rmse.mean():.4f} (±{ols_cv_scores_rmse.std():.4f})\n")
print(f"MAE for each fold: {np.round(ols_cv_scores_mae,2)}")
print(f"Mean MAE: {ols_cv_scores_mae.mean():.4f} (±{ols_cv_scores_mae.std():.4f})")


OLS Model Cross-Validation Results:

R² scores for each fold: [ 0.1311  0.0906  0.1352 -0.4952  0.0076]
Mean R²: -0.0261 (±0.2390)

RMSE for each fold: [13152.69 14361.98 10403.11 11655.57 13531.56]
Mean RMSE: 12620.9816 (±1413.7850)

MAE for each fold: [6234.76 5612.13 5576.47 6764.39 5876.08]
Mean MAE: 6012.7639 (±443.5692)


### Scaled

At first, I obtained wildly different (even astronomically large) R² and RMSE when I threw scaling into the mix. In theory, an ordinary least‐squares fit is invariant to affine rescaling of the features: if we scale every column by a constant and then un‐scale the coefficients, predictions end up exactly the same, so the R²/MAE/RMSE shouldn’t change.

When we get huge negatives like –1e14 for R² or RMSE on the order of 1e11, it almost always means that your pipeline has become numerically unstable.

Solution: get rid of the features with null variance or establish a threshold

In [39]:
# 5-fold setup
kf = KFold(n_splits=5, shuffle=True, random_state=14)

from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ("var_thresh", VarianceThreshold()),         # removes features with σ=0
    ("scaler",    StandardScaler()),
    ("ols",       LinearRegression())
])

# CV on R² and RMSE
r2_scores      = cross_val_score(pipeline, X, y, cv=kf, scoring="r2")
neg_mse_scores = cross_val_score(pipeline, X, y, cv=kf, scoring="neg_mean_squared_error")
rmse_scores    = np.sqrt(-neg_mse_scores)

print("R² per fold:      ", r2_scores)
print("Mean R²:           ", f"{r2_scores.mean():.4f} ± {r2_scores.std():.4f}")
print("RMSE per fold:     ", rmse_scores)
print("Mean RMSE:         ", f"{rmse_scores.mean():.2f} ± {rmse_scores.std():.2f}")

R² per fold:       [ 0.05015095  0.1186294   0.09714017  0.06892772 -0.27228127]
Mean R²:            0.0125 ± 0.1443
RMSE per fold:      [10824.93609151 13944.87828144 10676.41969689 16841.46513121
  7321.58466251]
Mean RMSE:          11921.86 ± 3231.47
