In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

In [3]:
# Import data set
df = pd.read_csv('../data/artDataset_preprocessed.csv')

# Let's transform the data set into a numpy array
data_array = df.to_numpy()

# Predictors
X = data_array[:,1:]

# Target
y = data_array[:,0]

## **KNN Regression**

In [9]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold

from sklearn.metrics import make_scorer, mean_absolute_error, mean_absolute_percentage_error

In [None]:
# 1. Define consistent CV strategy
cv_strategy = KFold(n_splits=5, shuffle=True, random_state=17)

# 2. Define pipeline and parameter grid
    # Range of k values to test
k_values = np.arange(1, 30, 2)  # odd numbers from 1 to 30 neighbors
# Test different weighting schemes
weight_options = ['uniform', 'distance']
# Different metrics
metric_options = ['euclidean', 'manhattan', 'chebyshev']

param_knn = {
    'knn__n_neighbors': k_values,
    'knn__weights': weight_options,
    'knn__metric': metric_options
}

# Pipeline for GridSearchCV
knn_pipeline = Pipeline([
    ("var_thresh", VarianceThreshold()),  # Remove low-variance features
    ("scaler", StandardScaler()),         # Standardize features
    ("knn", KNeighborsRegressor())        # KNN Regressor
])

# 3. GridSearchCV (find the best parameters) with established CV strategy
knn_cv = GridSearchCV(
    estimator=knn_pipeline,
    param_grid=param_knn,
    scoring='neg_mean_squared_error',
    cv=cv_strategy
)
knn_cv.fit(X, y)

# Best parameters
best_k = knn_cv.best_params_['knn__n_neighbors']
best_weight = knn_cv.best_params_['knn__weights']
best_metric = knn_cv.best_params_['knn__metric']

# Generalization error (RMSE) from GridSearchCV
best_rmse_knn = np.sqrt(-knn_cv.best_score_)

print(f"Best n_neighbors: {best_k} (weights='{best_weight}', metric='{best_metric}')")
print(f"RMSE (CV): {best_rmse_knn:.4f}")

# 4. Evaluate best model separately using cross_val_score with same CV strategy
best_knn_model = knn_cv.best_estimator_

# Alternatively rebuild explicitly:
best_knn_model = Pipeline([
    ("var_thresh", VarianceThreshold()),
    ("scaler", StandardScaler()),
    ("knn", KNeighborsRegressor(n_neighbors=best_k, weights=best_weight))
])

# Obtain the MSE for each fold and compute averages
mse_folds = -cross_val_score(
    best_knn_model, X, y,
    cv=cv_strategy,
    scoring='neg_mean_squared_error'
)
mean_mse_knn = mse_folds.mean()

rmse_folds = np.sqrt(mse_folds)
mean_rmse_knn = np.sqrt(mean_mse_knn)

# Define MAE scorer
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

# Cross-validated MAE (negated back to positive)
mae_scores = -cross_val_score(
    best_knn_model, X, y,
    cv=cv_strategy,
    scoring=mae_scorer
)
mean_mae_knn = mae_scores.mean()

# Print detailed results
print("\nKNN Regression Cross-Validation Results:\n")

print(f"RMSE for each fold: {np.round(rmse_folds, 2)}")
print(f"Mean RMSE: {mean_rmse_knn:.4f} (±{rmse_folds.std():.4f})\n")

print(f"MSE for each fold: {np.round(mse_folds, 2)}")
print(f"Mean MSE: {mean_mse_knn:.4f} (±{mse_folds.std():.4f})\n")

print(f"MAE for each fold: {np.round(mae_scores, 2)}")
print(f"Mean MAE: {mean_mae_knn:.4f} (±{mae_scores.std():.4f})")

Best n_neighbors: 23 (weights='uniform', metric='manhattan')
RMSE (CV): 11888.6821

KNN Regression Cross-Validation Results:

RMSE for each fold: [12549.9  13864.6   9477.36  9505.09 13712.51]
Mean RMSE: 11982.7141 (±1956.6050)

MSE for each fold: [1.57500078e+08 1.92227213e+08 8.98203000e+07 9.03467240e+07
 1.88032875e+08]
Mean MSE: 143585437.8250 (±45299591.2056)

MAE for each fold: [5382.08 5131.74 4897.81 5666.3  5690.24]
Mean MAE: 5353.6341 (±306.2301)


However, there are some limitations with this KNN approach, with the curse of dimensionality being the most relevant one. In high-dimensional spaces, distances become less meaningful, and the KNN algorithm becomes less effective. Because of that, it might be pertinent to perform a dimensionaity reduction. PCA will be used to reduce the number of features and integrated into the gridsearch pipeline to find the optimal number of components.

## **KNN Regression: PCA**

In [14]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, make_scorer

In [19]:
# 1. Define consistent CV strategy
cv_strategy = KFold(n_splits=5, shuffle=True, random_state=17)

# 2. Define pipeline and parameter grid
k_values = np.arange(1, 30, 2)  # odd neighbors: 1 to 29
metric_options = ['euclidean', 'manhattan']
pca_components = [5, 10, 15, 20, 25]

param_knn_pca = {
    'knn__n_neighbors': k_values,
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': metric_options,
    'pca__n_components': pca_components
}

# Pipeline for GridSearchCV
knn_pca_pipeline = Pipeline([
    ("var_thresh", VarianceThreshold()),   # Remove low-variance features
    ("scaler", StandardScaler()),          # Standardize features
    ("pca", PCA()),                         # PCA for dimensionality reduction
    ("knn", KNeighborsRegressor())          # KNN Regressor
])

# 3. GridSearchCV (find the best parameters)
knn_pca_cv = GridSearchCV(
    estimator=knn_pca_pipeline,
    param_grid=param_knn_pca,
    scoring='neg_mean_squared_error',
    cv=cv_strategy,
    n_jobs=-1
)
knn_pca_cv.fit(X, y)

# Best parameters
best_params = knn_pca_cv.best_params_
best_k = best_params['knn__n_neighbors']
best_weight = best_params['knn__weights']
best_metric = best_params['knn__metric']
best_n_components = best_params['pca__n_components']

# Generalization error (RMSE) from GridSearchCV
best_rmse_knn_pca = np.sqrt(-knn_pca_cv.best_score_)

# Print results
print(f"Best parameters:")
print(f" - n_neighbors: {best_k}")
print(f" - weights: {best_weight}")
print(f" - metric: {best_metric}")
print(f" - PCA components: {best_n_components}")
print(f"RMSE (CV): {best_rmse_knn_pca:.2f}")

# 4. Evaluate best model separately using cross_val_score
best_knn_pca_model = knn_pca_cv.best_estimator_

mse_folds = -cross_val_score(
    best_knn_pca_model, X, y,
    cv=cv_strategy,
    scoring='neg_mean_squared_error'
)
mean_mse_knn_pca = mse_folds.mean()
rmse_folds = np.sqrt(mse_folds)
mean_rmse_knn_pca = np.sqrt(mean_mse_knn_pca)

# Define MAE scorer
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

# MAE cross-validation
mae_scores = -cross_val_score(
    best_knn_pca_model, X, y,
    cv=cv_strategy,
    scoring=mae_scorer
)
mean_mae_knn_pca = mae_scores.mean()

# Print detailed results
print("\nKNN Regression with PCA Cross-Validation Results:\n")
print(f"RMSE for each fold: {np.round(rmse_folds, 2)}")
print(f"Mean RMSE: {mean_rmse_knn_pca:.2f} (±{rmse_folds.std():.2f})\n")

print(f"MSE for each fold: {np.round(mse_folds, 2)}")
print(f"Mean MSE: {mean_mse_knn_pca:.2f} (±{mse_folds.std():.2f})\n")

print(f"MAE for each fold: {np.round(mae_scores, 2)}")
print(f"Mean MAE: {mean_mae_knn_pca:.2f} (±{mae_scores.std():.2f})")


Best parameters:
 - n_neighbors: 7
 - weights: uniform
 - metric: euclidean
 - PCA components: 10
RMSE (CV): 11457.67

KNN Regression with PCA Cross-Validation Results:

RMSE for each fold: [11519.15 12594.38  9038.03  9381.01 13978.27]
Mean RMSE: 11457.67 (±1881.25)

MSE for each fold: [1.32690769e+08 1.58618370e+08 8.16860722e+07 8.80033434e+07
 1.95391971e+08]
Mean MSE: 131278105.23 (±42876912.70)

MAE for each fold: [5223.38 4296.16 4479.42 5049.04 5650.7 ]
Mean MAE: 4939.74 (±494.77)


A better performance can be observed when reducing dimensionality.