In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from quantbullet.model.neighbors import FeatureScaledKNNRegressor
import numpy as np
import pandas as pd
from quantbullet.model_selection.optimizer import GridSearch

In [3]:
np.random.seed(42)

def make_mahalanobis_friendly_data(n_samples=500):
    X = np.random.randn(n_samples, 3)
    # Add strong correlation
    X[:, 1] = X[:, 0] * 0.9 + np.random.randn(n_samples) * 0.1
    X[:, 2] = X[:, 0] * -0.8 + np.random.randn(n_samples) * 0.2
    y = np.sin(X[:, 0]) + 0.1 * np.random.randn(n_samples)
    return X, y

def make_euclidean_friendly_data(n_samples=500):
    X = np.random.randn(n_samples, 3)
    y = 2 * X[:, 0] + 0.5 * X[:, 1] + 0.1 * X[:, 2] + 0.2 * np.random.randn(n_samples)
    return X, y

In [4]:
# Choose your dataset
# X, y = make_mahalanobis_friendly_data()
X, y = make_euclidean_friendly_data()

In [5]:
gridSearchCvOptimizer = GridSearch(
    estimator=FeatureScaledKNNRegressor(),
    param_grid={
        'n_neighbors': [1, 3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'metrics': ['euclidean', 'mahalanobis'],
        'feature_weights': [None],
    },
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
)

In [6]:
gridSearchCvOptimizer.fit(X, y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


<quantbullet.model_selection.optimizer.GridSearch at 0x17829e6fd10>

In [7]:
gridSearchCvOptimizer.summary().head()

Unnamed: 0,feature_weights,metrics,n_neighbors,weights,mean_test_score,std_test_score,rank
3,,euclidean,3,distance,-0.157483,0.016937,1
15,,mahalanobis,5,distance,-0.16273,0.04366,2
13,,mahalanobis,3,distance,-0.163885,0.045984,3
5,,euclidean,5,distance,-0.16511,0.023466,4
17,,mahalanobis,7,distance,-0.170239,0.052808,5


In [10]:
gridSearchCvOptimizer.best_model().get_params()

{'feature_weights': None,
 'metrics': 'euclidean',
 'n_neighbors': 3,
 'weights': 'distance'}

In [12]:
best_model = gridSearchCvOptimizer.best_model()

In [11]:
gridSearchCvOptimizer.evaluate(X, y)

{'mse': 0.0}

In [None]:


# GridSearch
param_grid = {
    'n_neighbors': [3, 5, 10, 20],
    'metrics': ['euclidean', 'mahalanobis'],
    'weights': ['uniform', 'distance'],
    'feature_weights': [None]
}

grid = GridSearchCV(
    weightedDistanceKNRegressor(),
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1
)

grid.fit(X_train, y_train)

# Result summary
results_df = pd.DataFrame(grid.cv_results_)
summary_df = pd.json_normalize(results_df['params']).assign(
    mean_test_mse=-results_df['mean_test_score'],
    std_test_mse=results_df['std_test_score'],
    rank=results_df['rank_test_score']
).sort_values(by='mean_test_mse')

print("\n📊 Grid Search Results:")
print(summary_df.to_string(index=False))

# Final test evaluation
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
print("\n✅ Best Params:", grid.best_params_)
print("📉 Test MSE:", mean_squared_error(y_test, y_pred))


Fitting 5 folds for each of 16 candidates, totalling 80 fits

📊 Grid Search Results:
feature_weights     metrics  n_neighbors  weights  mean_test_mse  std_test_mse  rank
           None   euclidean            3 distance       0.175532      0.032498     1
           None   euclidean            5 distance       0.177007      0.036711     2
           None mahalanobis            5 distance       0.184377      0.050798     3
           None mahalanobis            3 distance       0.184455      0.036285     4
           None   euclidean            3  uniform       0.194170      0.029669     5
           None   euclidean            5  uniform       0.197269      0.034193     6
           None mahalanobis            3  uniform       0.202558      0.037352     7
           None mahalanobis            5  uniform       0.205641      0.053098     8
           None   euclidean           10 distance       0.213274      0.050638     9
           None mahalanobis           10 distance       0.220298 