In [29]:
from quantbullet.model.neighbors import FeatureScaledKNNRegressor
from sklearn.utils.estimator_checks import check_estimator
from sklearn.neighbors import KNeighborsRegressor

In [30]:
# Turn on IPython's autoreload extension and set it to mode 2.
# Re-running this cell in a live kernel prints an "already loaded" notice
# (as seen in the captured output below) because %load_ext is not idempotent.
# NOTE(review): mode 2 presumably re-imports edited modules such as quantbullet before each cell — confirm against the IPython docs.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
# NOTE(review): `reg` is not assigned in any cell visible in this notebook, so this
# cell fails on a fresh kernel; the cell that constructs `reg` needs to be restored.
# Show the parameter dictionary reported by `reg` (presumably an sklearn-style estimator — confirm).
reg.get_params( deep=True )

{'feature_weights': None,
 'metrics': 'euclidean',
 'n_neighbors': 5,
 'weights': 'uniform'}

In [10]:
# NOTE(review): depends on `reg`, which is not assigned in any cell visible in this notebook.
# Run the checks from sklearn.utils.estimator_checks against `reg`.
check_estimator(reg)

In [22]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

# Import the custom model
from quantbullet.model.neighbors import weightedDistanceKNRegressor

# --- Step 1: Generate synthetic data ---
np.random.seed(42)

# Three features: column 1 is built from column 0 plus noise (so the two are
# correlated) and column 2 is stretched onto a 10x larger scale.
n_samples = 5000
X = np.random.randn(n_samples, 3)
X[:, 1] = 0.5 * X[:, 0] + 0.5 * np.random.randn(n_samples)
X[:, 2] *= 10

# Target: a sine of feature 0 plus linear terms in features 1 and 2, with additive noise
y = (
    3 * np.sin(X[:, 0])
    + 0.5 * X[:, 1]
    + 0.1 * X[:, 2]
    + 0.2 * np.random.randn(n_samples)
)

# --- Step 2: Train/test split ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# --- Step 3: GridSearch to compare metrics and neighbors ---
param_grid = dict(
    n_neighbors=[3, 5, 10, 20],
    metrics=['euclidean', 'mahalanobis'],
    weights=['uniform', 'distance'],
    feature_weights=[None],  # extend with explicit weight vectors to search over them
)

grid = GridSearchCV(
    estimator=weightedDistanceKNRegressor(),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
)
grid.fit(X_train, y_train)

# --- Step 4: Evaluate on test set ---
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)

# The search score is negated MSE, so flip its sign before reporting it
print("\n✅ Best Params:", grid.best_params_)
print("📉 Best CV MSE:", -grid.best_score_)
print("🧪 Test MSE:", mean_squared_error(y_test, y_pred))


Fitting 5 folds for each of 16 candidates, totalling 80 fits

✅ Best Params: {'feature_weights': None, 'metrics': 'mahalanobis', 'n_neighbors': 10, 'weights': 'distance'}
📉 Best CV MSE: 0.07621344170829655
🧪 Test MSE: 0.06346710910491199


In [23]:
import pandas as pd

# Tabulate every candidate evaluated by the grid search
results_df = pd.DataFrame(grid.cv_results_)

# Expand the per-candidate `params` dicts into one column per hyper-parameter
# and place them next to the score columns
params_df = pd.json_normalize(results_df['params'])
summary_df = pd.concat(
    [params_df, results_df[['mean_test_score', 'std_test_score', 'rank_test_score']]],
    axis=1,
)

# The stored score is negated MSE: flip the sign so `mean_test_score` holds
# positive MSE, then order rows from lowest (best) to highest error
summary_df = summary_df.assign(mean_test_score=-summary_df['mean_test_score'])
summary_df = summary_df.sort_values(by='mean_test_score')

# Lift the column-width limit so cell contents are not truncated on display
pd.set_option('display.max_colwidth', None)


In [24]:
# Display the cross-validation summary table assembled in the previous cell.
summary_df

Unnamed: 0,feature_weights,metrics,n_neighbors,weights,mean_test_score,std_test_score,rank_test_score
13,,mahalanobis,10,distance,0.076213,0.007599,1
11,,mahalanobis,5,distance,0.077843,0.005622,2
3,,euclidean,5,distance,0.080406,0.006997,3
12,,mahalanobis,10,uniform,0.081739,0.00867,4
10,,mahalanobis,5,uniform,0.081976,0.006676,5
5,,euclidean,10,distance,0.082266,0.007166,6
15,,mahalanobis,20,distance,0.083748,0.01157,7
9,,mahalanobis,3,distance,0.084098,0.004743,8
2,,euclidean,5,uniform,0.085123,0.008316,9
1,,euclidean,3,distance,0.086199,0.005778,10


In [34]:
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd

# Seed NumPy's global RNG so the synthetic data drawn later in this cell is reproducible.
np.random.seed(42)

def make_mahalanobis_friendly_data(n_samples=500, rng=None):
    """Generate a 3-feature regression set with strongly correlated features.

    Columns 1 and 2 are noisy linear functions of column 0 (slopes 0.9 and
    -0.8); the target is ``sin`` of column 0 plus Gaussian noise.

    Parameters
    ----------
    n_samples : int, default 500
        Number of rows to generate.
    rng : optional
        Source of randomness exposing ``standard_normal(size)``, e.g.
        ``np.random.RandomState(seed)`` or ``np.random.default_rng(seed)``.
        When ``None`` (the default) the global ``np.random`` state is used,
        so existing calls keep producing the same data for a given
        ``np.random.seed``.

    Returns
    -------
    X : ndarray of shape (n_samples, 3)
    y : ndarray of shape (n_samples,)
    """
    rng = np.random if rng is None else rng
    # Draw order (X, column-1 noise, column-2 noise, target noise) is fixed so
    # that a given seed always reproduces the same dataset.
    X = rng.standard_normal((n_samples, 3))
    # Add strong correlation
    X[:, 1] = X[:, 0] * 0.9 + rng.standard_normal(n_samples) * 0.1
    X[:, 2] = X[:, 0] * -0.8 + rng.standard_normal(n_samples) * 0.2
    y = np.sin(X[:, 0]) + 0.1 * rng.standard_normal(n_samples)
    return X, y

def make_euclidean_friendly_data(n_samples=500, rng=None):
    """Generate a 3-feature regression set with independent features.

    The three columns are independent standard-normal draws and the target
    is a linear combination of them (weights 2, 0.5 and 0.1) plus Gaussian
    noise.

    Parameters
    ----------
    n_samples : int, default 500
        Number of rows to generate.
    rng : optional
        Source of randomness exposing ``standard_normal(size)``, e.g.
        ``np.random.RandomState(seed)`` or ``np.random.default_rng(seed)``.
        When ``None`` (the default) the global ``np.random`` state is used,
        so existing calls keep producing the same data for a given
        ``np.random.seed``.

    Returns
    -------
    X : ndarray of shape (n_samples, 3)
    y : ndarray of shape (n_samples,)
    """
    rng = np.random if rng is None else rng
    # Features are drawn first, then the target noise, so a given seed always
    # reproduces the same dataset.
    X = rng.standard_normal((n_samples, 3))
    y = 2 * X[:, 0] + 0.5 * X[:, 1] + 0.1 * X[:, 2] + 0.2 * rng.standard_normal(n_samples)
    return X, y

# Dataset under test: the generator with independent features and a linear target.
# Swap in make_mahalanobis_friendly_data() to benchmark the strongly correlated variant.
X, y = make_euclidean_friendly_data()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Candidate settings: neighbour count x distance metric x neighbour weighting
param_grid = dict(
    n_neighbors=[3, 5, 10, 20],
    metrics=['euclidean', 'mahalanobis'],
    weights=['uniform', 'distance'],
    feature_weights=[None],
)

grid = GridSearchCV(
    estimator=weightedDistanceKNRegressor(),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
)
grid.fit(X_train, y_train)

# One row per candidate: unpacked hyper-parameters plus positive MSE
# (the stored score is negated MSE), ordered from lowest to highest error
results_df = pd.DataFrame(grid.cv_results_)
summary_df = pd.json_normalize(results_df['params'])
summary_df['mean_test_mse'] = -results_df['mean_test_score']
summary_df['std_test_mse'] = results_df['std_test_score']
summary_df['rank'] = results_df['rank_test_score']
summary_df = summary_df.sort_values(by='mean_test_mse')

print("\n📊 Grid Search Results:")
print(summary_df.to_string(index=False))

# Score the selected model on the held-out split
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
print("\n✅ Best Params:", grid.best_params_)
print("📉 Test MSE:", mean_squared_error(y_test, y_pred))


Fitting 5 folds for each of 16 candidates, totalling 80 fits

📊 Grid Search Results:
feature_weights     metrics  n_neighbors  weights  mean_test_mse  std_test_mse  rank
           None   euclidean            3 distance       0.175532      0.032498     1
           None   euclidean            5 distance       0.177007      0.036711     2
           None mahalanobis            5 distance       0.184377      0.050798     3
           None mahalanobis            3 distance       0.184455      0.036285     4
           None   euclidean            3  uniform       0.194170      0.029669     5
           None   euclidean            5  uniform       0.197269      0.034193     6
           None mahalanobis            3  uniform       0.202558      0.037352     7
           None mahalanobis            5  uniform       0.205641      0.053098     8
           None   euclidean           10 distance       0.213274      0.050638     9
           None mahalanobis           10 distance       0.220298 

In [38]:
# Inspect the `metrics` setting of the estimator selected by the most recent grid search.
best_model.metrics

'euclidean'