# Used Car Prices CarGurus: KNeighborsRegressor GPU
## Optuna Hyperparameter Optimization

## Set Up Environment, Read Data, Split Train/Test Sets

In [None]:
# Mount Google Drive so the notebook can read and write files under /content/drive
from google.colab import drive 
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Work from the Drive folder that holds the rapidsai-csp-utils scripts run below
%cd /content/drive/MyDrive/RAPIDS

/content/drive/MyDrive/RAPIDS


In [None]:
# Install a pinned pynvml before running the GPU compatibility check
!pip install pynvml==11.4.1
# See if the Colab GPU is compatible with RAPIDS.
# The clone below is commented out, so rapidsai-csp-utils must already exist in
# the current directory; uncomment it on a first run.
#!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/env-check.py

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[0m***********************************************************************
Woo! Your instance has the right kind of GPU, a Tesla T4!
***********************************************************************



In [None]:
# Update the Colab environment and restart the kernel
!bash rapidsai-csp-utils/colab/update_gcc.sh
import os
# os._exit ends the Python process immediately, which is how the kernel restart
# is forced; run the following cells only after the kernel has come back up.
os._exit(00)

In [None]:
# Install CondaColab. The installer restarts the kernel when it finishes
# (see the "Restarting kernel..." line in the output), so continue from the
# next cell afterwards.
import condacolab
condacolab.install()

⏬ Downloading https://github.com/jaimergp/miniforge/releases/latest/download/Mambaforge-colab-Linux-x86_64.sh...
📦 Installing...
📌 Adjusting configuration...
🩹 Patching environment...
⏲ Done in 0:00:30
🔁 Restarting kernel...


In [None]:
# See if environment is ready to install RAPIDS.
# condacolab has to be imported again because the previous cell restarted the kernel.
import condacolab
condacolab.check()

✨🍰✨ Everything looks OK!


In [None]:
# Install RAPIDS using the 'stable' release
!python rapidsai-csp-utils/colab/install_rapids.py stable
import os
# Point the environment at the CUDA NVVM library, the libdevice directory and
# the conda prefix.
# NOTE(review): presumably these are read by numba/RAPIDS at import time —
# confirm they are still required by the installed release.
os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'
os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'
os.environ['CONDA_PREFIX'] = '/usr/local'

In [None]:
# Install/import dependencies
!pip install optuna
!pip install dask_optuna
import os
import random
import numpy as np
import time
from contextlib import contextmanager
import cupy as cp
import cudf
import cuml
from cuml.model_selection import train_test_split
from cuml.neighbors import KNeighborsRegressor
import optuna
from optuna import Trial
import dask
import dask_cudf
import dask_optuna
from dask_cuda import LocalCUDACluster
from dask.distributed import Client, wait, performance_report
import joblib
from joblib import parallel_backend
import pickle
from datetime import datetime, timedelta
from timeit import default_timer as timer
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import plotly.offline as py
import matplotlib.pyplot as plt
import seaborn as sns
optuna.logging.set_verbosity(optuna.logging.WARNING) 
my_dpi = 96

# Seed every random number generator used in this notebook
seed_value = 42
# NOTE(review): this stores the seed under the 'KNR_GPU' environment variable;
# confirm that is the intended variable name.
os.environ['KNR_GPU'] = str(seed_value)
for _seed_fn in (cp.random.seed, random.seed, np.random.seed):
    _seed_fn(seed_value)

# Define function to time code blocks
@contextmanager
def timed(name):
    """
        Context manager that prints how long the enclosed block took.

        Params
        ______

        name:  str.
               Label printed in front of the elapsed time.

        The elapsed time (in seconds) is printed even when the block raises;
        the exception then propagates unchanged.
    """
    # perf_counter is monotonic, so the measurement is not distorted by
    # system clock adjustments the way time.time() can be.
    t0 = time.perf_counter()
    try:
        yield
    finally:
        print('..%-24s:  %8.4f' % (name, time.perf_counter() - t0))

# Record the CUDA compiler version and the GPU/driver status for this run
print('\n')
!/usr/local/cuda/bin/nvcc --version
!nvidia-smi

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[0mLooking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[0m

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0
Sat Feb 25 04:49:10 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                

In [None]:
# Start a local Dask-CUDA cluster and connect a client to it
cluster = LocalCUDACluster(
    threads_per_worker=1,
    ip='',
    dashboard_address='8081',
)
c = Client(cluster)

# Count the connected workers; this sets n_jobs for the Optuna runs below
workers = c.has_what().keys()
n_workers = len(workers)

# Display the client summary as the cell output
c


Mismatched versions found

+---------+--------+-----------+---------+
| Package | client | scheduler | workers |
+---------+--------+-----------+---------+
| numpy   | 1.22.4 | 1.22.4    | 1.23.5  |
| tornado | 6.2    | 6.2       | 6.1     |
+---------+--------+-----------+---------+



0,1
Connection method: Cluster object,Cluster type: dask_cuda.LocalCUDACluster
Dashboard: http://172.28.0.12:8081/status,

0,1
Dashboard: http://172.28.0.12:8081/status,Workers: 1
Total threads: 1,Total memory: 25.45 GiB
Status: running,Using processes: True

0,1
Comm: tcp://172.28.0.12:43963,Workers: 1
Dashboard: http://172.28.0.12:8081/status,Total threads: 1
Started: Just now,Total memory: 25.45 GiB

0,1
Comm: tcp://172.28.0.12:36191,Total threads: 1
Dashboard: http://172.28.0.12:44631/status,Memory: 25.45 GiB
Nanny: tcp://172.28.0.12:34243,
Local directory: /content/drive/MyDrive/RAPIDS/dask-worker-space/worker-bhyqvokh,Local directory: /content/drive/MyDrive/RAPIDS/dask-worker-space/worker-bhyqvokh
GPU: Tesla T4,GPU memory: 15.00 GiB


In [None]:
# Switch to the directory that holds the used-car CSV files read in the next cell
%cd /content/drive/MyDrive/UsedCarsCarGurus/Data/

/content/drive/MyDrive/UsedCarsCarGurus/Data


In [None]:
# Read the training set into a GPU DataFrame
trainDF = cudf.read_csv('usedCars_trainSet.csv', low_memory=False)
print('Train set: Number of rows and columns:', trainDF.shape)

# The hold-out set must come from its own file. Reading the training file here
# (as this cell previously did) makes every "test" metric a training metric.
# NOTE(review): file name inferred from the train-set naming — confirm it.
testDF  = cudf.read_csv('usedCars_testSet.csv', low_memory=False)
print('Test set: Number of rows and columns:', testDF.shape)

# Separate the target ('price') from the features
X_train, y_train = trainDF.drop('price', 
                                axis=1), trainDF['price'].astype('int32')
X_test, y_test = testDF.drop('price', 
                             axis=1), testDF['price'].astype('int32')

# One-hot encode the categorical features
X_train = cudf.get_dummies(X_train)
X_test = cudf.get_dummies(X_test)

# The two frames are encoded independently, so make sure they ended up with
# the same columns in the same order before any distance is computed on them.
assert list(X_train.columns) == list(X_test.columns), \
    'Train and test sets have different one-hot encoded columns'

X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

Train set: Number of rows and columns: (262329, 54)
Test set: Number of rows and columns: (262329, 54)


## 100 Trials

In [None]:
# Write results to ML Results
%cd /content/drive/MyDrive/UsedCarsCarGurus/Models/ML/KNR/Optuna/Model_PKL/

# Define a name for the trials
study_name = 'dask_optuna_knr_rmse_tpe'

[Errno 2] No such file or directory: '/content/drive/MyDrive/UsedCarsCarGurus/Models/ML/KNR/Optuna/Model_PKL/'
/content/drive/MyDrive/UsedCarsCarGurus/Data


In [None]:
# Define function to train/evaluate the model
def train_and_eval(X_param, y_param, n_neighbors=10, 
                   metric='euclidean', verbose=False): 
    """
        Fit a KNeighborsRegressor on the supplied training data and score it
        on the notebook-level hold-out set (X_test / y_test).

        Params
        ______

        X_param:      DataFrame. 
                      Features used to fit the model; must have the same
                      columns as the notebook-level X_test.
        y_param:      Series. 
                      Target values used to fit the model.
        n_neighbors:  int.
                      Number of neighbors passed to the model.
        metric:       str.
                      Distance metric passed to the model.
        verbose:      Verbosity setting passed to the model.

        Returns
        _______
        score: RMSE of the fitted model on X_test / y_test.
    """
    # Define model
    model = KNeighborsRegressor(n_neighbors=n_neighbors, 
                                metric=metric,
                                verbose=verbose)

    # Fit on the data handed in by the caller. The encoded frames are built
    # once at notebook level, so nothing is re-encoded on every trial.
    model.fit(X_param, y_param)

    # Predict on the hold-out set and score the predictions on the host
    y_pred = model.predict(X_test)
    score = mean_squared_error(y_test.to_numpy(), y_pred.to_numpy(), 
                               squared=False)     
    print('- Trial RMSE:', score)
    
    return score

In [None]:
# Baseline: score the model with the function's default hyperparameters
default_score = train_and_eval(X_train, y_train)
print('Score with default parameters : ', default_score)

- Trial RMSE: 4384.845258618256
Score with default parameters :  4384.845258618256


In [None]:
# Define objective function with the parameters to be tested
def objective(trial, X_param, y_param):
    """
        Optuna objective: sample hyperparameters, fit and score one model.

        Params
        ______

        trial:    optuna Trial used to sample the hyperparameters.
        X_param:  DataFrame. Features forwarded to train_and_eval.
        y_param:  Series. Target forwarded to train_and_eval.

        Returns
        _______
        score: RMSE returned by train_and_eval for the sampled parameters.
    """
    # Checkpoint the notebook-level study before this trial runs
    joblib.dump(study, 'KNR_Optuna_100_GPU.pkl')
    
    # Search parameters
    n_neighbors = trial.suggest_int('n_neighbors', 3, 100) 
    metric = trial.suggest_categorical('metric', ['euclidean', 'manhattan', 
                                                  'chebyshev', 'minkowski'])

    # The sampled metric must be forwarded; otherwise train_and_eval falls back
    # to its 'euclidean' default and the metric search has no effect.
    score = train_and_eval(X_param, y_param,
                           n_neighbors=n_neighbors, 
                           metric=metric,
                           verbose=False)

    return score

In [None]:
with timed('dask_optuna'):
    # Begin HPO trials 
    # Start timer for experiment
    start_time = datetime.now()
    print('%-20s %s' % ('Start Time', start_time))

    # Resume from a saved study when one exists, otherwise start a new one
    if os.path.isfile('KNR_Optuna_100_GPU.pkl'): 
        study = joblib.load('KNR_Optuna_100_GPU.pkl')
    else: 
        study = optuna.create_study(sampler=optuna.samplers.TPESampler(), 
                                    study_name=study_name,
                                    direction='minimize')
      
    # Optimize in parallel on Dask cluster
    with parallel_backend('dask'):
        study.optimize(lambda trial: objective(trial, X_train, y_train), 
                       n_trials=100,
                       n_jobs=n_workers)

# End timer for experiment
end_time = datetime.now()

# The objective pickles the study at the start of each trial, so the result of
# the final trial is only captured by saving once more after the run.
joblib.dump(study, 'KNR_Optuna_100_GPU.pkl')

print('%-20s %s' % ('Start Time', start_time))
print('%-20s %s' % ('End Time', end_time))
# total_seconds() covers the whole duration; .seconds alone drops full days.
print(str(timedelta(seconds=int((end_time - start_time).total_seconds()))))
print('\n')
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
print('Lowest RMSE', study.best_value)

Start Time           2023-02-25 04:52:30.332772
- Trial RMSE: 4121.887446545337
- Trial RMSE: 5095.414951314923
- Trial RMSE: 4923.542907063325
- Trial RMSE: 4835.219970698422
- Trial RMSE: 4723.126229647229
- Trial RMSE: 4603.356006636722
- Trial RMSE: 4916.2527198382495
- Trial RMSE: 5022.1737579728415
- Trial RMSE: 5037.345727529945
- Trial RMSE: 4774.027433856023
- Trial RMSE: 3586.325763235942
- Trial RMSE: 3586.325763235942
- Trial RMSE: 4121.887446545337
- Trial RMSE: 5164.754262546907
- Trial RMSE: 4005.9150305658145
- Trial RMSE: 4625.044663202976
- Trial RMSE: 4861.583489139057
- Trial RMSE: 4625.044663202976
- Trial RMSE: 3841.009957328154
- Trial RMSE: 4996.091784624338
- Trial RMSE: 5090.938813479511
- Trial RMSE: 3586.325763235942
- Trial RMSE: 4555.016986871412
- Trial RMSE: 4497.798438248811
- Trial RMSE: 4774.027433856023
- Trial RMSE: 4464.551264729626
- Trial RMSE: 4694.141815359486
- Trial RMSE: 3586.325763235942
- Trial RMSE: 4428.045308086619
- Trial RMSE: 4835.21

In [None]:
# Switch to the directory where the trial results CSV is written
%cd /content/drive/MyDrive/UsedCarsCarGurus/Models/ML/KNR/Optuna/trialOptions/

/content/drive/MyDrive/UsedCarsCarGurus/Models/ML/KNR/Optuna/trialOptions


In [None]:
# Collect the HPO trials into a DataFrame with friendlier column names,
# ordered from lowest to highest RMSE
trial_column_names = {
    'number': 'iteration',
    'value': 'rmse',
    'params_metric': 'metric',
    'params_n_neighbors': 'n_neighbors',
}
trials_df = (
    study.trials_dataframe()
    .rename(columns=trial_column_names)
    .sort_values('rmse', ascending=True)
)
print(trials_df) 

# Persist the sorted trials to csv
trials_df.to_csv('KNR_Optuna_100_GPU.csv', index=False)

    iteration         rmse             datetime_start  \
61         61  3586.325763 2023-02-25 04:57:14.523899   
82         82  3586.325763 2023-02-25 04:58:54.395995   
41         41  3586.325763 2023-02-25 04:55:38.878378   
42         42  3586.325763 2023-02-25 04:55:43.655548   
27         27  3586.325763 2023-02-25 04:54:32.188775   
..        ...          ...                        ...   
1           1  5095.414951 2023-02-25 04:52:34.495580   
76         76  5131.276571 2023-02-25 04:58:25.412660   
47         47  5146.540508 2023-02-25 04:56:07.477462   
13         13  5164.754263 2023-02-25 04:53:27.862256   
36         36  5168.263875 2023-02-25 04:55:13.569818   

            datetime_complete               duration     metric  n_neighbors  \
61 2023-02-25 04:57:19.198371 0 days 00:00:04.674472  chebyshev            3   
82 2023-02-25 04:58:59.094776 0 days 00:00:04.698781  chebyshev            3   
41 2023-02-25 04:55:43.655324 0 days 00:00:04.776946  chebyshev            

In [None]:
# Switch to the directory where the Optuna HTML plots are saved
%cd /content/drive/MyDrive/UsedCarsCarGurus/Models/ML/KNR/Optuna/Model_Explanations/

/content/drive/MyDrive/UsedCarsCarGurus/Models/ML/KNR/Optuna/Model_Explanations


In [None]:
# plot_optimization_history: shows the scores from all trials as well as the
# best score so far at each point. The figure is also saved as an HTML file
# in the current directory.
# NOTE(review): the file name carries a stray '.pkl' before '.html', unlike the
# 1000-trial plots — confirm whether that is intended.
fig = optuna.visualization.plot_optimization_history(study)
py.plot(fig, filename='optimizationHistory_KNR_Optuna_100_GPU.pkl.html')
fig.show()

In [None]:
# plot_parallel_coordinate: interactively visualizes the hyperparameters and
# scores. The figure is also saved as an HTML file in the current directory.
# NOTE(review): the file name carries a stray '.pkl' before '.html', unlike the
# 1000-trial plots — confirm whether that is intended.
fig = optuna.visualization.plot_parallel_coordinate(study)
py.plot(fig, filename='parallelCoordinate_KNR_Optuna_100_GPU.pkl.html')
fig.show()

In [None]:
# plot_slice: shows how the sampled hyperparameter values change over the
# search. The figure is also saved as an HTML file in the current directory.
# NOTE(review): the file name carries a stray '.pkl' before '.html', unlike the
# 1000-trial plots — confirm whether that is intended.
fig = optuna.visualization.plot_slice(study)
py.plot(fig, filename='slice_KNR_Optuna_100_GPU.pkl.html')
fig.show()

In [None]:
# Visualize the empirical distribution function of the trial scores.
# The figure is also saved as an HTML file in the current directory.
fig = optuna.visualization.plot_edf(study)
py.plot(fig, filename='edf_KNR_Optuna_100_GPU.html')
fig.show()

In [None]:
# Best hyperparameters found by the study, kept for fitting the final model
# and displayed as the cell output
params = study.best_params   
params

{'n_neighbors': 3, 'metric': 'minkowski'}

In [None]:
# Switch to the directory where the fitted model pickle is saved
%cd /content/drive/MyDrive/UsedCarsCarGurus/Models/ML/KNR/Optuna/Model_PKL/

/content/drive/MyDrive/UsedCarsCarGurus/Models/ML/KNR/Optuna/Model_PKL


In [None]:
# Re-create the best model and train on the training data.
# The hyperparameters come from the study's best trial (`params`) rather than
# hand-copied values, so this cell stays correct when the search is re-run and
# selects a different configuration.
best_model = KNeighborsRegressor(**params)

# Fit the model
best_model.fit(X_train, y_train)

# Save model
Pkl_Filename = 'KNR_Optuna_trials100_GPU.pkl'  

with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(best_model, file)

# =============================================================================
# # To load saved model (only unpickle files you created yourself)
# with open('KNR_Optuna_trials100_GPU.pkl', 'rb') as file:
#     model = pickle.load(file)
# print(model)
# =============================================================================

In [None]:
print('\nModel Metrics for KNR HPO 100 GPU trials')
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Report each metric for the train and test predictions on one line;
# RMSE is the MSE function called with squared=False.
for _label, _metric_fn, _extra in (
        ('MAE', mean_absolute_error, {}),
        ('MSE', mean_squared_error, {}),
        ('RMSE', mean_squared_error, {'squared': False}),
        ('R^2', r2_score, {})):
    print('%s train: %.3f, test: %.3f' % (
        _label,
        _metric_fn(y_train.to_numpy(), y_train_pred.to_numpy(), **_extra),
        _metric_fn(y_test.to_numpy(), y_test_pred.to_numpy(), **_extra)))


Model Metrics for KNR HPO 100 GPU trials
MAE train: 2602.515, test: 2602.515
MSE train: 12861732.480, test: 12861732.480
RMSE train: 3586.326, test: 3586.326
R^2 train: 0.860, test: 0.860


In [None]:
# Evaluate on the testing data 
test_mse = mean_squared_error(y_test.to_numpy(), y_test_pred.to_numpy())
print('The best model from optimization scores {:.5f} MSE on the test set.'.format(test_mse))
print('This was achieved using these conditions:')
# trials_df is sorted by ascending RMSE, so its first row is the best trial
print(trials_df.iloc[0])

The best model from optimization scores 12861732.48005 MSE on the test set.
This was achieved using these conditions:
iteration                                    61
rmse                                3586.325763
datetime_start       2023-02-25 04:57:14.523899
datetime_complete    2023-02-25 04:57:19.198371
duration                 0 days 00:00:04.674472
metric                                chebyshev
n_neighbors                                   3
state                                  COMPLETE
Name: 61, dtype: object


## 1000 Trials

In [None]:
# Switch to the directory where the 1000-trial study checkpoint is written
%cd /content/drive/MyDrive/UsedCarsCarGurus/Models/ML/KNR/Optuna/Model_PKL/

/content/drive/MyDrive/UsedCarsCarGurus/Models/ML/KNR/Optuna/Model_PKL


In [None]:
def objective(trial, X_param, y_param):
    """
        Optuna objective for the 1000-trial study: sample hyperparameters,
        fit and score one model.

        Params
        ______

        trial:    optuna Trial used to sample the hyperparameters.
        X_param:  DataFrame. Features forwarded to train_and_eval.
        y_param:  Series. Target forwarded to train_and_eval.

        Returns
        _______
        score: RMSE returned by train_and_eval for the sampled parameters.
    """
    # Checkpoint the notebook-level study before this trial runs
    joblib.dump(study, 'KNR_Optuna_1000_GPU.pkl')
    
    # Parameter search
    n_neighbors = trial.suggest_int('n_neighbors', 2, 100) 
    metric = trial.suggest_categorical('metric', ['euclidean', 'manhattan', 
                                                  'chebyshev', 'minkowski'])

    # The sampled metric must be forwarded; otherwise train_and_eval falls back
    # to its 'euclidean' default and the metric search has no effect.
    score = train_and_eval(X_param, y_param,
                           n_neighbors=n_neighbors, 
                           metric=metric,
                           verbose=False)

    return score

In [None]:
with timed('dask_optuna'):
    # Begin HPO trials 
    # Start timer for experiment
    start_time = datetime.now()
    print('%-20s %s' % ('Start Time', start_time))

    # Resume from a saved study when one exists, otherwise start a new one
    if os.path.isfile('KNR_Optuna_1000_GPU.pkl'): 
        study = joblib.load('KNR_Optuna_1000_GPU.pkl')
    else: 
        study = optuna.create_study(sampler=optuna.samplers.TPESampler(), 
                                    study_name=study_name,
                                    direction='minimize')
      
    # Optimize in parallel on your Dask cluster
    with parallel_backend('dask'):
        study.optimize(lambda trial: objective(trial, X_train, y_train), 
                       n_trials=1000,
                       n_jobs=n_workers)

# End timer for experiment
end_time = datetime.now()

# The objective pickles the study at the start of each trial, so the result of
# the final trial is only captured by saving once more after the run.
joblib.dump(study, 'KNR_Optuna_1000_GPU.pkl')

print('%-20s %s' % ('Start Time', start_time))
print('%-20s %s' % ('End Time', end_time))
# total_seconds() covers the whole duration; .seconds alone drops full days.
print(str(timedelta(seconds=int((end_time - start_time).total_seconds()))))
print('\n')
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
print('Lowest RMSE', study.best_value)

Start Time           2023-02-25 05:09:57.615216
- Trial RMSE: 4886.360554442493
- Trial RMSE: 5154.069020265029
- Trial RMSE: 4935.916210924576
- Trial RMSE: 5146.540508397751
- Trial RMSE: 4774.027433856023
- Trial RMSE: 4973.073511290455
- Trial RMSE: 4121.887446545337
- Trial RMSE: 4826.124233021871
- Trial RMSE: 5168.263875443831
- Trial RMSE: 4762.8956379763695
- Trial RMSE: 3070.8987010700253
- Trial RMSE: 3841.009957328154
- Trial RMSE: 3070.8987010700253
- Trial RMSE: 4603.356006636722
- Trial RMSE: 3070.8987010700253
- Trial RMSE: 5078.173747653229
- Trial RMSE: 4660.48534988204
- Trial RMSE: 5006.560109271813
- Trial RMSE: 4528.5810284713225
- Trial RMSE: 5037.345727529945
- Trial RMSE: 4528.5810284713225
- Trial RMSE: 3070.8987010700253
- Trial RMSE: 4384.845258618256
- Trial RMSE: 4709.442632738852
- Trial RMSE: 3586.325763235942
- Trial RMSE: 4853.149485004742
- Trial RMSE: 4428.045308086619
- Trial RMSE: 4603.356006636722
- Trial RMSE: 5082.400732729732
- Trial RMSE: 4916

In [None]:
# Switch to the directory where the trial results CSV is written
%cd /content/drive/MyDrive/UsedCarsCarGurus/Models/ML/KNR/Optuna/trialOptions/

/content/drive/MyDrive/UsedCarsCarGurus/Models/ML/KNR/Optuna/trialOptions


In [None]:
# Collect the HPO trials into a DataFrame with friendlier column names,
# ordered from lowest to highest RMSE
trial_column_names = {
    'number': 'iteration',
    'value': 'rmse',
    'params_metric': 'metric',
    'params_n_neighbors': 'n_neighbors',
}
trials_df = (
    study.trials_dataframe()
    .rename(columns=trial_column_names)
    .sort_values('rmse', ascending=True)
)
print(trials_df) 

# Persist the sorted trials to csv
trials_df.to_csv('KNR_Optuna_1000_GPU.csv', index=False)

     iteration         rmse             datetime_start  \
706        706  3070.898701 2023-02-25 06:06:16.039692   
654        654  3070.898701 2023-02-25 06:02:04.757080   
657        657  3070.898701 2023-02-25 06:02:19.218560   
264        264  3070.898701 2023-02-25 05:30:46.428542   
660        660  3070.898701 2023-02-25 06:02:33.634623   
..         ...          ...                        ...   
8            8  5168.263875 2023-02-25 05:10:34.630532   
715        715  5168.263875 2023-02-25 06:06:59.793529   
109        109  5175.166948 2023-02-25 05:18:30.367756   
397        397  5175.166948 2023-02-25 05:41:21.148965   
963        963  5178.840228 2023-02-25 06:27:15.950894   

             datetime_complete               duration     metric  n_neighbors  \
706 2023-02-25 06:06:20.842922 0 days 00:00:04.803230  chebyshev            2   
654 2023-02-25 06:02:09.519697 0 days 00:00:04.762617  manhattan            2   
657 2023-02-25 06:02:24.009119 0 days 00:00:04.790559  cheby

In [None]:
# Switch to the directory where the Optuna HTML plots are saved
%cd /content/drive/MyDrive/UsedCarsCarGurus/Models/ML/KNR/Optuna/Model_Explanations/

/content/drive/MyDrive/UsedCarsCarGurus/Models/ML/KNR/Optuna/Model_Explanations


In [None]:
# plot_optimization_history: shows the scores from all trials as well as the
# best score so far at each point. The figure is also saved as an HTML file
# in the current directory.
fig = optuna.visualization.plot_optimization_history(study)
py.plot(fig, filename='optimizationHistory_KNR_Optuna_1000_GPU.html')
fig.show()

In [None]:
# plot_parallel_coordinate: interactively visualizes the hyperparameters and
# scores. The figure is also saved as an HTML file in the current directory.
fig = optuna.visualization.plot_parallel_coordinate(study)
py.plot(fig, filename='parallelCoordinate_KNR_Optuna_1000_GPU.html')
fig.show()

In [None]:
# plot_slice: shows how the sampled hyperparameter values change over the
# search. The figure is also saved as an HTML file in the current directory.
fig = optuna.visualization.plot_slice(study)
py.plot(fig, filename='slice_KNR_Optuna_1000_GPU.html')
fig.show()

In [None]:
# Visualize the empirical distribution function of the trial scores.
# The figure is also saved as an HTML file in the current directory.
fig = optuna.visualization.plot_edf(study)
py.plot(fig, filename='edf_KNR_Optuna_1000_GPU.html')
fig.show()

In [None]:
# Best hyperparameters found by the study, kept for fitting the final model
# and displayed as the cell output
params = study.best_params   
params

{'n_neighbors': 2, 'metric': 'manhattan'}

In [None]:
# Switch to the directory where the fitted model pickles are saved
%cd /content/drive/MyDrive/UsedCarsCarGurus/Models/ML/KNR/Optuna/Model_PKL/

/content/drive/MyDrive/UsedCarsCarGurus/Models/ML/KNR/Optuna/Model_PKL


In [None]:
# Refit the best configuration (2 neighbors, Manhattan distance) on the training data
best_model = KNeighborsRegressor(metric='manhattan', n_neighbors=2)
best_model.fit(X_train, y_train)

# Serialize the fitted model
Pkl_Filename = 'KNR_Optuna_trials1000_GPU_man.pkl'
with open(Pkl_Filename, mode='wb') as pkl_out:
    pickle.dump(best_model, pkl_out)

# -----------------------------------------------------------------------------
# To reload the saved model later:
#   model = joblib.load('KNR_Optuna_trials1000_GPU_man.pkl')
#   print(model)
# -----------------------------------------------------------------------------

In [None]:
print('\nModel Metrics for KNR HPO 1000 GPU trials - Manhattan')
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Report each metric for the train and test predictions on one line;
# RMSE is the MSE function called with squared=False.
for _label, _metric_fn, _extra in (
        ('MAE', mean_absolute_error, {}),
        ('MSE', mean_squared_error, {}),
        ('RMSE', mean_squared_error, {'squared': False}),
        ('R^2', r2_score, {})):
    print('%s train: %.3f, test: %.3f' % (
        _label,
        _metric_fn(y_train.to_numpy(), y_train_pred.to_numpy(), **_extra),
        _metric_fn(y_test.to_numpy(), y_test_pred.to_numpy(), **_extra)))


Model Metrics for KNR HPO 1000 GPU trials - Manhattan
MAE train: 1984.275, test: 1984.275
MSE train: 8089934.424, test: 8089934.424
RMSE train: 2844.281, test: 2844.281
R^2 train: 0.912, test: 0.912


In [None]:
# Evaluate on the testing data 
test_mse = mean_squared_error(y_test.to_numpy(), y_test_pred.to_numpy())
print('The best model from optimization scores {:.5f} MSE on the test set.'.format(test_mse))
print('This was achieved using these conditions:')
# trials_df is sorted by ascending RMSE, so its first row is the best trial
print(trials_df.iloc[0])

The best model from optimization scores 9849685.25368 MSE on the test set.
This was achieved using these conditions:
iteration                                   706
rmse                                3070.898701
datetime_start       2023-02-25 06:06:16.039692
datetime_complete    2023-02-25 06:06:20.842922
duration                 0 days 00:00:04.803230
metric                                chebyshev
n_neighbors                                   2
state                                  COMPLETE
Name: 706, dtype: object


In [None]:
# Refit with 2 neighbors and Minkowski distance on the training data
# (tie for lowest RMSE)
best_model = KNeighborsRegressor(metric='minkowski', n_neighbors=2)
best_model.fit(X_train, y_train)

# Serialize the fitted model
Pkl_Filename = 'KNR_Optuna_trials1000_GPU_min.pkl'
with open(Pkl_Filename, mode='wb') as pkl_out:
    pickle.dump(best_model, pkl_out)

# -----------------------------------------------------------------------------
# To reload the saved model later:
#   model = joblib.load('KNR_Optuna_trials1000_GPU_min.pkl')
#   print(model)
# -----------------------------------------------------------------------------

In [None]:
print('\nModel Metrics for KNR HPO 1000 GPU trials - Minkowski')
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Report each metric for the train and test predictions on one line;
# RMSE is the MSE function called with squared=False.
for _label, _metric_fn, _extra in (
        ('MAE', mean_absolute_error, {}),
        ('MSE', mean_squared_error, {}),
        ('RMSE', mean_squared_error, {'squared': False}),
        ('R^2', r2_score, {})):
    print('%s train: %.3f, test: %.3f' % (
        _label,
        _metric_fn(y_train.to_numpy(), y_train_pred.to_numpy(), **_extra),
        _metric_fn(y_test.to_numpy(), y_test_pred.to_numpy(), **_extra)))


Model Metrics for KNR HPO 1000 GPU trials - Minkowski
MAE train: 2180.592, test: 2180.592
MSE train: 9430418.832, test: 9430418.832
RMSE train: 3070.899, test: 3070.899
R^2 train: 0.897, test: 0.897


In [None]:
# Refit with 2 neighbors and Euclidean distance on the training data
# (testing another distance)
best_model = KNeighborsRegressor(metric='euclidean', n_neighbors=2)
best_model.fit(X_train, y_train)

# Serialize the fitted model
Pkl_Filename = 'KNR_Optuna_trials1000_GPU_euc.pkl'
with open(Pkl_Filename, mode='wb') as pkl_out:
    pickle.dump(best_model, pkl_out)

# -----------------------------------------------------------------------------
# To reload the saved model later:
#   model = joblib.load('KNR_Optuna_trials1000_GPU_euc.pkl')
#   print(model)
# -----------------------------------------------------------------------------

In [None]:
print('\nModel Metrics for KNR HPO 1000 GPU trials - Euclidean')
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Report each metric for the train and test predictions on one line;
# RMSE is the MSE function called with squared=False.
for _label, _metric_fn, _extra in (
        ('MAE', mean_absolute_error, {}),
        ('MSE', mean_squared_error, {}),
        ('RMSE', mean_squared_error, {'squared': False}),
        ('R^2', r2_score, {})):
    print('%s train: %.3f, test: %.3f' % (
        _label,
        _metric_fn(y_train.to_numpy(), y_train_pred.to_numpy(), **_extra),
        _metric_fn(y_test.to_numpy(), y_test_pred.to_numpy(), **_extra)))


Model Metrics for KNR HPO 1000 GPU trials - Euclidean
MAE train: 2180.592, test: 2180.592
MSE train: 9430418.832, test: 9430418.832
RMSE train: 3070.899, test: 3070.899
R^2 train: 0.897, test: 0.897


In [None]:
# Refit with 2 neighbors and Chebyshev distance on the training data
# (testing another distance)
best_model = KNeighborsRegressor(metric='chebyshev', n_neighbors=2)
best_model.fit(X_train, y_train)

# Serialize the fitted model
Pkl_Filename = 'KNR_Optuna_trials1000_GPU_cheb.pkl'
with open(Pkl_Filename, mode='wb') as pkl_out:
    pickle.dump(best_model, pkl_out)

# -----------------------------------------------------------------------------
# To reload the saved model later:
#   model = joblib.load('KNR_Optuna_trials1000_GPU_cheb.pkl')
#   print(model)
# -----------------------------------------------------------------------------

In [None]:
print('\nModel Metrics for KNR HPO 1000 GPU trials - Chebyshev')
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Report each metric for the train and test predictions on one line;
# RMSE is the MSE function called with squared=False.
for _label, _metric_fn, _extra in (
        ('MAE', mean_absolute_error, {}),
        ('MSE', mean_squared_error, {}),
        ('RMSE', mean_squared_error, {'squared': False}),
        ('R^2', r2_score, {})):
    print('%s train: %.3f, test: %.3f' % (
        _label,
        _metric_fn(y_train.to_numpy(), y_train_pred.to_numpy(), **_extra),
        _metric_fn(y_test.to_numpy(), y_test_pred.to_numpy(), **_extra)))


Model Metrics for KNR HPO 1000 GPU trials - Chebyshev
MAE train: 2231.101, test: 2231.101
MSE train: 9849685.254, test: 9849685.254
RMSE train: 3138.421, test: 3138.421
R^2 train: 0.892, test: 0.892
