# Lending Tree Loan Status: Upsampling
## Classification - K-Nearest Neighbors HPO GPU

## Set Up Environment, Read Data, Split Train/Test Sets

In [None]:
# Mount Google Drive at /content/drive so later cells can reach files stored there
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%cd /content/drive/MyDrive/RAPIDS/

/content/drive/MyDrive/RAPIDS


In [None]:
# Clone RAPIDS-Colab install files and see if GPU is compatible
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/env-check.py

In [None]:
# Update the Colab environment and restart the kernel
!bash rapidsai-csp-utils/colab/update_gcc.sh
import os
# Kill the current Python process on purpose: this is how the cell forces the
# kernel restart mentioned above, so the cells below run in a fresh process.
os._exit(00)

In [None]:
# Install CondaColab and restart the kernel
import condacolab
condacolab.install()

⏬ Downloading https://github.com/jaimergp/miniforge/releases/latest/download/Mambaforge-colab-Linux-x86_64.sh...
📦 Installing...
📌 Adjusting configuration...
🩹 Patching environment...
⏲ Done in 0:00:28
🔁 Restarting kernel...


In [None]:
# See if environment is ready to install RAPIDS
import condacolab
condacolab.check()

✨🍰✨ Everything looks OK!


In [None]:
# Install RAPIDS using the 'stable' release
!python rapidsai-csp-utils/colab/install_rapids.py stable
import os
# Point the NVVM / libdevice variables at the CUDA toolkit under /usr/local/cuda
# and the conda prefix at /usr/local.
# NOTE(review): presumably these are read by Numba/RAPIDS at import time -- confirm
os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'
os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'
os.environ['CONDA_PREFIX'] = '/usr/local'

In [None]:
# Install/import dependencies
!pip install optuna
!pip install dask_optuna
import os
import warnings
import random
import numpy as np
import cupy as cp
from cupy import asnumpy
import dask
from dask.distributed import Client, wait
from dask.diagnostics import ProgressBar
from dask.utils import parse_bytes
from dask_cuda import LocalCUDACluster
import dask_cudf
import dask_optuna
import urllib.request
from contextlib import contextmanager
import time
from datetime import datetime, timedelta
from timeit import default_timer as timer
import cudf
import cuml
from cuml.neighbors import KNeighborsClassifier
import optuna
from optuna import Trial
optuna.logging.set_verbosity(optuna.logging.WARNING)
import joblib
import pickle
import pandas as pd
import sklearn
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from sklearn.metrics import recall_score, precision_score
from sklearn.metrics import classification_report, confusion_matrix
import plotly.offline as py
import matplotlib.pyplot as plt
import seaborn as sns
warnings.filterwarnings('ignore')
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
print('\n')
!/usr/local/cuda/bin/nvcc --version
!nvidia-smi

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting optuna
  Downloading optuna-2.10.1-py3-none-any.whl (308 kB)
[K     |████████████████████████████████| 308 kB 5.1 MB/s 
[?25hCollecting cmaes>=0.8.2
  Downloading cmaes-0.8.2-py3-none-any.whl (15 kB)
Collecting colorlog
  Downloading colorlog-6.6.0-py2.py3-none-any.whl (11 kB)
Collecting cliff
  Downloading cliff-3.10.1-py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 8.3 MB/s 
[?25hCollecting sqlalchemy>=1.1.0
  Downloading SQLAlchemy-1.4.37-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 53.6 MB/s 
Collecting alembic
  Downloading alembic-1.8.0-py3-none-any.whl (209 kB)
[K     |████████████████████████████████| 209 kB 37.7 MB/s 
Collecting greenlet!=0.4.17
  Downloading greenlet-1.1.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

In [None]:
# Define function to time code blocks
@contextmanager
def timed(name):
    """
    Context manager that prints how long the enclosed block took.

    Params
    ______

    name:  str.
           Label printed in front of the elapsed time.

    Notes
    _____
    The elapsed time (in seconds) is printed even when the enclosed block
    raises; the exception then propagates unchanged.
    """
    # perf_counter is monotonic, so the measurement cannot be skewed by
    # system clock adjustments while the block runs
    t0 = time.perf_counter()
    try:
        yield
    finally:
        # Report in `finally` so a failing block still shows its run time
        print('..%-24s:  %8.4f' % (name, time.perf_counter() - t0))

In [None]:
# Start a local Dask CUDA cluster (one thread per worker) and connect a client
cluster = LocalCUDACluster(
    dashboard_address='8081',
    ip='',
    threads_per_worker=1,
)
c = Client(cluster)

# Ask the client which workers are connected and count them; the count is
# reused later as the number of parallel HPO jobs
workers = c.has_what().keys()
n_workers = len(workers)

# Leave the client as the last expression so its summary is displayed
c

0,1
Connection method: Cluster object,Cluster type: dask_cuda.LocalCUDACluster
Dashboard: http://172.28.0.2:8081/status,

0,1
Dashboard: http://172.28.0.2:8081/status,Workers: 1
Total threads: 1,Total memory: 25.46 GiB
Status: running,Using processes: True

0,1
Comm: tcp://172.28.0.2:33785,Workers: 1
Dashboard: http://172.28.0.2:8081/status,Total threads: 1
Started: Just now,Total memory: 25.46 GiB

0,1
Comm: tcp://172.28.0.2:42393,Total threads: 1
Dashboard: http://172.28.0.2:38991/status,Memory: 25.46 GiB
Nanny: tcp://172.28.0.2:45779,
Local directory: /content/drive/MyDrive/LoanStatus/ML/RAPIDS/dask-worker-space/worker-41devz8j,Local directory: /content/drive/MyDrive/LoanStatus/ML/RAPIDS/dask-worker-space/worker-41devz8j
GPU: Tesla P100-PCIE-16GB,GPU memory: 15.90 GiB


In [None]:
# Use one seed value for every random number generator seeded in this cell
seed_value = 42

# Seed NumPy, CuPy and the standard-library generator
np.random.seed(seed_value)
cp.random.seed(seed_value)
random.seed(seed_value)

# NOTE(review): 'KNN_GPU' is an unusual environment variable name for a seed;
# presumably 'PYTHONHASHSEED' was intended -- confirm before relying on it
os.environ['KNN_GPU'] = str(seed_value)

In [None]:
%cd /content/drive/MyDrive/LoanStatus/Data/

/content/drive/MyDrive/LoanStatus/Data


In [None]:
# Load the train and test CSV files with cuDF and report their shapes
trainDF = cudf.read_csv('trainDF_US.csv', low_memory=False)
print('Train set: Number of rows and columns:', trainDF.shape)

testDF = cudf.read_csv('testDF_US.csv', low_memory=False)
print('Test set: Number of rows and columns:', testDF.shape)

# Split each frame into float32 features and an int32 'loan_status' target
y_train = trainDF['loan_status'].astype('int32')
X_train = trainDF.drop('loan_status', axis=1).astype('float32')

y_test = testDF['loan_status'].astype('int32')
X_test = testDF.drop('loan_status', axis=1).astype('float32')

Train set: Number of rows and columns: (3022132, 51)
Test set: Number of rows and columns: (432473, 51)


## Baseline Model

In [None]:
%cd /content/drive/MyDrive/LoanStatus/Python/Models/ML/KNN/Optuna/Model_PKL/

/content/drive/MyDrive/LoanStatus/Python/Models/ML/KNN/Optuna/Model_PKL


In [None]:
# Define model (library default hyperparameters serve as the baseline)
knn = KNeighborsClassifier()

# Fit the model
knn.fit(X_train, y_train)

# Save model
Pkl_Filename = 'KNN_Baseline_US.pkl'

with open(Pkl_Filename, 'wb') as file:
    pickle.dump(knn, file)

# Predict on the training set, then binarize at 0.5.
# CuPy is imported as `cp` in this notebook; the bare name `cupy` is not
# defined, so `cp.where` must be used here.
# NOTE(review): predict() presumably already returns 0/1 labels, which would
# make the round/threshold step a no-op safeguard -- confirm
y_train_pred = knn.predict(X_train)
y_train_pred = y_train_pred.round(2)
y_train_pred = cp.where(y_train_pred > 0.5, 1, 0)

# Same treatment for the test set
y_test_pred = knn.predict(X_test)
y_test_pred = y_test_pred.round(2)
y_test_pred = cp.where(y_test_pred > 0.5, 1, 0)

In [None]:
# Copy labels and predictions to host memory once; the scikit-learn metrics
# below are all fed NumPy arrays, so converting inside every call would repeat
# the same device-to-host transfer many times.
y_train_np, y_train_pred_np = asnumpy(y_train), asnumpy(y_train_pred)
y_test_np, y_test_pred_np = asnumpy(y_test), asnumpy(y_test_pred)

print('\nModel Metrics for KNN Baseline Upsampling')
for split_idx, (split_name, y_true_np, y_hat_np) in enumerate([
        ('Training Set', y_train_np, y_train_pred_np),
        ('Test Set', y_test_np, y_test_pred_np)]):
    if split_idx > 0:
        # Blank separator between the training and test reports
        print('\n')
    print(split_name)
    print('Classification Report:')
    clf_rpt = classification_report(y_true_np, y_hat_np)
    print(clf_rpt)
    print('\n')
    print('Confusion matrix:')
    print(confusion_matrix(y_true_np, y_hat_np))
    print('\n')
    # Precision, recall and F1 use scikit-learn's default (positive class = 1)
    print('Accuracy score : %.3f' % accuracy_score(y_true_np, y_hat_np))
    print('Precision score : %.3f' % precision_score(y_true_np, y_hat_np))
    print('Recall score : %.3f' % recall_score(y_true_np, y_hat_np))
    print('F1 score : %.3f' % f1_score(y_true_np, y_hat_np))


Model Metrics for KNN Baseline Upsampling
Training Set
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.91      0.95   1511066
           1       0.92      1.00      0.96   1511066

    accuracy                           0.96   3022132
   macro avg       0.96      0.96      0.95   3022132
weighted avg       0.96      0.96      0.95   3022132



Confusion matrix:
[[1378332  132734]
 [   3244 1507822]]


Accuracy score : 0.955
Precision score : 0.919
Recall score : 0.998
F1 score : 0.957


Test Set
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.87      0.91    377848
           1       0.45      0.73      0.55     54625

    accuracy                           0.85    432473
   macro avg       0.70      0.80      0.73    432473
weighted avg       0.89      0.85      0.87    432473



Confusion matrix:
[[328356  49492]
 [ 14638  39987]]


Accuracy score : 0.852
Precisi

## Weighted F1

### 100 Trials

In [None]:
%cd /content/drive/MyDrive/LoanStatus/Python/Models/ML/KNN/Optuna/Model_PKL/

# Define a name for the trials
study_name = 'dask_knn_optuna_US_100_weightedF1_tpe'

/content/drive/MyDrive/LoanStatus/Python/Models/ML/KNN/Optuna/Model_PKL


In [None]:
# Define function to train/evaluate the model
def train_and_eval(X_param, y_param, n_neighbors=10,
                   metric='euclidean', verbose=False):
    """
    Partition data into train/test sets, train and evaluate the model
    for the given parameters.

    Params
    ______

    X_param:  DataFrame.
              The data to use for training and testing.
    y_param:  Series.
              The label for training

    Returns
    score: F1 weighted of the fitted model
    """

    # Set up train/test sets
    X_train, y_train = trainDF.drop('loan_status',
                                    axis=1), trainDF['loan_status'].astype('int32')
    X_train = X_train.astype('float32')

    X_test, y_test= testDF.drop('loan_status',
                                axis=1), testDF['loan_status'].astype('int32')
    X_test = X_test.astype('float32')

    # Define model
    model = KNeighborsClassifier(n_neighbors=n_neighbors,
                                 metric=metric,
                                 verbose=verbose)

    # Start timer for each trial
    start = timer()

    # Fit model
    model.fit(X_train, y_train)

    # Predict
    y_pred = model.predict(X_test)
    score = f1_score(y_test.to_numpy(), y_pred.to_numpy(), average='weighted')
    run_time = timer() - start
    print('- Trial time:', run_time)
    print('- Trial weighted F1:', score)
    print('######################################################')

    return score

In [None]:
print('Score with default parameters : ', train_and_eval(X_train, y_train))

- Trial time: 43.876738767000006
- Trial weighted F1: 0.8464943892460045
######################################################
Score with default parameters :  0.8464943892460045


In [None]:
# Define objective function with the parameters to be tested
def objective(trial, X_param, y_param):

    joblib.dump(study, 'KNN_Optuna_US_100_GPU_weightedF1.pkl')

    # Search parameters
    n_neighbors = trial.suggest_int('n_neighbors', 3, 50)
    metric = trial.suggest_categorical('metric', ['euclidean', 'manhattan',
                                                  'chebyshev', 'minkowski'])

    score = train_and_eval(X_param, y_param,
                           n_neighbors=n_neighbors,
                           verbose=False)

    return score

In [None]:
with timed('dask_optuna'):
    # Begin HPO trials
    # Start timer for experiment
    start_time = datetime.now()
    print('%-20s %s' % ('Start Time', start_time))

    # Resume from the checkpoint written by objective() when one exists,
    # otherwise start a new study
    if os.path.isfile('KNN_Optuna_US_100_GPU_weightedF1.pkl'):
        study = joblib.load('KNN_Optuna_US_100_GPU_weightedF1.pkl')
    else:
        # Seed the sampler with the notebook seed so a fresh search is repeatable
        study = optuna.create_study(
            sampler=optuna.samplers.TPESampler(seed=seed_value),
            study_name=study_name,
            direction='maximize')

    # Optimize in parallel on Dask cluster.
    # `parallel_backend` is reached through the imported `joblib` module; the
    # bare name is not imported anywhere in this notebook.
    # NOTE(review): recent Optuna releases run n_jobs with threads, so the
    # joblib backend may have no effect here -- confirm
    with joblib.parallel_backend('dask'):
        # n_trials is the number of trials added by this call, on top of any
        # trials already present in a resumed study
        study.optimize(lambda trial: objective(trial, X_train, y_train),
                       n_trials=100,
                       n_jobs=n_workers)

    # objective() checkpoints only at the start of a trial, so save once more
    # to capture the final completed trial
    joblib.dump(study, 'KNN_Optuna_US_100_GPU_weightedF1.pkl')

# End timer for experiment
end_time = datetime.now()
print('%-20s %s' % ('Start Time', start_time))
print('%-20s %s' % ('End Time', end_time))
# total_seconds() keeps whole days; `.seconds` alone would wrap after 24 hours
print(str(timedelta(seconds=int((end_time - start_time).total_seconds()))))
print('\n')
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
print('Highest F1 Score', study.best_value)

Start Time           2022-06-15 03:22:54.837094
- Trial time: 36.73151022999991
- Trial weighted F1: 0.8374190755723063
######################################################
- Trial time: 37.97200269199993
- Trial weighted F1: 0.8428382202690887
######################################################
- Trial time: 37.7923227010001
- Trial weighted F1: 0.8424545370851392
######################################################
- Trial time: 37.05291739100039
- Trial weighted F1: 0.8515872317469013
######################################################
- Trial time: 37.84789861399986
- Trial weighted F1: 0.8504600270764096
######################################################
- Trial time: 38.02354808799964
- Trial weighted F1: 0.8428382202690887
######################################################
- Trial time: 36.547824082999796
- Trial weighted F1: 0.8467451488047179
######################################################
- Trial time: 38.166246279999996
- Trial weighted F1: 0.8424536

In [None]:
%cd /content/drive/MyDrive/LoanStatus/Python/Models/ML/KNN/Optuna/trialOptions/

/content/drive/MyDrive/LoanStatus/Python/Models/ML/KNN/Optuna/trialOptions


In [None]:
# Collect the per-trial results and give the columns shorter names
trials_df = study.trials_dataframe().rename(columns={
    'number': 'iteration',
    'value': 'f1_weighted',
    'params_metric': 'metric',
    'params_n_neighbors': 'n_neighbors',
})

# Rank the trials best-first, save them to csv and show the table
trials_df = trials_df.sort_values('f1_weighted', ascending=False)
trials_df.to_csv('KNN_Optuna_US_100_GPU_weightedF1.csv', index=False)
print(trials_df)

    iteration  f1_weighted             datetime_start  \
12         12     0.889078 2022-06-15 03:30:23.181521   
68         68     0.889078 2022-06-15 04:04:33.236612   
23         23     0.889078 2022-06-15 03:37:05.704699   
91         91     0.889078 2022-06-15 04:18:31.181151   
61         61     0.889078 2022-06-15 04:00:18.316469   
..        ...          ...                        ...   
49         49     0.832902 2022-06-15 03:52:58.054583   
90         90     0.828817 2022-06-15 04:17:54.663302   
44         44     0.828817 2022-06-15 03:49:54.446572   
75         75     0.828817 2022-06-15 04:08:47.797751   
65         65     0.828774 2022-06-15 04:02:43.640175   

            datetime_complete               duration     metric  n_neighbors  \
12 2022-06-15 03:30:59.490339 0 days 00:00:36.308818  chebyshev            4   
68 2022-06-15 04:05:09.507776 0 days 00:00:36.271164  chebyshev            4   
23 2022-06-15 03:37:41.938225 0 days 00:00:36.233526  chebyshev            

In [None]:
%cd /content/drive/MyDrive/LoanStatus/Python/Models/ML/KNN/Optuna/Model_Explanations/

/content/drive/MyDrive/LoanStatus/Python/Models/ML/KNN/Optuna/Model_Explanations


In [None]:
# Plot_optimization_history: shows the scores from all trials as well as the best score so far at each point.
fig = optuna.visualization.plot_optimization_history(study)
py.plot(fig, filename='optimizationHistory_KNN_Optuna_US_100_GPU_weightedF1.html')
fig.show()

In [None]:
# plot_parallel_coordinate: interactively visualizes the hyperparameters and scores
fig = optuna.visualization.plot_parallel_coordinate(study)
py.plot(fig, filename='parallelCoordinate_KNN_Optuna_US_100_GPU_weightedF1.html')
fig.show()

In [None]:
# plot_slice: shows the change of the hyperparameter space over the search.
fig = optuna.visualization.plot_slice(study)
# Write the figure to an HTML file, then display it
py.plot(fig, filename='slice_KNN_Optuna_US_100_GPU_weightedF1.html')
fig.show()

In [None]:
%cd /content/drive/MyDrive/LoanStatus/Python/Models/ML/KNN/Optuna/Model_PKL/

/content/drive/MyDrive/LoanStatus/Python/Models/ML/KNN/Optuna/Model_PKL


In [None]:
# Rebuild the classifier with the selected hyperparameters and fit it on the
# training data
best_model = KNeighborsClassifier(metric='chebyshev', n_neighbors=4)
best_model.fit(X_train, y_train)

# Persist the fitted model to disk
Pkl_Filename = 'KNN_Optuna_US_trials100_GPU_weightedF1.pkl'
with open(Pkl_Filename, mode='wb') as model_out:
    pickle.dump(best_model, model_out)

# =============================================================================
# # To load saved model
# model = joblib.load('KNN_Optuna_US_trials100_GPU_weightedF1.pkl')
# print(model)
# =============================================================================

In [None]:
print('\nModel Metrics for KNN HPO Upsampling 100trials GPU')
# Predict on both splits; only the test-set predictions are reported below
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Move the test labels and predictions to host memory once instead of once
# per metric call
y_test_np = y_test.to_numpy()
y_test_pred_np = y_test_pred.to_numpy()

print('\n')
print('Classification Report:')
clf_rpt = classification_report(y_test_np, y_test_pred_np)
print(clf_rpt)
print('\n')
print('Confusion matrix:')
print(confusion_matrix(y_test_np, y_test_pred_np))
print('\n')
# Precision, recall and F1 use scikit-learn's default (positive class = 1)
print('Accuracy score : %.3f' % accuracy_score(y_test_np, y_test_pred_np))
print('Precision score : %.3f' % precision_score(y_test_np, y_test_pred_np))
print('Recall score : %.3f' % recall_score(y_test_np, y_test_pred_np))
print('F1 score : %.3f' % f1_score(y_test_np, y_test_pred_np))


Model Metrics for KNN HPO Upsampling 100trials GPU


Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.91      0.93    377848
           1       0.51      0.63      0.56     54625

    accuracy                           0.88    432473
   macro avg       0.73      0.77      0.74    432473
weighted avg       0.89      0.88      0.88    432473



Confusion matrix:
[[344469  33379]
 [ 20218  34407]]


Accuracy score : 0.876
Precision score : 0.508
Recall score : 0.630
F1 score : 0.562


In [None]:
# Score the test-set predictions with ROC AUC, then show the top-ranked trial.
# NOTE(review): y_test_pred comes from predict(), i.e. presumably hard class
# labels rather than probabilities -- confirm this is the intended AUC input
test_auc = roc_auc_score(y_test.to_numpy(), y_test_pred.to_numpy())
print('The best model from Upsampling 100 GPU trials optimization scores {:.5f} AUC ROC on the test set.'.format(test_auc))
print('This was achieved using these conditions:')
print(trials_df.iloc[0])

The best model from Upsampling 100 GPU trials optimization scores 0.77077 AUC ROC on the test set.
This was achieved using these conditions:
iteration                                    12
f1_weighted                            0.889078
datetime_start       2022-06-15 03:30:23.181521
datetime_complete    2022-06-15 03:30:59.490339
duration                 0 days 00:00:36.308818
metric                                chebyshev
n_neighbors                                   4
state                                  COMPLETE
Name: 12, dtype: object


## Weighted ROC

### 100 Trials

In [None]:
%cd /content/drive/MyDrive/LoanStatus/Python/Models/ML/KNN/Optuna/Model_PKL/

# Define a name for the trials
study_name = 'dask_knn_optuna_US_100_weightedROC_tpe'

/content/drive/MyDrive/LoanStatus/Python/Models/ML/KNN/Optuna/Model_PKL


In [None]:
# Define function to train/evaluate the model
def train_and_eval(X_param, y_param, n_neighbors=10,
                   metric='euclidean', verbose=False):
    """
    Partition data into train/test sets, train and evaluate the model
    for the given parameters.

    Params
    ______

    X_param:  DataFrame.
              The data to use for training and testing.
    y_param:  Series.
              The label for training

    Returns
    score: ROCweighted of the fitted model
    """

    # Set up train/test sets
    X_train, y_train = trainDF.drop('loan_status',
                                    axis=1), trainDF['loan_status'].astype('int32')
    X_train = X_train.astype('float32')

    X_test, y_test= testDF.drop('loan_status',
                                axis=1), testDF['loan_status'].astype('int32')
    X_test = X_test.astype('float32')

    # Define model
    model = KNeighborsClassifier(n_neighbors=n_neighbors,
                                 metric=metric,
                                 verbose=verbose)

    # Start timer for each trial
    start = timer()

    # Fit model
    model.fit(X_train, y_train)

    # Predict
    y_pred = model.predict(X_test)
    score = roc_auc_score(y_test.to_numpy(), y_pred.to_numpy(),
                          average='weighted')
    run_time = timer() - start
    print('- Trial time:', run_time)
    print('- Trial weighted ROC:', score)
    print('######################################################')

    return score

In [None]:
print('Score with default parameters : ', train_and_eval(X_train, y_train))

- Trial time: 40.58693484600008
- Trial weighted ROC: 0.7996328730398388
######################################################
Score with default parameters :  0.7996328730398388


In [None]:
# Define objective function with the parameters to be tested
def objective(trial, X_param, y_param):

    joblib.dump(study, 'KNN_Optuna_US_100_GPU_ROCweighted.pkl')

    # Search parameters
    n_neighbors = trial.suggest_int('n_neighbors', 3, 50)
    metric = trial.suggest_categorical('metric', ['euclidean', 'manhattan',
                                                  'chebyshev', 'minkowski'])

    score = train_and_eval(X_param, y_param,
                           n_neighbors=n_neighbors,
                           verbose=False)

    return score

In [None]:
with timed('dask_optuna'):
    # Begin HPO trials
    # Start timer for experiment
    start_time = datetime.now()
    print('%-20s %s' % ('Start Time', start_time))

    # Resume from the checkpoint written by objective() when one exists,
    # otherwise start a new study
    if os.path.isfile('KNN_Optuna_US_100_GPU_ROCweighted.pkl'):
        study = joblib.load('KNN_Optuna_US_100_GPU_ROCweighted.pkl')
    else:
        # Seed the sampler with the notebook seed so a fresh search is repeatable
        study = optuna.create_study(
            sampler=optuna.samplers.TPESampler(seed=seed_value),
            study_name=study_name,
            direction='maximize')

    # Optimize in parallel on Dask cluster.
    # `parallel_backend` is reached through the imported `joblib` module; the
    # bare name is not imported anywhere in this notebook.
    # NOTE(review): recent Optuna releases run n_jobs with threads, so the
    # joblib backend may have no effect here -- confirm
    with joblib.parallel_backend('dask'):
        # n_trials is the number of trials added by this call, on top of any
        # trials already present in a resumed study
        study.optimize(lambda trial: objective(trial, X_train, y_train),
                       n_trials=18,
                       n_jobs=n_workers)

    # objective() checkpoints only at the start of a trial, so save once more
    # to capture the final completed trial
    joblib.dump(study, 'KNN_Optuna_US_100_GPU_ROCweighted.pkl')

# End timer for experiment
end_time = datetime.now()
print('%-20s %s' % ('Start Time', start_time))
print('%-20s %s' % ('End Time', end_time))
# total_seconds() keeps whole days; `.seconds` alone would wrap after 24 hours
print(str(timedelta(seconds=int((end_time - start_time).total_seconds()))))
print('\n')
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
print('Highest ROCweighted score', study.best_value)

Start Time           2022-06-15 21:24:38.840344
- Trial time: 43.10649899299983
- Trial weighted ROC: 0.8210036468116899
######################################################
- Trial time: 38.12456896499998
- Trial weighted ROC: 0.8210896894018187
######################################################
- Trial time: 38.19675766099999
- Trial weighted ROC: 0.8233056268264642
######################################################
- Trial time: 38.054852979000316
- Trial weighted ROC: 0.8209905309349874
######################################################
- Trial time: 38.092378756000016
- Trial weighted ROC: 0.8210896894018187
######################################################
- Trial time: 38.073050967999734
- Trial weighted ROC: 0.8230114924471463
######################################################
- Trial time: 38.02229803099999
- Trial weighted ROC: 0.8229994440634949
######################################################
- Trial time: 38.00276728400013
- Trial weighted ROC:

In [None]:
%cd /content/drive/MyDrive/LoanStatus/Python/Models/ML/KNN/Optuna/trialOptions/

/content/drive/MyDrive/LoanStatus/Python/Models/ML/KNN/Optuna/trialOptions


In [None]:
# Collect the per-trial results and give the columns shorter names
trials_df = study.trials_dataframe().rename(columns={
    'number': 'iteration',
    'value': 'roc_weighted',
    'params_metric': 'metric',
    'params_n_neighbors': 'n_neighbors',
})

# Rank the trials best-first, save them to csv and show the table
trials_df = trials_df.sort_values('roc_weighted', ascending=False)
trials_df.to_csv('KNN_Optuna_US_100_GPU_ROCweighted.csv', index=False)
print(trials_df)

    iteration  roc_weighted             datetime_start  \
99         99      0.823306 2022-06-15 21:35:33.755352   
90         90      0.823306 2022-06-15 21:29:50.386339   
76         76      0.823306 2022-06-15 20:50:44.407581   
66         66      0.823306 2022-06-15 20:44:21.156543   
64         64      0.823306 2022-06-15 19:47:40.800688   
..        ...           ...                        ...   
0           0      0.799201 2022-06-15 19:05:53.254249   
47         47      0.794561 2022-06-15 19:35:31.565535   
62         62           NaN 2022-06-15 19:45:01.862845   
65         65           NaN 2022-06-15 19:48:19.086273   
81         81           NaN 2022-06-15 20:53:55.354468   

            datetime_complete               duration     metric  n_neighbors  \
99 2022-06-15 21:36:12.007554 0 days 00:00:38.252202  chebyshev         50.0   
90 2022-06-15 21:30:28.704243 0 days 00:00:38.317904  euclidean         50.0   
76 2022-06-15 20:51:22.664706 0 days 00:00:38.257125  manhattan

In [None]:
%cd /content/drive/MyDrive/LoanStatus/Python/Models/ML/KNN/Optuna/Model_Explanations/

/content/drive/MyDrive/LoanStatus/Python/Models/ML/KNN/Optuna/Model_Explanations


In [None]:
# Plot_optimization_history: shows the scores from all trials as well as the best score so far at each point.
fig = optuna.visualization.plot_optimization_history(study)
py.plot(fig, filename='optimizationHistory_KNN_Optuna_US_100_GPU_ROCweighted.html')
fig.show()

In [None]:
# plot_parallel_coordinate: interactively visualizes the hyperparameters and scores
fig = optuna.visualization.plot_parallel_coordinate(study)
py.plot(fig, filename='parallelCoordinate_KNN_Optuna_US_100_GPU_ROCweighted.html')
fig.show()

In [None]:
# plot_slice: shows the change of the hyperparameter space over the search.
fig = optuna.visualization.plot_slice(study)
# Write the figure to an HTML file, then display it
py.plot(fig, filename='slice_KNN_Optuna_US_100_GPU_ROCweighted.html')
fig.show()

In [None]:
%cd /content/drive/MyDrive/LoanStatus/Python/Models/ML/KNN/Optuna/Model_Explanations/

/content/drive/MyDrive/LoanStatus/Python/Models/ML/KNN/Optuna/Model_Explanations


In [None]:
# Visualize parameter importances
fig = optuna.visualization.plot_param_importances(study)
py.plot(fig, filename='paramImportance_KNN_Optuna_US_100_GPU_ROCweighted.html')
fig.show()

In [None]:
# Visualize empirical distribution function
fig = optuna.visualization.plot_edf(study)
py.plot(fig, filename='edf_KNN_Optuna_US_100_GPU_ROCweighted.html')
fig.show()

In [None]:
# Arrange best parameters to fit model for model metrics
params = study.best_params
params

{'metric': 'euclidean', 'n_neighbors': 50}

In [None]:
%cd /content/drive/MyDrive/LoanStatus/Python/Models/ML/KNN/Optuna/Model_PKL/

/content/drive/MyDrive/LoanStatus/Python/Models/ML/KNN/Optuna/Model_PKL


In [None]:
# Re-create the best model and train on the training data.
# Build it from `params` (the study's best parameters collected earlier in the
# notebook) instead of retyping the values by hand, so the fitted model always
# matches what the search actually selected.
best_model = KNeighborsClassifier(**params)

# Fit the model
best_model.fit(X_train, y_train)

# Save model
Pkl_Filename = 'KNN_Optuna_US_trials100_GPU_ROCweighted.pkl'

with open(Pkl_Filename, 'wb') as file:
    pickle.dump(best_model, file)

# =============================================================================
# # To load saved model
# model = joblib.load('KNN_Optuna_US_trials100_GPU_ROCweighted.pkl')
# print(model)
# =============================================================================

In [None]:
print('\nModel Metrics for KNN HPO Upsampling 100trials GPU ROCweighted')
# Predict on both splits; only the test-set predictions are reported below
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Move the test labels and predictions to host memory once instead of once
# per metric call
y_test_np = y_test.to_numpy()
y_test_pred_np = y_test_pred.to_numpy()

print('\n')
print('Classification Report:')
clf_rpt = classification_report(y_test_np, y_test_pred_np)
print(clf_rpt)
print('\n')
print('Confusion matrix:')
print(confusion_matrix(y_test_np, y_test_pred_np))
print('\n')
# Precision, recall and F1 use scikit-learn's default (positive class = 1)
print('Accuracy score : %.3f' % accuracy_score(y_test_np, y_test_pred_np))
print('Precision score : %.3f' % precision_score(y_test_np, y_test_pred_np))
print('Recall score : %.3f' % recall_score(y_test_np, y_test_pred_np))
print('F1 score : %.3f' % f1_score(y_test_np, y_test_pred_np))


Model Metrics for KNN HPO Upsampling 100trials GPU ROCweighted


Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.83      0.89    377848
           1       0.41      0.82      0.54     54625

    accuracy                           0.83    432473
   macro avg       0.69      0.82      0.72    432473
weighted avg       0.90      0.83      0.85    432473



Confusion matrix:
[[313077  64771]
 [  9940  44685]]


Accuracy score : 0.827
Precision score : 0.408
Recall score : 0.818
F1 score : 0.545


In [None]:
# Score the test-set predictions with ROC AUC, then show the top-ranked trial.
# NOTE(review): y_test_pred comes from predict(), i.e. presumably hard class
# labels rather than probabilities -- confirm this is the intended AUC input
test_auc = roc_auc_score(y_test.to_numpy(), y_test_pred.to_numpy())
print('The best model from Upsampling 100 ROCweighted GPU trials optimization scores {:.5f} AUC ROC on the test set.'.format(test_auc))
print('This was achieved using these conditions:')
print(trials_df.iloc[0])

The best model from Upsampling 100 ROCweighted GPU trials optimization scores 0.82331 AUC ROC on the test set.
This was achieved using these conditions:
iteration                                    68
recall                                 0.825007
datetime_start       2022-06-15 22:19:42.657405
datetime_complete    2022-06-15 22:20:20.874263
duration                 0 days 00:00:38.216858
metric                                minkowski
n_neighbors                                  49
state                                  COMPLETE
Name: 68, dtype: object


## Recall

### 100 Trials

In [None]:
%cd /content/drive/MyDrive/LoanStatus/Python/Models/ML/KNN/Optuna/Model_PKL/

# Define a name for the trials
study_name = 'dask_knn_optuna_US_100_Recall_tpe'

/content/drive/MyDrive/LoanStatus/Python/Models/ML/KNN/Optuna/Model_PKL


In [None]:
# Define function to train/evaluate the model
def train_and_eval(X_param, y_param, n_neighbors=10,
                   metric='euclidean', verbose=False):
    """
    Fit a KNN classifier on the training frame and report its recall on
    the test frame.

    The train/test sets are rebuilt on every call from the notebook-level
    ``trainDF`` and ``testDF`` frames: every column except ``loan_status``
    is cast to float32 and used as a feature, while ``loan_status`` is cast
    to int32 and used as the label.

    Params
    ______

    X_param:      Not read by the body; the features come from ``trainDF``
                  and ``testDF``.
    y_param:      Not read by the body; the labels come from ``trainDF``
                  and ``testDF``.
    n_neighbors:  Forwarded to ``KNeighborsClassifier``.
    metric:       Forwarded to ``KNeighborsClassifier``.
    verbose:      Forwarded to ``KNeighborsClassifier``.

    Returns
    score: Recall of the fitted model on the test frame
    """
    label = 'loan_status'

    # Features as float32, label as int32 -- same casts for both splits
    features_train = trainDF.drop(label, axis=1).astype('float32')
    target_train = trainDF[label].astype('int32')

    features_test = testDF.drop(label, axis=1).astype('float32')
    target_test = testDF[label].astype('int32')

    knn = KNeighborsClassifier(n_neighbors=n_neighbors,
                               metric=metric,
                               verbose=verbose)

    # The trial clock covers fitting, prediction and scoring
    t0 = timer()
    knn.fit(features_train, target_train)
    predictions = knn.predict(features_test)
    score = recall_score(target_test.to_numpy(), predictions.to_numpy())
    elapsed = timer() - t0

    print('- Trial time:', elapsed)
    print('- Trial recall score:', score)
    print('######################################################')

    return score

In [None]:
# Baseline before HPO: train_and_eval is called with its defaults
# (n_neighbors=10, metric='euclidean').
print('Score with default parameters : ', train_and_eval(X_train, y_train))

- Trial time: 36.36414958399973
- Trial recall score: 0.7653272311212814
######################################################
Score with default parameters :  0.7653272311212814


In [None]:
# Define objective function with the parameters to be tested
def objective(trial, X_param, y_param):
    """
    Optuna objective: sample KNN hyperparameters and return the trial's recall.

    Params
    ______

    trial:    Optuna trial used to sample ``n_neighbors`` (integer in 3..50)
              and ``metric`` (one of euclidean / manhattan / chebyshev /
              minkowski).
    X_param:  Forwarded unchanged to ``train_and_eval``.
    y_param:  Forwarded unchanged to ``train_and_eval``.

    Returns
    score: The value returned by ``train_and_eval`` for the sampled
           hyperparameters.
    """

    # Checkpoint the notebook-level `study` at the start of every trial so
    # the HPO cell can reload it from this file if the run is interrupted.
    joblib.dump(study, 'KNN_Optuna_US_100_GPU_Recall.pkl')

    # Search parameters
    n_neighbors = trial.suggest_int('n_neighbors', 3, 50)
    metric = trial.suggest_categorical('metric', ['euclidean', 'manhattan',
                                                  'chebyshev', 'minkowski'])

    # The sampled metric must be forwarded too; otherwise every trial is
    # silently evaluated with train_and_eval's default metric and the
    # recorded 'metric' parameter has no effect on the score.
    score = train_and_eval(X_param, y_param,
                           n_neighbors=n_neighbors,
                           metric=metric,
                           verbose=False)

    return score

In [None]:
with timed('dask_optuna'):
    # Begin HPO trials
    # Start timer for experiment
    start_time = datetime.now()
    print('%-20s %s' % ('Start Time', start_time))

    # Resume from the checkpoint that objective() writes at the start of each
    # trial, if one exists in the current directory; otherwise start fresh.
    if os.path.isfile('KNN_Optuna_US_100_GPU_Recall.pkl'):
        study = joblib.load('KNN_Optuna_US_100_GPU_Recall.pkl')
    else:
        study = optuna.create_study(sampler=optuna.samplers.TPESampler(),
                                    study_name=study_name,
                                    direction='maximize')

    # Optimize in parallel on Dask cluster
    # NOTE(review): n_trials is the number of trials run by this call, so a
    # resumed study ends up with more than 100 trials in total.
    with parallel_backend('dask'):
        study.optimize(lambda trial: objective(trial, X_train, y_train),
                       n_trials=100,
                       n_jobs=n_workers)

# End timer for experiment
end_time = datetime.now()
print('%-20s %s' % ('Start Time', start_time))
print('%-20s %s' % ('End Time', end_time))
# total_seconds() covers the whole interval; the .seconds attribute is only
# the sub-day component and would drop full days from a long run.
print(str(timedelta(seconds=int((end_time - start_time).total_seconds()))))
print('\n')
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
print('Highest Recall score', study.best_value)

Start Time           2022-06-15 21:36:51.770029
- Trial time: 36.831561962999785
- Trial recall score: 0.807908466819222
######################################################
- Trial time: 37.766916992000006
- Trial recall score: 0.8192951945080091
######################################################
- Trial time: 37.07688920700002
- Trial recall score: 0.8045766590389016
######################################################
- Trial time: 36.087122979000014
- Trial recall score: 0.6760091533180778
######################################################
- Trial time: 36.71398151299991
- Trial recall score: 0.8046315789473685
######################################################
- Trial time: 37.80939827800012
- Trial recall score: 0.8134736842105263
######################################################
- Trial time: 36.26083092199997
- Trial recall score: 0.7525308924485126
######################################################
- Trial time: 36.49055047799993
- Trial recall score: 

In [None]:
%cd /content/drive/MyDrive/LoanStatus/Python/Models/ML/KNN/Optuna/trialOptions/

/content/drive/MyDrive/LoanStatus/Python/Models/ML/KNN/Optuna/trialOptions


In [None]:
# Output from HPO trials: pull the trial log out of the study and rename the
# Optuna column names in a single pass
trials_df = study.trials_dataframe().rename(columns={
    'number': 'iteration',
    'value': 'recall',
    'params_metric': 'metric',
    'params_n_neighbors': 'n_neighbors',
})

# Rank the trials best-first, then write the processed data to csv
trials_df = trials_df.sort_values('recall', ascending=False)
trials_df.to_csv('KNN_Optuna_US_100_GPU_Recall.csv', index=False)
print(trials_df)

    iteration    recall             datetime_start          datetime_complete  \
68         68  0.825007 2022-06-15 22:19:42.657405 2022-06-15 22:20:20.874263   
71         71  0.825007 2022-06-15 22:21:36.213671 2022-06-15 22:22:14.454067   
76         76  0.825007 2022-06-15 22:24:47.466442 2022-06-15 22:25:25.785740   
75         75  0.825007 2022-06-15 22:24:09.250967 2022-06-15 22:24:47.466222   
74         74  0.825007 2022-06-15 22:23:31.012075 2022-06-15 22:24:09.250773   
..        ...       ...                        ...                        ...   
7           7  0.776824 2022-06-15 21:41:11.596547 2022-06-15 21:41:48.316999   
60         60  0.765327 2022-06-15 22:14:38.744249 2022-06-15 22:15:15.249638   
6           6  0.752531 2022-06-15 21:40:35.168725 2022-06-15 21:41:11.596430   
45         45  0.732064 2022-06-15 22:05:11.967874 2022-06-15 22:05:48.265353   
3           3  0.676009 2022-06-15 21:38:44.007715 2022-06-15 21:39:20.245178   

                 duration  

In [None]:
%cd /content/drive/MyDrive/LoanStatus/Python/Models/ML/KNN/Optuna/Model_Explanations/

/content/drive/MyDrive/LoanStatus/Python/Models/ML/KNN/Optuna/Model_Explanations


In [None]:
# plot_optimization_history: shows the scores from all trials as well as the
# best score so far at each point.
# The figure is written to the named HTML file via py.plot and then shown in
# the notebook (NOTE(review): `py` is presumably plotly's offline plotting
# module imported earlier -- confirm).
fig = optuna.visualization.plot_optimization_history(study)
py.plot(fig, filename='optimizationHistory_KNN_Optuna_US_100_GPU_Recall.html')
fig.show()

In [None]:
# plot_parallel_coordinate: interactively visualizes the hyperparameters and
# scores of the study's trials.
# Written to the named HTML file via py.plot, then shown in the notebook.
fig = optuna.visualization.plot_parallel_coordinate(study)
py.plot(fig, filename='parallelCoordinate_KNN_Optuna_US_100_GPU_Recall.html')
fig.show()

In [None]:
# plot_slice: shows the change of the hyperparameter space over the search.
# Written to the named HTML file via py.plot, then shown in the notebook.
fig = optuna.visualization.plot_slice(study)
py.plot(fig, filename='slice_KNN_Optuna_US_100_GPU_Recall.html')
fig.show()

In [None]:
# Visualize parameter importances for the study.
# Written to the named HTML file via py.plot, then shown in the notebook.
fig = optuna.visualization.plot_param_importances(study)
py.plot(fig, filename='paramImportance_KNN_Optuna_US_100_GPU_Recall.html')
fig.show()

In [None]:
# Visualize the empirical distribution function of the study's trial scores.
# Written to the named HTML file via py.plot, then shown in the notebook.
fig = optuna.visualization.plot_edf(study)
py.plot(fig, filename='edf_KNN_Optuna_US_100_GPU_Recall.html')
fig.show()

In [None]:
# Arrange best parameters to fit model for model metrics.
# `params` holds the hyperparameter dict of the study's best trial; the bare
# expression on the last line makes the notebook display it as cell output.
params = study.best_params
params

{'metric': 'minkowski', 'n_neighbors': 49}

In [None]:
%cd /content/drive/MyDrive/LoanStatus/Python/Models/ML/KNN/Optuna/Model_PKL/

/content/drive/MyDrive/LoanStatus/Python/Models/ML/KNN/Optuna/Model_PKL


In [None]:
# Re-create the best model and train on the training data.
# The hyperparameters are taken directly from the study instead of being
# retyped by hand: the previous hard-coded n_neighbors=50 did not match the
# best trial reported by study.best_params (n_neighbors=49), so the saved and
# evaluated model was not the one the search selected.
best_model = KNeighborsClassifier(**study.best_params)

# Fit the model
best_model.fit(X_train, y_train)

# Save model
Pkl_Filename = 'KNN_Optuna_US_trials100_GPU_Recall.pkl'

with open(Pkl_Filename, 'wb') as file:
    pickle.dump(best_model, file)

# =============================================================================
# # To load saved model (written with pickle.dump above)
# with open('KNN_Optuna_US_trials100_GPU_Recall.pkl', 'rb') as file:
#     model = pickle.load(file)
# print(model)
# =============================================================================

In [None]:
print('\nModel Metrics for KNN HPO US 100trials GPU Recall')
# Predict on both splits; only the test predictions are reported in this cell
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Convert labels and predictions to NumPy once and reuse them for every metric
y_test_np = y_test.to_numpy()
y_test_pred_np = y_test_pred.to_numpy()

print('\n')
print('Classification Report:')
clf_rpt = classification_report(y_test_np, y_test_pred_np)
print(clf_rpt)
print('\n')
print('Confusion matrix:')
print(confusion_matrix(y_test_np, y_test_pred_np))
print('\n')

# Scalar metrics, printed in the same order and format as before
for line_format, metric_fn in (('Accuracy score : %.3f', accuracy_score),
                               ('Precision score : %.3f', precision_score),
                               ('Recall score : %.3f', recall_score),
                               ('F1 score : %.3f', f1_score)):
    print(line_format % metric_fn(y_test_np, y_test_pred_np))


Model Metrics for KNN HPO US 100trials GPU Recall


Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.83      0.89    377848
           1       0.41      0.82      0.54     54625

    accuracy                           0.83    432473
   macro avg       0.69      0.82      0.72    432473
weighted avg       0.90      0.83      0.85    432473



Confusion matrix:
[[313077  64771]
 [  9940  44685]]


Accuracy score : 0.827
Precision score : 0.408
Recall score : 0.818
F1 score : 0.545


In [None]:
# Evaluate on the testing data.
# y_test_pred holds the output of best_model.predict from the metrics cell,
# so the score below is computed from predicted classes.
test_auc = roc_auc_score(y_test.to_numpy(), y_test_pred.to_numpy())
print('The best model from US 100 Recall GPU trials optimization scores {:.5f} AUC ROC on the test set.'.format(test_auc))
print('This was achieved using these conditions:')
# trials_df was sorted by recall in descending order, so row 0 is the top trial
print(trials_df.iloc[0])

The best model from US 100 Recall GPU trials optimization scores 0.82331 AUC ROC on the test set.
This was achieved using these conditions:
iteration                                    68
recall                                 0.825007
datetime_start       2022-06-15 22:19:42.657405
datetime_complete    2022-06-15 22:20:20.874263
duration                 0 days 00:00:38.216858
metric                                minkowski
n_neighbors                                  49
state                                  COMPLETE
Name: 68, dtype: object


## Precision

### 100 Trials

In [None]:
# Work from the Model_PKL folder: the study checkpoint written during the HPO
# trials below is addressed by a relative filename.
%cd /content/drive/MyDrive/LoanStatus/Python/Models/ML/KNN/Optuna/Model_PKL/

# Define a name for the trials
# (passed as study_name to optuna.create_study when a new study is created)
study_name = 'dask_knn_optuna_US_100_Precision_tpe'

/content/drive/MyDrive/LoanStatus/Python/Models/ML/KNN/Optuna/Model_PKL


In [None]:
# Define function to train/evaluate the model
def train_and_eval(X_param, y_param, n_neighbors=10,
                   metric='euclidean', verbose=False):
    """
    Fit a KNN classifier on the training frame and report its precision on
    the test frame.

    The train/test sets are rebuilt on every call from the notebook-level
    ``trainDF`` and ``testDF`` frames: every column except ``loan_status``
    is cast to float32 and used as a feature, while ``loan_status`` is cast
    to int32 and used as the label.

    Params
    ______

    X_param:      Not read by the body; the features come from ``trainDF``
                  and ``testDF``.
    y_param:      Not read by the body; the labels come from ``trainDF``
                  and ``testDF``.
    n_neighbors:  Forwarded to ``KNeighborsClassifier``.
    metric:       Forwarded to ``KNeighborsClassifier``.
    verbose:      Forwarded to ``KNeighborsClassifier``.

    Returns
    score: Precision of the fitted model on the test frame
    """
    label = 'loan_status'

    # Features as float32, label as int32 -- same casts for both splits
    features_train = trainDF.drop(label, axis=1).astype('float32')
    target_train = trainDF[label].astype('int32')

    features_test = testDF.drop(label, axis=1).astype('float32')
    target_test = testDF[label].astype('int32')

    knn = KNeighborsClassifier(n_neighbors=n_neighbors,
                               metric=metric,
                               verbose=verbose)

    # The trial clock covers fitting, prediction and scoring
    t0 = timer()
    knn.fit(features_train, target_train)
    predictions = knn.predict(features_test)
    score = precision_score(target_test.to_numpy(), predictions.to_numpy())
    elapsed = timer() - t0

    print('- Trial time:', elapsed)
    print('- Trial precision score:', score)
    print('######################################################')

    return score

In [None]:
# Baseline before HPO: train_and_eval is called with its defaults
# (n_neighbors=10, metric='euclidean').
print('Score with default parameters : ', train_and_eval(X_train, y_train))

- Trial time: 36.337093423000624
- Trial precision score: 0.3998584436452674
######################################################
Score with default parameters :  0.3998584436452674


In [None]:
# Define objective function with the parameters to be tested
def objective(trial, X_param, y_param):
    """
    Optuna objective: sample KNN hyperparameters and return the trial's
    precision.

    Params
    ______

    trial:    Optuna trial used to sample ``n_neighbors`` (integer in 3..50)
              and ``metric`` (one of euclidean / manhattan / chebyshev /
              minkowski).
    X_param:  Forwarded unchanged to ``train_and_eval``.
    y_param:  Forwarded unchanged to ``train_and_eval``.

    Returns
    score: The value returned by ``train_and_eval`` for the sampled
           hyperparameters.
    """

    # Checkpoint the notebook-level `study` at the start of every trial so
    # the HPO cell can reload it from this file if the run is interrupted.
    joblib.dump(study, 'KNN_Optuna_US_100_GPU_Precision.pkl')

    # Search parameters
    n_neighbors = trial.suggest_int('n_neighbors', 3, 50)
    metric = trial.suggest_categorical('metric', ['euclidean', 'manhattan',
                                                  'chebyshev', 'minkowski'])

    # The sampled metric must be forwarded too; otherwise every trial is
    # silently evaluated with train_and_eval's default metric and the
    # recorded 'metric' parameter has no effect on the score.
    score = train_and_eval(X_param, y_param,
                           n_neighbors=n_neighbors,
                           metric=metric,
                           verbose=False)

    return score

In [None]:
with timed('dask_optuna'):
    # Begin HPO trials
    # Start timer for experiment
    start_time = datetime.now()
    print('%-20s %s' % ('Start Time', start_time))

    # Resume from the checkpoint that objective() writes at the start of each
    # trial, if one exists in the current directory; otherwise start fresh.
    if os.path.isfile('KNN_Optuna_US_100_GPU_Precision.pkl'):
        study = joblib.load('KNN_Optuna_US_100_GPU_Precision.pkl')
    else:
        study = optuna.create_study(sampler=optuna.samplers.TPESampler(),
                                    study_name=study_name,
                                    direction='maximize')

    # Optimize in parallel on Dask cluster
    # NOTE(review): n_trials is the number of trials run by this call, so a
    # resumed study ends up with more than 100 trials in total.
    with parallel_backend('dask'):
        study.optimize(lambda trial: objective(trial, X_train, y_train),
                       n_trials=100,
                       n_jobs=n_workers)

# End timer for experiment
end_time = datetime.now()
print('%-20s %s' % ('Start Time', start_time))
print('%-20s %s' % ('End Time', end_time))
# total_seconds() covers the whole interval; the .seconds attribute is only
# the sub-day component and would drop full days from a long run.
print(str(timedelta(seconds=int((end_time - start_time).total_seconds()))))
print('\n')
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
print('Highest Precision score', study.best_value)

Start Time           2022-06-16 00:04:13.980311
- Trial time: 36.338453387999834
- Trial precision score: 0.4022552419006272
######################################################
- Trial time: 36.364207304000956
- Trial precision score: 0.37684343745056337
######################################################
- Trial time: 36.11507237800106
- Trial precision score: 0.5250940673890288
######################################################
- Trial time: 36.182693877999554
- Trial precision score: 0.4468843592150552
######################################################
- Trial time: 36.868009486999654
- Trial precision score: 0.39305658226382495
######################################################
- Trial time: 36.39570619400001
- Trial precision score: 0.3675517475398711
######################################################
- Trial time: 36.26981502499984
- Trial precision score: 0.4022552419006272
######################################################
- Trial time: 36.350374858000

In [None]:
%cd /content/drive/MyDrive/LoanStatus/Python/Models/ML/KNN/Optuna/trialOptions/

/content/drive/MyDrive/LoanStatus/Python/Models/ML/KNN/Optuna/trialOptions


In [None]:
# Output from HPO trials: pull the trial log out of the study and rename the
# Optuna column names in a single pass
trials_df = study.trials_dataframe().rename(columns={
    'number': 'iteration',
    'value': 'precision',
    'params_metric': 'metric',
    'params_n_neighbors': 'n_neighbors',
})

# Rank the trials best-first, then write the processed data to csv
trials_df = trials_df.sort_values('precision', ascending=False)
trials_df.to_csv('KNN_Optuna_US_100_GPU_Precision.csv', index=False)
print(trials_df)

    iteration  precision             datetime_start  \
49         49   0.529366 2022-06-15 23:10:41.997432   
40         40   0.529366 2022-06-15 23:05:14.182565   
94         94   0.529366 2022-06-16 00:28:37.931709   
92         92   0.529366 2022-06-16 00:27:25.093615   
90         90   0.529366 2022-06-16 00:26:11.935157   
..        ...        ...                        ...   
38         38   0.367740 2022-06-15 23:03:59.379737   
46         46   0.367552 2022-06-15 23:08:52.343453   
16         16   0.367552 2022-06-15 22:50:34.169544   
59         59   0.367552 2022-06-16 00:07:16.940574   
53         53        NaN 2022-06-15 23:13:08.178572   

            datetime_complete               duration     metric  n_neighbors  \
49 2022-06-15 23:11:18.324032 0 days 00:00:36.326600  euclidean          4.0   
40 2022-06-15 23:05:50.447242 0 days 00:00:36.264677  euclidean          4.0   
94 2022-06-16 00:29:14.289018 0 days 00:00:36.357309  euclidean          4.0   
92 2022-06-16 00:28

In [None]:
%cd /content/drive/MyDrive/LoanStatus/Python/Models/ML/KNN/Optuna/Model_Explanations/

/content/drive/MyDrive/LoanStatus/Python/Models/ML/KNN/Optuna/Model_Explanations


In [None]:
# plot_optimization_history: shows the scores from all trials as well as the
# best score so far at each point.
# The figure is written to the named HTML file via py.plot and then shown in
# the notebook (NOTE(review): `py` is presumably plotly's offline plotting
# module imported earlier -- confirm).
fig = optuna.visualization.plot_optimization_history(study)
py.plot(fig, filename='optimizationHistory_KNN_Optuna_US_100_GPU_Precision.html')
fig.show()

In [None]:
# plot_parallel_coordinate: interactively visualizes the hyperparameters and
# scores of the study's trials.
# Written to the named HTML file via py.plot, then shown in the notebook.
fig = optuna.visualization.plot_parallel_coordinate(study)
py.plot(fig, filename='parallelCoordinate_KNN_Optuna_US_100_GPU_Precision.html')
fig.show()

In [None]:
# plot_slice: shows the change of the hyperparameter space over the search.
# Written to the named HTML file via py.plot, then shown in the notebook.
fig = optuna.visualization.plot_slice(study)
py.plot(fig, filename='slice_KNN_Optuna_US_100_GPU_Precision.html')
fig.show()

In [None]:
# Visualize parameter importances for the study.
# Written to the named HTML file via py.plot, then shown in the notebook.
fig = optuna.visualization.plot_param_importances(study)
py.plot(fig, filename='paramImportance_KNN_Optuna_US_100_GPU_Precision.html')
fig.show()

In [None]:
# Visualize the empirical distribution function of the study's trial scores.
# Written to the named HTML file via py.plot, then shown in the notebook.
fig = optuna.visualization.plot_edf(study)
py.plot(fig, filename='edf_KNN_Optuna_US_100_GPU_Precision.html')
fig.show()

In [None]:
# Arrange best parameters to fit model for model metrics.
# `params` holds the hyperparameter dict of the study's best trial; the bare
# expression on the last line makes the notebook display it as cell output.
params = study.best_params
params

{'metric': 'manhattan', 'n_neighbors': 4}

In [None]:
%cd /content/drive/MyDrive/LoanStatus/Python/Models/ML/KNN/Optuna/Model_PKL/

/content/drive/MyDrive/LoanStatus/Python/Models/ML/KNN/Optuna/Model_PKL


In [None]:
# Re-create the best model and train on the training data.
# The hyperparameters are taken directly from the study rather than retyped
# by hand, so the refit model cannot drift from the best trial when the
# search is re-run (study.best_params supplies 'metric' and 'n_neighbors').
best_model = KNeighborsClassifier(**study.best_params)

# Fit the model
best_model.fit(X_train, y_train)

# Save model
Pkl_Filename = 'KNN_Optuna_US_trials100_GPU_Precision.pkl'

with open(Pkl_Filename, 'wb') as file:
    pickle.dump(best_model, file)

# =============================================================================
# # To load saved model (written with pickle.dump above)
# with open('KNN_Optuna_US_trials100_GPU_Precision.pkl', 'rb') as file:
#     model = pickle.load(file)
# print(model)
# =============================================================================

In [None]:
print('\nModel Metrics for KNN HPO US 100trials GPU Precision')
# Predict on both splits; only the test predictions are reported in this cell
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Convert labels and predictions to NumPy once and reuse them for every metric
y_test_np = y_test.to_numpy()
y_test_pred_np = y_test_pred.to_numpy()

print('\n')
print('Classification Report:')
clf_rpt = classification_report(y_test_np, y_test_pred_np)
print(clf_rpt)
print('\n')
print('Confusion matrix:')
print(confusion_matrix(y_test_np, y_test_pred_np))
print('\n')

# Scalar metrics, printed in the same order and format as before
for line_format, metric_fn in (('Accuracy score : %.3f', accuracy_score),
                               ('Precision score : %.3f', precision_score),
                               ('Recall score : %.3f', recall_score),
                               ('F1 score : %.3f', f1_score)):
    print(line_format % metric_fn(y_test_np, y_test_pred_np))


Model Metrics for KNN HPO US 100trials GPU Precision


Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.92      0.94    377848
           1       0.56      0.70      0.62     54625

    accuracy                           0.89    432473
   macro avg       0.76      0.81      0.78    432473
weighted avg       0.91      0.89      0.90    432473



Confusion matrix:
[[347439  30409]
 [ 16216  38409]]


Accuracy score : 0.892
Precision score : 0.558
Recall score : 0.703
F1 score : 0.622


In [None]:
# Evaluate on the testing data.
# y_test_pred holds the output of best_model.predict from the metrics cell,
# so the score below is computed from predicted classes.
test_auc = roc_auc_score(y_test.to_numpy(), y_test_pred.to_numpy())
print('The best model from US 100 Precision GPU trials optimization scores {:.5f} AUC ROC on the test set.'.format(test_auc))
print('This was achieved using these conditions:')
# trials_df was sorted by precision in descending order, so row 0 is the top trial
print(trials_df.iloc[0])

The best model from US 100 Precision GPU trials optimization scores 0.81133 AUC ROC on the test set.
This was achieved using these conditions:
iteration                                    49
precision                              0.529366
datetime_start       2022-06-15 23:10:41.997432
datetime_complete    2022-06-15 23:11:18.324032
duration                 0 days 00:00:36.326600
metric                                euclidean
n_neighbors                                 4.0
state                                  COMPLETE
Name: 49, dtype: object
