# Imports

In [72]:
# Imports 
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
import os
import sys

sys.path.append(os.path.abspath("../scripts"))
from data_loader import DataLoader

### Some additional metrics

In [73]:
from sklearn.metrics import make_scorer, confusion_matrix

# Custom function for specificity
def specificity_score(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return tn / (tn + fp)

# Make it compatible with sklearn's scoring
specificity_scorer = make_scorer(specificity_score)

from sklearn.metrics import make_scorer, fbeta_score

# Create a custom F2 scorer
def f_score(beta: int):
    return make_scorer(fbeta_score, beta=beta)

def evaluate(y_val, y_val_pred):
    # Evaluate the model's performance
    accuracy = accuracy_score(y_val, y_val_pred)
    report = classification_report(y_val, y_val_pred)

    print(f"Validation Accuracy: {accuracy}")
    print("Classification Report:\n", report)
    print(f"Validation Accuracy: {accuracy:.2f}")

### Load data

In [74]:
# Load data
global X_train, y_train, X_val, y_val, X_test, y_test

data_loader = DataLoader()
X_train, y_train  = data_loader.training_data
X_val, y_val = data_loader.validation_data
X_test, y_test = data_loader.test_data

# Initial exploration

## K-Neighbors-Classifier

In [75]:
# Initialize the KNeighborsClassifier with predefined parameters
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform', metric='euclidean')

# Fit the classifier to the training data
knn.fit(X_train, y_train)

# Use the classifier to make predictions on the validation data
y_val_pred = knn.predict(X_val)

# Evaluate the model's performance
evaluate(y_val, y_val_pred)

Validation Accuracy: 0.8276714975845411
Classification Report:
               precision    recall  f1-score   support

         0.0       0.87      0.94      0.90     21797
         1.0       0.42      0.24      0.30      4078

    accuracy                           0.83     25875
   macro avg       0.64      0.59      0.60     25875
weighted avg       0.80      0.83      0.81     25875

Validation Accuracy: 0.83


## Nearest Centroid

In [76]:
# Initialize the NearestCentroid classifier with predefined parameters
nc = NearestCentroid()

# Fit the classifier to the training data
nc.fit(X_train, y_train)

# Use the classifier to make predictions on the validation data
y_val_pred = nc.predict(X_val)

# Evaluate the model's performance
evaluate(y_val, y_val_pred)

Validation Accuracy: 0.6622608695652173
Classification Report:
               precision    recall  f1-score   support

         0.0       0.92      0.65      0.77     21797
         1.0       0.28      0.71      0.40      4078

    accuracy                           0.66     25875
   macro avg       0.60      0.68      0.58     25875
weighted avg       0.82      0.66      0.71     25875

Validation Accuracy: 0.66


## Combine both algorithms

In [77]:
from sklearn.ensemble import VotingClassifier

# Initialize the KNeighborsClassifier and NearestCentroid
knn = KNeighborsClassifier(11, weights='uniform', metric='euclidean')
nc = NearestCentroid()

# Create a VotingClassifier with both classifiers
voting_clf = VotingClassifier(estimators=[('knn', knn), ('nc', nc)], voting='hard')

voting_clf.fit(X_train, y_train)

# Use the classifier to make predictions on the validation data
y_val_pred = voting_clf.predict(X_val)

# Evaluate the model's performance
evaluate(y_val, y_val_pred)


Validation Accuracy: 0.8405024154589372
Classification Report:
               precision    recall  f1-score   support

         0.0       0.86      0.96      0.91     21797
         1.0       0.48      0.18      0.26      4078

    accuracy                           0.84     25875
   macro avg       0.67      0.57      0.59     25875
weighted avg       0.80      0.84      0.81     25875

Validation Accuracy: 0.84


# Resampling Methods
We applied resampling methods to address the issue of class imbalance in our dataset because imbalanced data can lead to biased machine learning models that struggle to predict the minority class effectively. By balancing the class distribution, we aim to improve model performance and ensure fair representation. Specifically, we used the following methods:

- **Random Undersampling**: This method involves randomly removing samples from the majority class to balance the class distribution.

- **Random Oversampling**: This method involves randomly duplicating samples from the minority class to balance the class distribution.

- **SMOTE Oversampling**: Synthetic Minority Over-sampling Technique (SMOTE) generates synthetic samples for the minority class by interpolating between existing minority class samples.

- **SMOTE Tomek**: This method combines SMOTE oversampling with Tomek links, which are pairs of samples from different classes that are close to each other. By removing Tomek links after applying SMOTE, this method helps in cleaning the boundary between classes, leading to a more balanced and cleaner dataset.

## Random Undersampling

In [60]:
# test random undersampling
X_train_undersampling_random, y_train_undersampling_random = data_loader.training_data_undersampling_random
X_val, y_val = data_loader.validation_data
X_test, y_test = data_loader.test_data

print(f"X_train_undersampling_random shape: {X_train_undersampling_random.shape}")
print(f"y_train_undersampling_random shape: {y_train_undersampling_random.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"y_val shape: {y_val.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train_undersampling_random shape: (55968, 21)
y_train_undersampling_random shape: (55968,)
X_val shape: (25875, 21)
y_val shape: (25875,)
X_test shape: (50229, 21)
y_test shape: (50229,)


In [61]:
# Initialize the KNeighborsClassifier with predefined parameters
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform', metric='euclidean')

# Fit the classifier to the training data
knn.fit(X_train_undersampling_random, y_train_undersampling_random)

# Use the classifier to make predictions on the validation data
y_val_pred = knn.predict(X_val)

# Evaluate the model's performance
evaluate(y_val, y_val_pred)

Validation Accuracy: 0.735768115942029
Classification Report:
               precision    recall  f1-score   support

         0.0       0.91      0.76      0.83     21797
         1.0       0.32      0.62      0.43      4078

    accuracy                           0.74     25875
   macro avg       0.62      0.69      0.63     25875
weighted avg       0.82      0.74      0.76     25875

Validation Accuracy: 0.74


## Random Oversampling

In [62]:
# test random oversampling
X_train_oversampling_random, y_train_oversampling_random = data_loader.training_data_oversampling_random
X_val, y_val = data_loader.validation_data
X_test, y_test = data_loader.test_data

print(f"X_train_oversampling_random shape: {X_train_oversampling_random.shape}")
print(f"y_train_oversampling_random shape: {y_train_oversampling_random.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"y_val shape: {y_val.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train_oversampling_random shape: (299184, 21)
y_train_oversampling_random shape: (299184,)
X_val shape: (25875, 21)
y_val shape: (25875,)
X_test shape: (50229, 21)
y_test shape: (50229,)


In [63]:
# Initialize the KNeighborsClassifier with predefined parameters
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform', metric='euclidean')

# Fit the classifier to the training data
knn.fit(X_train_oversampling_random, y_train_oversampling_random)

# Use the classifier to make predictions on the validation data
y_val_pred = knn.predict(X_val)

# Evaluate the model's performance
evaluate(y_val, y_val_pred)

Validation Accuracy: 0.7433429951690821
Classification Report:
               precision    recall  f1-score   support

         0.0       0.89      0.79      0.84     21797
         1.0       0.31      0.50      0.38      4078

    accuracy                           0.74     25875
   macro avg       0.60      0.64      0.61     25875
weighted avg       0.80      0.74      0.77     25875

Validation Accuracy: 0.74


## SMOTE Oversampling

In [64]:
# test smote oversampling
X_train_oversampling_smote, y_train_oversampling_smote = data_loader.training_data_oversampling_smote
X_val, y_val = data_loader.validation_data
X_test, y_test = data_loader.test_data

print(f"X_train_oversampling_smote shape: {X_train_oversampling_smote.shape}")
print(f"y_train_oversampling_smote shape: {y_train_oversampling_smote.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"y_val shape: {y_val.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train_oversampling_smote shape: (299184, 21)
y_train_oversampling_smote shape: (299184,)
X_val shape: (25875, 21)
y_val shape: (25875,)
X_test shape: (50229, 21)
y_test shape: (50229,)


In [65]:
# Initialize the KNeighborsClassifier with predefined parameters
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform', metric='euclidean')

# Fit the classifier to the training data
knn.fit(X_train_oversampling_smote, y_train_oversampling_smote)

# Use the classifier to make predictions on the validation data
y_val_pred = knn.predict(X_val)

# Evaluate the model's performance
evaluate(y_val, y_val_pred)

Validation Accuracy: 0.7086376811594203
Classification Report:
               precision    recall  f1-score   support

         0.0       0.91      0.73      0.81     21797
         1.0       0.29      0.60      0.39      4078

    accuracy                           0.71     25875
   macro avg       0.60      0.66      0.60     25875
weighted avg       0.81      0.71      0.74     25875

Validation Accuracy: 0.71


## SMOTE Tomek

In [66]:
X_train_oversampling_smote_tomek, y_train_oversampling_smote_tomek = data_loader.training_data_resampling_smote_tomek
X_val, y_val = data_loader.validation_data
X_test, y_test = data_loader.test_data

print(f"X_train_oversampling_smote shape: {X_train_oversampling_smote_tomek.shape}")
print(f"y_train_oversampling_smote shape: {y_train_oversampling_smote_tomek.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"y_val shape: {y_val.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train_oversampling_smote shape: (298548, 21)
y_train_oversampling_smote shape: (298548,)
X_val shape: (25875, 21)
y_val shape: (25875,)
X_test shape: (50229, 21)
y_test shape: (50229,)


In [67]:
# Initialize the KNeighborsClassifier with predefined parameters
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform', metric='euclidean')

# Fit the classifier to the training data
knn.fit(X_train_oversampling_smote_tomek, y_train_oversampling_smote_tomek)

# Use the classifier to make predictions on the validation data
y_val_pred = knn.predict(X_val)

# Evaluate the model's performance
evaluate(y_val, y_val_pred)

Validation Accuracy: 0.7084057971014492
Classification Report:
               precision    recall  f1-score   support

         0.0       0.91      0.73      0.81     21797
         1.0       0.29      0.60      0.39      4078

    accuracy                           0.71     25875
   macro avg       0.60      0.66      0.60     25875
weighted avg       0.81      0.71      0.74     25875

Validation Accuracy: 0.71


# Hyperparameter Tuning and Cross Validation

## Halving Grid Search

In [80]:
from sklearn.experimental import enable_halving_search_cv # noqa
from sklearn.decomposition import PCA
from sklearn.model_selection import HalvingGridSearchCV
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
import numpy as np
import random


# Set the random state globally
np.random.seed(42)
random.seed(42)

def makeHalvingGridSearchCV(pipeline, param_grid, scoring):
    # Set up HalvingGridSearchCV
    halving_grid_search = HalvingGridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=5,  # 5-fold cross-validation
        scoring=scoring,  # Custom F2 scoring metric
        n_jobs=-1,  # Use all processors
        verbose=1,  # To track progress
    )

    # Fit the halving grid search on training data
    halving_grid_search.fit(X_train, y_train)

    # Get the best parameters and score
    print("Best Parameters:", halving_grid_search.best_params_)
    print("Best Cross-Validation Recall:", halving_grid_search.best_score_, end="\n")

    # Use the best estimator to make predictions on the validation data
    y_val_pred = halving_grid_search.best_estimator_.predict(X_val)

    # Evaluate the model's performance
    evaluate(y_val, y_val_pred)

    return halving_grid_search

### K-Nearest-Neighbors

In [81]:
pipeline = Pipeline(
    [
        ("resampler", None),  # Placeholder for resampling method
        ("pca", None),  # Placeholder for PCA
        ("classifier", KNeighborsClassifier()),  # Model
    ]
)

param_grid = [
    {
        "classifier__n_neighbors": [3, 5, 7, 9, 11, 13, 15, 17, 19, 21],
        "classifier__metric": ["euclidean", "manhattan"],
        "classifier__weights": ["uniform", "distance"],
        "resampler": [None, RandomOverSampler(), RandomUnderSampler(), SMOTE(), SMOTETomek()],
        "pca": [None, PCA(n_components=5), PCA(n_components=10), PCA(n_components=None)],
    },
]

metric = "recall"

halving_grid_search = makeHalvingGridSearchCV(pipeline, param_grid, metric)

n_iterations: 7
n_required_iterations: 7
n_possible_iterations: 7
min_resources_: 243
max_resources_: 177576
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 800
n_resources: 243
Fitting 5 folds for each of 800 candidates, totalling 4000 fits
----------
iter: 1
n_candidates: 267
n_resources: 729
Fitting 5 folds for each of 267 candidates, totalling 1335 fits
----------
iter: 2
n_candidates: 89
n_resources: 2187
Fitting 5 folds for each of 89 candidates, totalling 445 fits


  _data = np.array(data, dtype=dtype, copy=copy,


----------
iter: 3
n_candidates: 30
n_resources: 6561
Fitting 5 folds for each of 30 candidates, totalling 150 fits
----------
iter: 4
n_candidates: 10
n_resources: 19683
Fitting 5 folds for each of 10 candidates, totalling 50 fits


  _data = np.array(data, dtype=dtype, copy=copy,


----------
iter: 5
n_candidates: 4
n_resources: 59049
Fitting 5 folds for each of 4 candidates, totalling 20 fits
----------
iter: 6
n_candidates: 2
n_resources: 177147
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best Parameters: {'classifier__metric': 'euclidean', 'classifier__n_neighbors': 21, 'classifier__weights': 'uniform', 'pca': PCA(), 'resampler': SMOTETomek()}
Best Cross-Validation Recall: 0.7530527510622459
Validation Accuracy: 0.6837874396135266
Classification Report:
               precision    recall  f1-score   support

         0.0       0.93      0.67      0.78     21797
         1.0       0.30      0.74      0.42      4078

    accuracy                           0.68     25875
   macro avg       0.61      0.71      0.60     25875
weighted avg       0.83      0.68      0.73     25875

Validation Accuracy: 0.68


#### Save the best model

In [85]:
import joblib
from datetime import datetime

# Get the best model from the halving grid search
best_model = halving_grid_search.best_estimator_
best_params = halving_grid_search.best_params_

# Get the current timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Save the best model to a file with a timestamp
model_filename = f'../models/knn/k={best_params["classifier__n_neighbors"]}_{timestamp}.pkl'
joblib.dump(best_model, model_filename)

print(f"Best model saved to '{model_filename}'")

Best model saved to '../models/knn/k=21_20241126_175547.pkl'


#### Try higher K's for KNN

In [86]:
param_grid = [
    {
        "classifier__n_neighbors": [17, 19, 21, 23, 25, 27],
        "classifier__metric": ["euclidean", "manhattan"],
        "classifier__weights": ["uniform", "distance"],
        "resampler": [None, RandomOverSampler(), RandomUnderSampler(), SMOTE(), SMOTETomek()],
        "pca": [None, PCA(n_components=5), PCA(n_components=10), PCA(n_components=None)],
    },
]

pipeline = Pipeline(
    [
        ("resampler", None),  # Placeholder for resampling method
        ("pca", None),  # Placeholder for PCA
        ("classifier", KNeighborsClassifier()),  # Model
    ]
)

metric = "recall"
halving_grid_search = makeHalvingGridSearchCV(pipeline, param_grid, metric)

n_iterations: 5
n_required_iterations: 6
n_possible_iterations: 5
min_resources_: 730
max_resources_: 177576
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 480
n_resources: 730
Fitting 5 folds for each of 480 candidates, totalling 2400 fits


  _data = np.array(data, dtype=dtype, copy=copy,


----------
iter: 1
n_candidates: 160
n_resources: 2190
Fitting 5 folds for each of 160 candidates, totalling 800 fits
----------
iter: 2
n_candidates: 54
n_resources: 6570
Fitting 5 folds for each of 54 candidates, totalling 270 fits


  _data = np.array(data, dtype=dtype, copy=copy,


----------
iter: 3
n_candidates: 18
n_resources: 19710
Fitting 5 folds for each of 18 candidates, totalling 90 fits


  _data = np.array(data, dtype=dtype, copy=copy,


----------
iter: 4
n_candidates: 6
n_resources: 59130
Fitting 5 folds for each of 6 candidates, totalling 30 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters: {'classifier__metric': 'euclidean', 'classifier__n_neighbors': 27, 'classifier__weights': 'uniform', 'pca': PCA(), 'resampler': SMOTETomek()}
Best Cross-Validation Recall: 0.7830293296648814
Validation Accuracy: 0.6810048309178744
Classification Report:
               precision    recall  f1-score   support

         0.0       0.93      0.67      0.78     21797
         1.0       0.30      0.75      0.42      4078

    accuracy                           0.68     25875
   macro avg       0.62      0.71      0.60     25875
weighted avg       0.83      0.68      0.72     25875

Validation Accuracy: 0.68


In [87]:
param_grid = [
    {
        "classifier__n_neighbors": [27, 29, 31, 33, 35, 37],
        "classifier__metric": ["euclidean", "manhattan"],
        "classifier__weights": ["uniform", "distance"],
        "resampler": [None, RandomOverSampler(), RandomUnderSampler(), SMOTE(), SMOTETomek()],
        "pca": [None, PCA(n_components=5), PCA(n_components=10), PCA(n_components=None)],
    },
]

pipeline = Pipeline(
    [
        ("resampler", None),  # Placeholder for resampling method
        ("pca", None),  # Placeholder for PCA
        ("classifier", KNeighborsClassifier()),  # Model
    ]
)

metric = "recall"
halving_grid_search = makeHalvingGridSearchCV(pipeline, param_grid, metric)

n_iterations: 5
n_required_iterations: 6
n_possible_iterations: 5
min_resources_: 730
max_resources_: 177576
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 480
n_resources: 730
Fitting 5 folds for each of 480 candidates, totalling 2400 fits


  _data = np.array(data, dtype=dtype, copy=copy,


----------
iter: 1
n_candidates: 160
n_resources: 2190
Fitting 5 folds for each of 160 candidates, totalling 800 fits
----------
iter: 2
n_candidates: 54
n_resources: 6570
Fitting 5 folds for each of 54 candidates, totalling 270 fits


  _data = np.array(data, dtype=dtype, copy=copy,


----------
iter: 3
n_candidates: 18
n_resources: 19710
Fitting 5 folds for each of 18 candidates, totalling 90 fits


  _data = np.array(data, dtype=dtype, copy=copy,


----------
iter: 4
n_candidates: 6
n_resources: 59130
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Parameters: {'classifier__metric': 'euclidean', 'classifier__n_neighbors': 37, 'classifier__weights': 'uniform', 'pca': PCA(), 'resampler': SMOTETomek()}
Best Cross-Validation Recall: 0.7968655345620655
Validation Accuracy: 0.6776811594202898
Classification Report:
               precision    recall  f1-score   support

         0.0       0.94      0.66      0.78     21797
         1.0       0.30      0.77      0.43      4078

    accuracy                           0.68     25875
   macro avg       0.62      0.71      0.60     25875
weighted avg       0.84      0.68      0.72     25875

Validation Accuracy: 0.68


In [88]:
pipeline = Pipeline(
    [
        ("resampler", None),  # Placeholder for resampling method
        ("pca", None),  # Placeholder for PCA
        ("classifier", KNeighborsClassifier()),  # Model
    ]
)
param_grid = [
    {
        "classifier__n_neighbors": [37, 39, 41, 43, 45, 47],
        "classifier__metric": ["euclidean", "manhattan"],
        "classifier__weights": ["uniform", "distance"],
        "resampler": [None, RandomOverSampler(), RandomUnderSampler(), SMOTE(), SMOTETomek()],
        "pca": [None, PCA(n_components=5), PCA(n_components=10), PCA(n_components=None)],
    },
]
halving_grid_search = makeHalvingGridSearchCV(pipeline, param_grid, "recall")

n_iterations: 5
n_required_iterations: 6
n_possible_iterations: 5
min_resources_: 730
max_resources_: 177576
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 480
n_resources: 730
Fitting 5 folds for each of 480 candidates, totalling 2400 fits
----------
iter: 1
n_candidates: 160
n_resources: 2190
Fitting 5 folds for each of 160 candidates, totalling 800 fits


  _data = np.array(data, dtype=dtype, copy=copy,


----------
iter: 2
n_candidates: 54
n_resources: 6570
Fitting 5 folds for each of 54 candidates, totalling 270 fits


  _data = np.array(data, dtype=dtype, copy=copy,


----------
iter: 3
n_candidates: 18
n_resources: 19710
Fitting 5 folds for each of 18 candidates, totalling 90 fits


  _data = np.array(data, dtype=dtype, copy=copy,


----------
iter: 4
n_candidates: 6
n_resources: 59130
Fitting 5 folds for each of 6 candidates, totalling 30 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters: {'classifier__metric': 'euclidean', 'classifier__n_neighbors': 47, 'classifier__weights': 'uniform', 'pca': None, 'resampler': SMOTE()}
Best Cross-Validation Recall: 0.8141575311827298
Validation Accuracy: 0.6770628019323671
Classification Report:
               precision    recall  f1-score   support

         0.0       0.94      0.66      0.77     21797
         1.0       0.30      0.77      0.43      4078

    accuracy                           0.68     25875
   macro avg       0.62      0.71      0.60     25875
weighted avg       0.84      0.68      0.72     25875

Validation Accuracy: 0.68


In [89]:
pipeline = Pipeline(
    [
        ("resampler", None),  # Placeholder for resampling method
        ("pca", None),  # Placeholder for PCA
        ("classifier", KNeighborsClassifier()),  # Model
    ]
)
param_grid = [
    {
        "classifier__n_neighbors": [3, 11, 21, 51, 100, 130],
        "classifier__metric": ["euclidean", "manhattan"],
        "classifier__weights": ["uniform", "distance"],
        "resampler": [None, RandomOverSampler(), RandomUnderSampler(), SMOTE(), SMOTETomek()],
        "pca": [None, PCA(n_components=5), PCA(n_components=10), PCA(n_components=None)],
    },
]

metric = "recall"
halving_grid_search = makeHalvingGridSearchCV(pipeline, param_grid, metric)

n_iterations: 5
n_required_iterations: 6
n_possible_iterations: 5
min_resources_: 730
max_resources_: 177576
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 480
n_resources: 730
Fitting 5 folds for each of 480 candidates, totalling 2400 fits


  _data = np.array(data, dtype=dtype, copy=copy,


----------
iter: 1
n_candidates: 160
n_resources: 2190
Fitting 5 folds for each of 160 candidates, totalling 800 fits
----------
iter: 2
n_candidates: 54
n_resources: 6570
Fitting 5 folds for each of 54 candidates, totalling 270 fits


  _data = np.array(data, dtype=dtype, copy=copy,


----------
iter: 3
n_candidates: 18
n_resources: 19710
Fitting 5 folds for each of 18 candidates, totalling 90 fits


  _data = np.array(data, dtype=dtype, copy=copy,


----------
iter: 4
n_candidates: 6
n_resources: 59130
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Parameters: {'classifier__metric': 'euclidean', 'classifier__n_neighbors': 130, 'classifier__weights': 'uniform', 'pca': PCA(), 'resampler': SMOTETomek()}
Best Cross-Validation Recall: 0.8369324629428206
Validation Accuracy: 0.6745893719806764
Classification Report:
               precision    recall  f1-score   support

         0.0       0.95      0.65      0.77     21797
         1.0       0.30      0.80      0.44      4078

    accuracy                           0.67     25875
   macro avg       0.62      0.73      0.60     25875
weighted avg       0.84      0.67      0.72     25875

Validation Accuracy: 0.67


In [95]:
pipeline = Pipeline(
    [
        ("resampler", None),  # Placeholder for resampling method
        ("pca", None),  # Placeholder for PCA
        ("classifier", KNeighborsClassifier()),  # Model
    ]
)
param_grid = [
    {
        "classifier__n_neighbors": [50, 130, 250, 500],
        "classifier__metric": ["euclidean", "manhattan"],
        "classifier__weights": ["uniform", "distance"],
        "resampler": [None, RandomOverSampler(), RandomUnderSampler(), SMOTE(), SMOTETomek()],
        "pca": [None, PCA(n_components=5), PCA(n_components=10), PCA(n_components=None)],
    },
]

metric = "recall"
halving_grid_search = makeHalvingGridSearchCV(pipeline, param_grid, metric)

n_iterations: 5
n_required_iterations: 6
n_possible_iterations: 5
min_resources_: 730
max_resources_: 177576
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 320
n_resources: 730
Fitting 5 folds for each of 320 candidates, totalling 1600 fits


Traceback (most recent call last):
  File "/Users/I551663/Github/ie500_data_mining_project/.venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/I551663/Github/ie500_data_mining_project/.venv/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/I551663/Github/ie500_data_mining_project/.venv/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 371, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "/Users/I551663/Github/ie500_data_mining_project/.venv/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 89, in _cached_call
    result, _ = _get_response_valu

----------
iter: 1
n_candidates: 107
n_resources: 2190
Fitting 5 folds for each of 107 candidates, totalling 535 fits


  _data = np.array(data, dtype=dtype, copy=copy,
 0.60783877 0.59627268 0.6474857  0.6536238  0.02727273 0.6386228
 0.70395626 0.74114811 0.7268624  0.01818182 0.63254017 0.66457591
 0.72055705 0.70574224 0.01818182 0.64468119 0.67539529 0.71905974
 0.72212879 0.01818182 0.60125841 0.61966824 0.59878126 0.66839679
 0.03468013 0.61935652 0.63755211 0.66860586 0.72557416 0.01818182
 0.65618135 0.66031592 0.74398915 0.76018952 0.         0.65049927
 0.65392175 0.72964796 0.71633046 0.         0.66957407 0.54107277
 0.67542869 0.69592558 0.         0.65205206 0.6437203  0.66815512
 0.7091876  0.         0.65205206 0.6474523  0.72184529 0.72542137
 0.         0.67490272 0.65756376 0.72991251 0.74029166 0.
 0.64057085 0.73434503 0.68160333 0.67853428 0.         0.67112687
 0.72720218 0.71509769 0.71380945 0.         0.647583   0.64235369
 0.73705537 0.73666011 0.         0.64979247        nan 0.74029166
 0.74588173 0.         0.6339905         nan 0.68381593 0.68483963
 0.         0.6540556 

----------
iter: 2
n_candidates: 36
n_resources: 6570
Fitting 5 folds for each of 36 candidates, totalling 180 fits


  _data = np.array(data, dtype=dtype, copy=copy,
 0.60783877 0.59627268 0.6474857  0.6536238  0.02727273 0.6386228
 0.70395626 0.74114811 0.7268624  0.01818182 0.63254017 0.66457591
 0.72055705 0.70574224 0.01818182 0.64468119 0.67539529 0.71905974
 0.72212879 0.01818182 0.60125841 0.61966824 0.59878126 0.66839679
 0.03468013 0.61935652 0.63755211 0.66860586 0.72557416 0.01818182
 0.65618135 0.66031592 0.74398915 0.76018952 0.         0.65049927
 0.65392175 0.72964796 0.71633046 0.         0.66957407 0.54107277
 0.67542869 0.69592558 0.         0.65205206 0.6437203  0.66815512
 0.7091876  0.         0.65205206 0.6474523  0.72184529 0.72542137
 0.         0.67490272 0.65756376 0.72991251 0.74029166 0.
 0.64057085 0.73434503 0.68160333 0.67853428 0.         0.67112687
 0.72720218 0.71509769 0.71380945 0.         0.647583   0.64235369
 0.73705537 0.73666011 0.         0.64979247        nan 0.74029166
 0.74588173 0.         0.6339905         nan 0.68381593 0.68483963
 0.         0.6540556 

----------
iter: 3
n_candidates: 12
n_resources: 19710
Fitting 5 folds for each of 12 candidates, totalling 60 fits


  _data = np.array(data, dtype=dtype, copy=copy,
 0.60783877 0.59627268 0.6474857  0.6536238  0.02727273 0.6386228
 0.70395626 0.74114811 0.7268624  0.01818182 0.63254017 0.66457591
 0.72055705 0.70574224 0.01818182 0.64468119 0.67539529 0.71905974
 0.72212879 0.01818182 0.60125841 0.61966824 0.59878126 0.66839679
 0.03468013 0.61935652 0.63755211 0.66860586 0.72557416 0.01818182
 0.65618135 0.66031592 0.74398915 0.76018952 0.         0.65049927
 0.65392175 0.72964796 0.71633046 0.         0.66957407 0.54107277
 0.67542869 0.69592558 0.         0.65205206 0.6437203  0.66815512
 0.7091876  0.         0.65205206 0.6474523  0.72184529 0.72542137
 0.         0.67490272 0.65756376 0.72991251 0.74029166 0.
 0.64057085 0.73434503 0.68160333 0.67853428 0.         0.67112687
 0.72720218 0.71509769 0.71380945 0.         0.647583   0.64235369
 0.73705537 0.73666011 0.         0.64979247        nan 0.74029166
 0.74588173 0.         0.6339905         nan 0.68381593 0.68483963
 0.         0.6540556 

----------
iter: 4
n_candidates: 4
n_resources: 59130
Fitting 5 folds for each of 4 candidates, totalling 20 fits


  _data = np.array(data, dtype=dtype, copy=copy,
 0.60783877 0.59627268 0.6474857  0.6536238  0.02727273 0.6386228
 0.70395626 0.74114811 0.7268624  0.01818182 0.63254017 0.66457591
 0.72055705 0.70574224 0.01818182 0.64468119 0.67539529 0.71905974
 0.72212879 0.01818182 0.60125841 0.61966824 0.59878126 0.66839679
 0.03468013 0.61935652 0.63755211 0.66860586 0.72557416 0.01818182
 0.65618135 0.66031592 0.74398915 0.76018952 0.         0.65049927
 0.65392175 0.72964796 0.71633046 0.         0.66957407 0.54107277
 0.67542869 0.69592558 0.         0.65205206 0.6437203  0.66815512
 0.7091876  0.         0.65205206 0.6474523  0.72184529 0.72542137
 0.         0.67490272 0.65756376 0.72991251 0.74029166 0.
 0.64057085 0.73434503 0.68160333 0.67853428 0.         0.67112687
 0.72720218 0.71509769 0.71380945 0.         0.647583   0.64235369
 0.73705537 0.73666011 0.         0.64979247        nan 0.74029166
 0.74588173 0.         0.6339905         nan 0.68381593 0.68483963
 0.         0.6540556 

Best Parameters: {'classifier__metric': 'euclidean', 'classifier__n_neighbors': 500, 'classifier__weights': 'uniform', 'pca': None, 'resampler': SMOTETomek()}
Best Cross-Validation Recall: 0.8416398235537598
Validation Accuracy: 0.66743961352657
Classification Report:
               precision    recall  f1-score   support

         0.0       0.95      0.64      0.76     21797
         1.0       0.30      0.82      0.44      4078

    accuracy                           0.67     25875
   macro avg       0.62      0.73      0.60     25875
weighted avg       0.85      0.67      0.71     25875

Validation Accuracy: 0.67


#### Try other metrics

In [90]:
pipeline = Pipeline(
    [
        ("resampler", None),  # Placeholder for resampling method
        ("pca", None),  # Placeholder for PCA
        ("classifier", KNeighborsClassifier()),  # Model
    ]
)
param_grid = [
    {
        "classifier__n_neighbors": [3, 11, 21, 51, 100, 130],
        "classifier__metric": ["euclidean", "manhattan"],
        "classifier__weights": ["uniform", "distance"],
        "resampler": [None, RandomOverSampler(), RandomUnderSampler(), SMOTE(), SMOTETomek()],
        "pca": [None, PCA(n_components=5), PCA(n_components=10), PCA(n_components=None)],
    },
]

halving_grid_search = makeHalvingGridSearchCV(pipeline, param_grid, f_score(1))

n_iterations: 5
n_required_iterations: 6
n_possible_iterations: 5
min_resources_: 730
max_resources_: 177576
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 480
n_resources: 730
Fitting 5 folds for each of 480 candidates, totalling 2400 fits


  _data = np.array(data, dtype=dtype, copy=copy,


----------
iter: 1
n_candidates: 160
n_resources: 2190
Fitting 5 folds for each of 160 candidates, totalling 800 fits
----------
iter: 2
n_candidates: 54
n_resources: 6570
Fitting 5 folds for each of 54 candidates, totalling 270 fits


  _data = np.array(data, dtype=dtype, copy=copy,


----------
iter: 3
n_candidates: 18
n_resources: 19710
Fitting 5 folds for each of 18 candidates, totalling 90 fits


  _data = np.array(data, dtype=dtype, copy=copy,


----------
iter: 4
n_candidates: 6
n_resources: 59130
Fitting 5 folds for each of 6 candidates, totalling 30 fits




Best Parameters: {'classifier__metric': 'manhattan', 'classifier__n_neighbors': 100, 'classifier__weights': 'uniform', 'pca': None, 'resampler': RandomUnderSampler()}
Best Cross-Validation Recall: 0.46254540717786724
Validation Accuracy: 0.7308985507246377
Classification Report:
               precision    recall  f1-score   support

         0.0       0.94      0.73      0.82     21797
         1.0       0.34      0.73      0.46      4078

    accuracy                           0.73     25875
   macro avg       0.64      0.73      0.64     25875
weighted avg       0.84      0.73      0.76     25875

Validation Accuracy: 0.73


In [91]:
pipeline = Pipeline(
    [
        ("resampler", None),  # Placeholder for resampling method
        ("pca", None),  # Placeholder for PCA
        ("classifier", KNeighborsClassifier()),  # Model
    ]
)
param_grid = [
    {
        "classifier__n_neighbors": [3, 11, 21, 51, 100, 130],
        "classifier__metric": ["euclidean", "manhattan"],
        "classifier__weights": ["uniform", "distance"],
        "resampler": [None, RandomOverSampler(), RandomUnderSampler(), SMOTE(), SMOTETomek()],
        "pca": [None, PCA(n_components=5), PCA(n_components=10), PCA(n_components=None)],
    },
]

halving_grid_search = makeHalvingGridSearchCV(pipeline, param_grid, f_score(2))

n_iterations: 5
n_required_iterations: 6
n_possible_iterations: 5
min_resources_: 730
max_resources_: 177576
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 480
n_resources: 730
Fitting 5 folds for each of 480 candidates, totalling 2400 fits


  _data = np.array(data, dtype=dtype, copy=copy,


----------
iter: 1
n_candidates: 160
n_resources: 2190
Fitting 5 folds for each of 160 candidates, totalling 800 fits


  _data = np.array(data, dtype=dtype, copy=copy,


----------
iter: 2
n_candidates: 54
n_resources: 6570
Fitting 5 folds for each of 54 candidates, totalling 270 fits


  _data = np.array(data, dtype=dtype, copy=copy,


----------
iter: 3
n_candidates: 18
n_resources: 19710
Fitting 5 folds for each of 18 candidates, totalling 90 fits


  _data = np.array(data, dtype=dtype, copy=copy,


----------
iter: 4
n_candidates: 6
n_resources: 59130
Fitting 5 folds for each of 6 candidates, totalling 30 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters: {'classifier__metric': 'euclidean', 'classifier__n_neighbors': 130, 'classifier__weights': 'uniform', 'pca': PCA(), 'resampler': SMOTETomek()}
Best Cross-Validation Recall: 0.6082066043316812
Validation Accuracy: 0.6753236714975845
Classification Report:
               precision    recall  f1-score   support

         0.0       0.95      0.65      0.77     21797
         1.0       0.30      0.80      0.44      4078

    accuracy                           0.68     25875
   macro avg       0.62      0.73      0.61     25875
weighted avg       0.85      0.68      0.72     25875

Validation Accuracy: 0.68


In [92]:
pipeline = Pipeline(
    [
        ("resampler", None),  # Placeholder for resampling method
        ("pca", None),  # Placeholder for PCA
        ("classifier", KNeighborsClassifier()),  # Model
    ]
)
param_grid = [
    {
        "classifier__n_neighbors": [3, 11, 21, 51, 100, 130],
        "classifier__metric": ["euclidean", "manhattan"],
        "classifier__weights": ["uniform", "distance"],
        "resampler": [None, RandomOverSampler(), RandomUnderSampler(), SMOTE(), SMOTETomek()],
        "pca": [None, PCA(n_components=5), PCA(n_components=10), PCA(n_components=None)],
    },
]

halving_grid_search = makeHalvingGridSearchCV(pipeline, param_grid, f_score(3))

n_iterations: 5
n_required_iterations: 6
n_possible_iterations: 5
min_resources_: 730
max_resources_: 177576
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 480
n_resources: 730
Fitting 5 folds for each of 480 candidates, totalling 2400 fits


  _data = np.array(data, dtype=dtype, copy=copy,


----------
iter: 1
n_candidates: 160
n_resources: 2190
Fitting 5 folds for each of 160 candidates, totalling 800 fits


  _data = np.array(data, dtype=dtype, copy=copy,


----------
iter: 2
n_candidates: 54
n_resources: 6570
Fitting 5 folds for each of 54 candidates, totalling 270 fits
----------
iter: 3
n_candidates: 18
n_resources: 19710
Fitting 5 folds for each of 18 candidates, totalling 90 fits


  _data = np.array(data, dtype=dtype, copy=copy,


----------
iter: 4
n_candidates: 6
n_resources: 59130
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Parameters: {'classifier__metric': 'euclidean', 'classifier__n_neighbors': 130, 'classifier__weights': 'uniform', 'pca': None, 'resampler': SMOTE()}
Best Cross-Validation Recall: 0.7016251239677722
Validation Accuracy: 0.6765990338164252
Classification Report:
               precision    recall  f1-score   support

         0.0       0.95      0.65      0.77     21797
         1.0       0.30      0.80      0.44      4078

    accuracy                           0.68     25875
   macro avg       0.62      0.73      0.61     25875
weighted avg       0.84      0.68      0.72     25875

Validation Accuracy: 0.68


### Nearest Centroid

In [93]:
pipeline = Pipeline(
    [
        ("resampler", None),  # Placeholder for resampling method
        ("pca", None),  # Placeholder for PCA
        ("classifier", NearestCentroid()),  # Model
    ]
)
param_grid = [
    {
        "classifier__metric": ["euclidean", "manhattan"],
        "classifier__shrink_threshold": [None, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        "resampler": [None, RandomOverSampler(), RandomUnderSampler(), SMOTE(), SMOTETomek()],
        "pca": [None, PCA(n_components=5), PCA(n_components=10), PCA(n_components=None)],
    },
]

metric = "recall"
halving_grid_search = makeHalvingGridSearchCV(pipeline, param_grid, metric)

n_iterations: 5
n_required_iterations: 6
n_possible_iterations: 5
min_resources_: 730
max_resources_: 177576
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 440
n_resources: 730
Fitting 5 folds for each of 440 candidates, totalling 2200 fits
----------
iter: 1
n_candidates: 147
n_resources: 2190
Fitting 5 folds for each of 147 candidates, totalling 735 fits
----------
iter: 2
n_candidates: 49
n_resources: 6570
Fitting 5 folds for each of 49 candidates, totalling 245 fits
----------
iter: 3
n_candidates: 17
n_resources: 19710
Fitting 5 folds for each of 17 candidates, totalling 85 fits
----------
iter: 4
n_candidates: 6
n_resources: 59130
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Parameters: {'classifier__metric': 'manhattan', 'classifier__shrink_threshold': 1.0, 'pca': None, 'resampler': SMOTETomek()}
Best Cross-Validation Recall: 0.7587585233059949
Validation Accuracy: 0.6898550724637681
Classification Report:
               precision   

#### Save the best model

In [94]:
import joblib
from datetime import datetime

# Get the best model from the halving grid search
best_model = halving_grid_search.best_estimator_

# Get the current timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Save the best model to a file with a timestamp
model_filename = f'../models/nearest_centroid/lr_model_sampling_{timestamp}.pkl'
joblib.dump(best_model, model_filename)

print(f"Best model saved to '{model_filename}'")

Best model saved to '../models/nearest_centroid/lr_model_sampling_20241126_181731.pkl'


In [97]:
pipeline = Pipeline(
    [
        ("resampler", None),  # Placeholder for resampling method
        ("pca", None),  # Placeholder for PCA
        ("classifier", NearestCentroid()),  # Model
    ]
)
param_grid = [
    {
        "classifier__metric": ["euclidean", "manhattan"],
        "classifier__shrink_threshold": [None, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        "resampler": [None, RandomOverSampler(), RandomUnderSampler(), SMOTE(), SMOTETomek()],
        "pca": [None, PCA(n_components=5), PCA(n_components=10), PCA(n_components=None)],
    },
]

metric = f_score(1)
halving_grid_search = makeHalvingGridSearchCV(pipeline, param_grid, metric)

n_iterations: 5
n_required_iterations: 6
n_possible_iterations: 5
min_resources_: 730
max_resources_: 177576
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 440
n_resources: 730
Fitting 5 folds for each of 440 candidates, totalling 2200 fits
----------
iter: 1
n_candidates: 147
n_resources: 2190
Fitting 5 folds for each of 147 candidates, totalling 735 fits
----------
iter: 2
n_candidates: 49
n_resources: 6570
Fitting 5 folds for each of 49 candidates, totalling 245 fits
----------
iter: 3
n_candidates: 17
n_resources: 19710
Fitting 5 folds for each of 17 candidates, totalling 85 fits
----------
iter: 4
n_candidates: 6
n_resources: 59130
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Parameters: {'classifier__metric': 'manhattan', 'classifier__shrink_threshold': 0.6, 'pca': None, 'resampler': SMOTE()}
Best Cross-Validation Recall: 0.4354720681992156
Validation Accuracy: 0.6892753623188406
Classification Report:
               precision    reca

In [None]:
pipeline = Pipeline(
    [
        ("resampler", None),  # Placeholder for resampling method
        ("pca", None),  # Placeholder for PCA
        ("classifier", NearestCentroid()),  # Model
    ]
)
param_grid = [
    {
        "classifier__metric": ["euclidean", "manhattan"],
        "classifier__shrink_threshold": [None, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        "resampler": [None, RandomOverSampler(), RandomUnderSampler(), SMOTE(), SMOTETomek()],
        "pca": [None, PCA(n_components=5), PCA(n_components=10), PCA(n_components=None)],
    },
]

metric = f_score(2)
halving_grid_search = makeHalvingGridSearchCV(pipeline, param_grid, metric)

n_iterations: 5
n_required_iterations: 6
n_possible_iterations: 5
min_resources_: 730
max_resources_: 177576
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 440
n_resources: 730
Fitting 5 folds for each of 440 candidates, totalling 2200 fits
----------
iter: 1
n_candidates: 147
n_resources: 2190
Fitting 5 folds for each of 147 candidates, totalling 735 fits
----------
iter: 2
n_candidates: 49
n_resources: 6570
Fitting 5 folds for each of 49 candidates, totalling 245 fits
----------
iter: 3
n_candidates: 17
n_resources: 19710
Fitting 5 folds for each of 17 candidates, totalling 85 fits
----------
iter: 4
n_candidates: 6
n_resources: 59130
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Parameters: {'classifier__metric': 'manhattan', 'classifier__shrink_threshold': 0.8, 'pca': None, 'resampler': SMOTE()}
Best Cross-Validation Recall: 0.583831918134342
Validation Accuracy: 0.6898550724637681
Classification Report:
               precision    recal