In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import joblib

from tensorflow.keras.models import Model
from tensorflow.keras.layers import (Input, Conv2D, MaxPooling2D, Dropout, Flatten,
                                     Dense, LSTM, MultiHeadAttention, Concatenate, Reshape)
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.saving import register_keras_serializable

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# =============================================================================
# --- Configuration ---
# =============================================================================
# Run identifiers: MODE and FEATURE_MODE are embedded in the feature-archive
# file name and in the results directory name; MODEL_NAME names the
# sub-directory that holds this run's artifacts.
# DATASET = "MPOWER_DATASET"
MODE = "ALL_VALIDS"
FEATURE_MODE = "ALL"
MODEL_NAME = "test"
# ------------------------------------

# Path setup: every location is rooted at <current working directory>/<dataset>.
dataset = "UAMS"
_dataset_root = os.path.join(os.getcwd(), dataset)

FEATURES_FILE_PATH = os.path.join(_dataset_root, "data", f"features_{MODE}_{FEATURE_MODE}.npz")
RESULTS_PATH = os.path.join(_dataset_root, f"results_{MODE}_{FEATURE_MODE}")
MODEL_PATH = os.path.join(RESULTS_PATH, MODEL_NAME)

# Create the run directory (and any missing parents) up front; a re-run is a no-op.
os.makedirs(MODEL_PATH, exist_ok=True)

# Artifact locations inside the run directory.
HISTORY_SAVE_PATH = os.path.join(MODEL_PATH, "history.csv")
BEST_EXTRACTOR_PATH = os.path.join(MODEL_PATH, "best_feature_extractor.keras")
KNN_MODEL_PATH = os.path.join(MODEL_PATH, "knn_classifier.joblib")

# Hyperparameters.
# NOTE(review): none of these are referenced by the cells visible here;
# presumably they configure the Keras feature-extractor training - confirm.
EPOCHS = 30
BATCH_SIZE = 32
LEARNING_RATE = 0.001
DROPOUT_RATE = 0.5
L2_STRENGTH = 0.01


In [4]:
# Import the correct Pipeline from imblearn
from imblearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import NeighborhoodComponentsAnalysis
from sklearn.decomposition import PCA
import os
import joblib

def _limit_components(candidates, max_components):
    """Drop n_components candidates that exceed max_components.

    Returns the candidates unchanged when max_components is None, and falls
    back to [max_components] when every candidate is too large, so the search
    space is never empty.
    """
    if max_components is None:
        return list(candidates)
    fitting = [c for c in candidates if c <= max_components]
    return fitting or [max_components]


def tune_and_evaluate_knn_pipeline(X_train, y_train, X_test, y_test, results_path, model_name, dim_reduction_method='pca_nca'):
    """
    Tune a scaler -> SMOTE -> (optional reduction) -> k-NN pipeline and evaluate it.

    A RandomizedSearchCV (15 candidates, 5-fold CV, ROC-AUC scoring) is fitted
    on the training data. The refitted best pipeline is then scored on the test
    data and saved to ``<results_path>/<model_name>_pipeline.joblib``.

    Parameters:
    - X_train, y_train: training features and labels used for the search.
    - X_test, y_test: held-out features and labels used for the final report.
    - results_path (str): directory for the saved pipeline; created if missing.
    - model_name (str): label used in the printed report and in the file name.
    - dim_reduction_method (str): 'pca', 'nca', 'pca_nca', or 'none' to specify the method.

    Returns:
    - float: ROC-AUC of the best pipeline on the test set.

    Raises:
    - ValueError: if dim_reduction_method is not one of the supported values.

    Note: column 1 of ``predict_proba`` is used as the positive-class score, so
    this assumes a binary classification problem.
    """
    # Fail fast on a bad option, before any estimator is built or fitted.
    valid_methods = ('pca', 'nca', 'pca_nca', 'none')
    if dim_reduction_method not in valid_methods:
        raise ValueError("Invalid dimensionality reduction method. Choose 'pca', 'nca', 'pca_nca', or 'none'.")

    print(f"\n--- Tuning and Evaluating k-NN Pipeline for: {model_name} ---")

    # PCA and NCA both reject n_components larger than the input dimensionality,
    # so candidates are capped at the feature count when it can be determined.
    shape = getattr(X_train, "shape", None)
    n_features = int(shape[1]) if shape is not None and len(shape) == 2 else None

    # Define the pipeline steps and parameter distributions
    steps = [
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=42))
    ]
    param_dist = {
        'classifier__n_neighbors': [5, 7, 9, 11, 13],
        'classifier__weights': ['distance'],
        'classifier__metric': ['manhattan']
    }

    if dim_reduction_method == 'pca':
        steps.append(('pca', PCA(random_state=42)))
        param_dist['pca__n_components'] = _limit_components([10, 20, 30, 40, 50], n_features)
    elif dim_reduction_method == 'nca':
        steps.append(('nca', NeighborhoodComponentsAnalysis(random_state=42, max_iter=200)))
        param_dist['nca__n_components'] = _limit_components([10, 20, 30, 40, 50], n_features)
    elif dim_reduction_method == 'pca_nca':
        steps.append(('pca', PCA(random_state=42)))
        steps.append(('nca', NeighborhoodComponentsAnalysis(random_state=42, max_iter=200)))
        pca_candidates = _limit_components([50, 75, 100], n_features)
        param_dist['pca__n_components'] = pca_candidates
        # NCA runs on PCA's output, so it must fit within the smallest PCA size
        # that the search may pair it with.
        param_dist['nca__n_components'] = _limit_components([10, 20, 30], min(pca_candidates))
    # 'none': no dimensionality reduction step is added

    # Add the final classifier to the pipeline
    steps.append(('classifier', KNeighborsClassifier()))

    # Use imblearn's Pipeline so SMOTE is applied during fit only
    pipeline = Pipeline(steps)

    # Use 'roc_auc' as the scoring metric for RandomizedSearchCV
    search = RandomizedSearchCV(pipeline, param_dist, n_iter=15, cv=5, scoring='roc_auc', n_jobs=1, random_state=42, verbose=1)
    search.fit(X_train, y_train)

    print(f"\n--- Results for {model_name} ---")
    print(f"Best cross-validation AUC: {search.best_score_:.4f}")

    y_pred_proba = search.predict_proba(X_test)[:, 1]
    test_auc = roc_auc_score(y_test, y_pred_proba)
    print(f"Test Set AUC: {test_auc:.4f}")

    y_pred = search.predict(X_test)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, digits=4))

    # Make sure the target directory exists so a finished search is not lost
    # to a missing folder at save time.
    os.makedirs(results_path, exist_ok=True)
    pipeline_path = os.path.join(results_path, f"{model_name}_pipeline.joblib")
    joblib.dump(search.best_estimator_, pipeline_path)
    print(f"✅ Best k-NN pipeline saved to: {pipeline_path}")
    return test_auc
# Example usage to train and evaluate all four models
def run_all_models(X_train, y_train, X_test, y_test, results_path):
    """Tune and evaluate the k-NN pipeline once per dimensionality-reduction variant.

    The variants run in a fixed order: PCA, NCA, PCA followed by NCA, and no
    reduction. Every run receives the same data splits and results directory.
    Nothing is returned.
    """
    # (model name used for reporting/saving, dimensionality-reduction method)
    variants = (
        ('PCA_Model', 'pca'),          # Model 1: k-NN with PCA
        ('NCA_Model', 'nca'),          # Model 2: k-NN with NCA
        ('PCA_NCA_Model', 'pca_nca'),  # Model 3: k-NN with PCA then NCA
        ('No_DR_Model', 'none'),       # Model 4: k-NN with no dimensionality reduction
    )
    for variant_name, method in variants:
        tune_and_evaluate_knn_pipeline(X_train, y_train, X_test, y_test, results_path, variant_name, method)

In [None]:
import numpy as np
import os
from sklearn.model_selection import train_test_split

# Read the feature archive named in the configuration cell, so the data file
# always matches MODE / FEATURE_MODE instead of a separately hard-coded path.
data_path = FEATURES_FILE_PATH

# Load the data, which returns a dictionary-like object
data = np.load(data_path)

# Extract the labels (y)
y = data['labels']

# Every array other than the labels and the 'sex' / 'age' entries is a feature
array_names = list(data.keys())
feature_names = [name for name in array_names if name not in ['labels', 'sex', 'age']]

# Flatten each feature array to (n_samples, n_features): the first axis is
# kept and all remaining axes are collapsed, so a 1D array becomes a column.
feature_arrays_2d = []
for name in feature_names:
    feature = np.atleast_1d(data[name])
    feature_2d = feature.reshape(feature.shape[0], -1)
    feature_arrays_2d.append(feature_2d)

if not feature_arrays_2d:
    raise ValueError(f"No feature arrays found in {data_path}")

# Concatenate all reshaped 2D arrays along the feature axis (axis=1)
X = np.concatenate(feature_arrays_2d, axis=1)

# Stratify so the train and test sets keep the same class proportions; the
# pipeline applies SMOTE for class imbalance, which an unstratified split
# could skew further in the held-out set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# RESULTS_PATH and MODEL_NAME come from the configuration cell
results_path = os.path.join(RESULTS_PATH, MODEL_NAME)
os.makedirs(results_path, exist_ok=True)

# Call the function to run all models
run_all_models(X_train, y_train, X_test, y_test, results_path)


--- Tuning and Evaluating k-NN Pipeline for: PCA_Model ---
Fitting 5 folds for each of 15 candidates, totalling 75 fits
