# KNN

This notebook trains a supervised classifier using k-nearest neighbors (KNN).

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
def load_and_prepare_data(features_path, labels_path, participant_info_path):
    """
    Load the feature array, the label array and the participant ids.

    Parameters
    ----------
    features_path : path of a ``.npy`` file read with ``np.load``.
    labels_path : path of a ``.npy`` file read with ``np.load``.
    participant_info_path : path of a CSV file that must contain a
        ``participant_id`` column.

    Returns
    -------
    tuple ``(X, y, participant_ids)`` where ``participant_ids`` is the
    ``participant_id`` column as a NumPy array.

    Raises
    ------
    KeyError
        If the CSV has no ``participant_id`` column.
    ValueError
        If the three inputs do not describe the same number of rows.
    """
    X = np.load(features_path)
    y = np.load(labels_path)

    participant_df = pd.read_csv(participant_info_path)
    if 'participant_id' not in participant_df.columns:
        raise KeyError(
            f"'participant_id' column not found in {participant_info_path}"
        )
    participant_ids = participant_df['participant_id'].values

    # Fail early: downstream code builds boolean masks from participant_ids
    # and applies them to X and y, which only works if all lengths agree.
    n_features, n_labels, n_ids = len(X), len(y), len(participant_ids)
    if not (n_features == n_labels == n_ids):
        raise ValueError(
            "Inconsistent number of rows: "
            f"features={n_features}, labels={n_labels}, "
            f"participant ids={n_ids}"
        )

    return X, y, participant_ids

In [None]:
def stratified_participant_split(X, y, participant_ids, test_size=0.2, random_state=42):
    """
    Split data so that every participant contributes to both train and test.

    Each participant's rows are split separately with ``train_test_split``,
    stratified on that participant's labels, and the per-participant pieces
    are then stacked together.

    Parameters
    ----------
    X : array-like, one row per sample.
    y : array-like of labels, same length as ``X``.
    participant_ids : array-like, same length as ``X``; the participant each
        row belongs to.
    test_size : forwarded to ``train_test_split`` for every participant.
    random_state : forwarded to ``train_test_split`` for every participant.

    Returns
    -------
    tuple ``(X_train, X_test, y_train, y_test)`` of NumPy arrays.

    Raises
    ------
    ValueError
        If the inputs are empty or their lengths disagree. Errors raised by
        ``train_test_split`` for an individual participant (for example when
        a stratified split is not possible) propagate unchanged.
    """
    # Coerce to arrays: with a plain list, ``participant_ids == participant``
    # evaluates to a single bool instead of an element-wise mask.
    X = np.asarray(X)
    y = np.asarray(y)
    participant_ids = np.asarray(participant_ids)

    if not (len(X) == len(y) == len(participant_ids)):
        raise ValueError(
            "X, y and participant_ids must have the same length, got "
            f"{len(X)}, {len(y)} and {len(participant_ids)}"
        )
    if len(X) == 0:
        raise ValueError("Cannot split an empty dataset")

    X_train, X_test = [], []
    y_train, y_test = [], []

    for participant in np.unique(participant_ids):
        mask = participant_ids == participant

        # Stratify on this participant's own labels so the class balance is
        # preserved within the participant, not just globally.
        X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(
            X[mask],
            y[mask],
            test_size=test_size,
            stratify=y[mask],
            random_state=random_state,
        )

        X_train.append(X_train_p)
        X_test.append(X_test_p)
        y_train.append(y_train_p)
        y_test.append(y_test_p)

    return (
        np.vstack(X_train),
        np.vstack(X_test),
        np.concatenate(y_train),
        np.concatenate(y_test),
    )

In [None]:
def train_knn_with_cv(X, y, participant_ids, n_splits=5):
    """
    Estimate the accuracy of a default KNN classifier with cross-validation.

    Feature scaling is part of the cross-validated pipeline, so for each fold
    the ``StandardScaler`` is fitted on that fold's training portion only.
    Fitting the scaler on all of ``X`` beforehand would let the held-out fold
    influence the scaling statistics and bias the scores.

    Parameters
    ----------
    X : feature matrix passed to ``cross_val_score``.
    y : labels passed to ``cross_val_score``.
    participant_ids : not used by this function; kept so existing callers
        keep working.
    n_splits : number of folds, passed as ``cv``.

    Returns
    -------
    The array of per-fold scores returned by ``cross_val_score``.
    """
    model = Pipeline([
        ("scaler", StandardScaler()),
        ("knn", KNeighborsClassifier()),
    ])

    # Note: with an integer ``cv`` and a classifier, scikit-learn uses
    # StratifiedKFold internally to maintain the class distribution.
    cv_scores = cross_val_score(model, X, y, cv=n_splits)

    print(f"Cross-validation scores: {cv_scores}")
    print(f"Average CV score: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")

    return cv_scores

In [None]:
def tune_hyperparameters(X_train, y_train):
    """
    Grid-search KNN hyperparameters with 5-fold cross-validation.

    The scaler and the classifier are searched together as one pipeline so
    that scaling is refitted on the training portion of every fold; scaling
    the whole training set up front would leak validation-fold statistics
    into the search.

    Parameters
    ----------
    X_train : training feature matrix.
    y_train : training labels.

    Returns
    -------
    dict with the best ``n_neighbors``, ``weights`` and ``metric``. Keys are
    plain ``KNeighborsClassifier`` argument names (no pipeline prefix), so the
    dict can be passed as ``KNeighborsClassifier(**best_params)``.
    """
    step_prefix = "knn__"
    pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("knn", KNeighborsClassifier()),
    ])

    # Pipeline parameters are addressed as "<step name>__<parameter>".
    param_grid = {
        f"{step_prefix}n_neighbors": [3, 5, 7, 9, 11],
        f"{step_prefix}weights": ['uniform', 'distance'],
        f"{step_prefix}metric": ['euclidean', 'manhattan'],
    }

    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Strip the step prefix so callers receive bare KNN keyword arguments.
    best_params = {
        name[len(step_prefix):]: value
        for name, value in grid_search.best_params_.items()
    }

    print("Best parameters:", best_params)
    print("Best cross-validation score:", grid_search.best_score_)

    return best_params

In [None]:
def evaluate_model(X_train, X_test, y_train, y_test, best_params):
    """
    Train the final KNN model and report its performance on the test set.

    The scaler is fitted on the training data only and then applied to the
    test data. Accuracy and a classification report are printed, and the
    confusion matrix is shown as a heatmap whose axes carry the actual class
    labels.

    Parameters
    ----------
    X_train, X_test : training and test feature matrices.
    y_train, y_test : training and test labels.
    best_params : dict of keyword arguments for ``KNeighborsClassifier``.

    Returns
    -------
    tuple ``(knn, accuracy)``: the fitted classifier and its test accuracy.
    Note that ``knn`` expects inputs scaled like the training data.
    """
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    knn = KNeighborsClassifier(**best_params)
    knn.fit(X_train_scaled, y_train)

    y_pred = knn.predict(X_test_scaled)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"\nAccuracy: {accuracy:.3f}")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Sorted union of true and predicted labels; passing it explicitly lets
    # the heatmap ticks show class labels instead of positional indices.
    labels = np.unique(np.concatenate((np.asarray(y_test), np.asarray(y_pred))))
    cm = confusion_matrix(y_test, y_pred, labels=labels)

    plt.figure(figsize=(10, 8))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=labels,
        yticklabels=labels,
    )
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.show()

    return knn, accuracy

In [None]:

# Placeholder input locations; replace with the real file paths before running.
features_path = "path/to/your/features.npy"
labels_path = "path/to/your/labels.npy"
participant_info_path = "path/to/your/segment_info.csv"

# Step 1: load features, labels and participant ids.
X, y, participant_ids = load_and_prepare_data(features_path, labels_path, participant_info_path)

# Step 2: per-participant train/test split.
X_train, X_test, y_train, y_test = stratified_participant_split(X, y, participant_ids)

# Step 3: cross-validate on the training portion.
# NOTE(review): participant_ids still covers the full dataset here, not just
# the training rows -- confirm train_knn_with_cv does not rely on alignment.
print("Performing cross-validation...")
cv_scores = train_knn_with_cv(X_train, y_train, participant_ids)

# Step 4: hyperparameter search on the training portion.
print("\nTuning hyperparameters...")
best_params = tune_hyperparameters(X_train, y_train)

# Step 5: fit with the best parameters and evaluate on the held-out test set.
print("\nEvaluating final model...")
final_model, accuracy = evaluate_model(X_train, X_test, y_train, y_test, best_params)