# Full KNN Scenarios - Stress Level Detection

Implementasi lengkap tiga skenario KNN untuk klasifikasi gangguan tidur (kolom target `Sleep Disorder`; `Stress Level` dipakai sebagai salah satu fitur):
1. **KNN Murni**: KNN dasar tanpa optimasi tambahan
2. **KNN + GridSearch**: KNN dengan optimasi hyperparameter
3. **SMOTE + GridSearch + KNN**: KNN dengan balancing data dan optimasi hyperparameter

## 📦 Library Installation dan Import

In [None]:
# Install the third-party packages this notebook imports. The leading "!" is
# an IPython shell escape, so each line runs `pip` in the system shell.
# No versions are pinned; only imbalanced-learn is force-upgraded.
!pip install --upgrade imbalanced-learn
!pip install scikit-learn
!pip install pandas
!pip install matplotlib
!pip install seaborn

In [None]:
# Data handling and plotting.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# scikit-learn: splitting and grid search, preprocessing, pipeline
# composition, the KNN classifier and evaluation metrics.
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# imbalanced-learn: SMOTE oversampling and the pipeline class used for the
# pipelines that contain a SMOTE step.
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
# Standard library: used to time the grid searches.
import time

print("✅ Library berhasil diimport")

## 📊 Data Loading dan Preprocessing

In [None]:
# Location of the semicolon-delimited source CSV.
FILE_PATH = './dataset/fix dataset 1031.csv'

# Parse with ';' as the field separator and '.' as the decimal mark. The raw
# frame stays in `df`; all cleaning happens on the independent copy `dataset`.
df = pd.read_csv(FILE_PATH, sep=';', decimal='.')
dataset = df.copy()

# Report how many rows have no value in the first column.
first_column = dataset.columns[0]
missing_in_first = dataset[first_column].isna().sum()
print("📊 DATASET INFORMATION:")
print("Jumlah baris yang memiliki nilai NaN pada kolom pertama:", missing_in_first)

# Keep only the rows whose first column is populated.
dataset = dataset.dropna(subset=[first_column])

print("Dataset shape:", dataset.shape)
display(dataset.head())

In [None]:
# Count NaN cells per column and display only the columns that contain any.
print("Missing values per column:")
nan_counts = dataset.isna().sum()
display(nan_counts.loc[nan_counts.gt(0)])

In [None]:
# Rows with no recorded disorder are assigned the 'Normal' class.
sleep_disorder = dataset['Sleep Disorder']
dataset['Sleep Disorder'] = sleep_disorder.fillna('Normal')

# Break the "systolic/diastolic" strings into two numeric columns, then
# discard the combined column.
if 'Blood Pressure' in dataset.columns:
    pressure_parts = dataset['Blood Pressure'].str.split('/', expand=True)
    dataset[['Systolic', 'Diastolic']] = pressure_parts
    for part in ('Systolic', 'Diastolic'):
        dataset[part] = pd.to_numeric(dataset[part], errors='coerce')
    dataset = dataset.drop(columns='Blood Pressure')

# Replace decimal commas with dots in the numeric columns and coerce them to
# numbers; values that still cannot be parsed become NaN.
kolom_numerik = ["Sleep Duration", "Heart Rate", "Daily Steps", "Systolic", "Diastolic"]
for col in kolom_numerik:
    if col not in dataset.columns:
        continue
    with_dots = dataset[col].apply(
        lambda cell: cell.replace(',', '.') if isinstance(cell, str) else cell
    )
    dataset[col] = pd.to_numeric(with_dots, errors='coerce')

print("✅ Data preprocessing selesai")
display(dataset.head())

## 🎯 Target Encoding dan Feature Selection

In [None]:
# Learn the class -> integer mapping from the target column, then encode it.
label_encoder = LabelEncoder().fit(dataset['Sleep Disorder'])
target_encoded = label_encoder.transform(dataset['Sleep Disorder'])

print("Target classes:", label_encoder.classes_)
print("Encoded values:", np.unique(target_encoded))

# Samples per encoded class, ordered by encoded value. Every class occurs in
# the data the encoder was fitted on, so this order matches `classes_`.
print("\n=== DISTRIBUSI KELAS ORIGINAL ====")
class_counts = pd.Series(target_encoded).value_counts().sort_index()
for class_name, count in zip(label_encoder.classes_, class_counts):
    print(f"{class_name}: {count} samples")

# Bar chart of the raw (un-encoded) target distribution.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=dataset, x='Sleep Disorder', ax=ax)
ax.set_title('Distribusi Kelas Target (Original)')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Candidate predictor columns, in the order they are fed to the model.
feature_columns = [
    "Gender",
    "Age",
    "Occupation",
    "Sleep Duration",
    "Quality of Sleep",
    "Physical Activity Level",
    "Stress Level",
    "BMI Category",
    "Systolic",
    "Diastolic",
]

# Keep only the candidates that actually exist in the cleaned frame.
available_features = [name for name in feature_columns if name in dataset.columns]
features = dataset[available_features]

print("Selected features:", available_features)
print("Features shape:", features.shape)
display(features.head())

## 🔀 Data Splitting

In [None]:
# Stratified 80/20 split with a fixed seed so the class proportions are kept
# in both parts and the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features, target_encoded, test_size=0.2, random_state=42, stratify=target_encoded
)

print("=== DATA SPLIT ====")
print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")

# The index of each distribution is the encoded class label, so the class
# name is looked up by that label rather than by row position. The names stay
# correct even if a class is missing from one of the splits.
print("\nDistribusi y_train:")
train_dist = pd.Series(y_train).value_counts().sort_index()
for label, count in train_dist.items():
    print(f"{label_encoder.classes_[label]}: {count} samples")

print("\nDistribusi y_test:")
test_dist = pd.Series(y_test).value_counts().sort_index()
for label, count in test_dist.items():
    print(f"{label_encoder.classes_[label]}: {count} samples")

## ⚙️ Pipeline Setup

In [None]:
# Partition the selected features by dtype. `select_dtypes(include='number')`
# recognises every numeric dtype, not only int64/float64. Everything else is
# treated as categorical, so no selected feature is left out of both lists;
# the ColumnTransformer below only processes the columns it is given.
numeric_columns = set(features.select_dtypes(include='number').columns)
numerical_features = [col for col in available_features if col in numeric_columns]
categorical_features = [col for col in available_features if col not in numeric_columns]

print("Numerical features:", numerical_features)
print("Categorical features:", categorical_features)

# Numeric branch: fill gaps with the column mean, then standardise.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. Categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each feature group through its own branch.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features)
])

print("✅ Preprocessor pipeline created")

# 🎯 SKENARIO 1: KNN MURNI

Implementasi KNN dasar tanpa optimasi tambahan untuk klasifikasi gangguan tidur (`Sleep Disorder`).

## 1.1. KNN Murni Training

In [None]:
# Plain KNN scenario: shared preprocessing followed by a classifier left at
# its library defaults.
knn_steps = [
    ('preprocessor', preprocessor),
    ('knn', KNeighborsClassifier()),
]
knn_pipeline = Pipeline(steps=knn_steps)

# Fit preprocessing and classifier on the training split.
print("Training KNN model...")
knn_pipeline.fit(X_train, y_train)
print("✅ Model training completed")

## 1.2. KNN Murni Evaluation

In [None]:
# Score the fitted plain-KNN pipeline on the held-out test split.
y_pred_murni = knn_pipeline.predict(X_test)
accuracy_murni = accuracy_score(y_test, y_pred_murni)

# Per-class precision/recall/F1, labelled with the original class names.
report_murni = classification_report(
    y_test, y_pred_murni, target_names=label_encoder.classes_
)

print(
    "=== HASIL EVALUASI KNN MURNI ====",
    f"Akurasi: {accuracy_murni:.4f} ({accuracy_murni*100:.2f}%)",
    "\nClassification Report:",
    report_murni,
    sep="\n",
)

In [None]:
# Confusion matrix for plain KNN, passing true labels first and predictions second.
cm_murni = confusion_matrix(y_test, y_pred_murni)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm_murni,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
    ax=ax,
)
ax.set_title("Confusion Matrix - KNN Murni")
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
fig.tight_layout()
plt.show()

## 1.3. Testing Different K Values untuk KNN Murni

In [None]:
# Sweep k = 1..20, refitting the plain-KNN pipeline for each value and
# recording its accuracy on the test split.
# NOTE(review): k is chosen by test-set accuracy, so `best_accuracy_murni`
# is selected on the same data it is reported on; consider choosing k by
# cross-validation on the training split instead.
# NOTE: the loop mutates `knn_pipeline` in place; when it finishes, the
# pipeline is fitted with k=20, not with the best k.
k_values = range(1, 21)
accuracies_murni = []

print("Testing different k values untuk KNN Murni...")
for k in k_values:
    knn_pipeline.set_params(knn__n_neighbors=k).fit(X_train, y_train)
    y_pred_k = knn_pipeline.predict(X_test)
    acc = accuracy_score(y_test, y_pred_k)
    accuracies_murni.append(acc)
    print(f"k={k}: Accuracy = {acc:.4f}")

# Highest accuracy and the first k that reached it.
best_accuracy_murni = max(accuracies_murni)
best_k_murni = k_values[accuracies_murni.index(best_accuracy_murni)]

print("\n=== BEST K VALUE untuk KNN MURNI ====")
print(f"Best k: {best_k_murni}")
print(f"Best accuracy: {best_accuracy_murni:.4f} ({best_accuracy_murni*100:.2f}%)")

In [None]:
# Accuracy curve over k for plain KNN, with the best k marked by a dashed line.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_values, accuracies_murni, marker='o', linewidth=2, markersize=6, color='blue')
ax.axvline(x=best_k_murni, color='red', linestyle='--', alpha=0.7, label=f'Best k={best_k_murni}')
ax.set_title('Accuracy vs. Number of Neighbors (k) - KNN Murni')
ax.set_xlabel('k (Number of Neighbors)')
ax.set_ylabel('Accuracy')
ax.grid(True, alpha=0.3)
ax.legend()
# One tick per tested k.
ax.set_xticks(k_values)
fig.tight_layout()
plt.show()

# 🔍 SKENARIO 2: KNN + GRIDSEARCH

Implementasi KNN dengan optimasi hyperparameter menggunakan GridSearchCV untuk klasifikasi gangguan tidur (`Sleep Disorder`).

## 2.1. Baseline Model (Before GridSearch)

In [None]:
# Reference model for scenario 2: shared preprocessing plus a KNN classifier
# with default hyperparameters.
baseline_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('knn', KNeighborsClassifier()),
])

# Fit on the training split and score on the test split.
print("Training baseline model...")
baseline_pipeline.fit(X_train, y_train)
baseline_pred = baseline_pipeline.predict(X_test)
baseline_accuracy = accuracy_score(y_test, baseline_pred)

# Print the subset of the classifier's parameters that the grid search tunes.
print("=== BASELINE MODEL (BEFORE GridSearch) ====")
print("Default hyperparameters:")
knn_params = baseline_pipeline.named_steps['knn'].get_params()
important_params = ['n_neighbors', 'weights', 'algorithm', 'metric', 'p']
for param in important_params:
    if param not in knn_params:
        continue
    print(f"  {param}: {knn_params[param]}")

print(f"\nBaseline accuracy: {baseline_accuracy:.4f} ({baseline_accuracy*100:.2f}%)")

## 2.2. GridSearchCV untuk Optimasi Hyperparameter

In [None]:
# Search space for the KNN step (keys are prefixed with the pipeline step
# name "knn__").
#
# The distance is expressed only through the Minkowski metric and its power
# `p`: p=1 is the Manhattan distance and p=2 the Euclidean distance. Listing
# 'euclidean' and 'manhattan' as separate metrics as well would fit each of
# those two distances three times, because `p` is ignored for them.
# `algorithm` is kept in the grid as it was originally specified.
param_grid = {
    'knn__n_neighbors': [3, 5, 7, 9, 11, 13, 15, 17, 19, 21],
    'knn__weights': ['uniform', 'distance'],
    'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'knn__metric': ['minkowski'],
    'knn__p': [1, 2]  # 1 = Manhattan, 2 = Euclidean
}

print("Parameter grid for GridSearch:")
for param, values in param_grid.items():
    print(f"  {param}: {values}")

# Size of the full Cartesian product of the grid.
print(f"\nTotal combinations to test: {np.prod([len(v) for v in param_grid.values()])}")

In [None]:
# Fresh preprocessing + KNN pipeline for the grid search to tune.
grid_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('knn', KNeighborsClassifier()),
])

print("Starting GridSearchCV...")
print("This may take several minutes...")

# Time the search, including construction of the search object.
start_time = time.time()

# Search over `param_grid` with 5-fold cross-validation, scored by accuracy,
# using all available cores.
grid_search = GridSearchCV(
    estimator=grid_pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

end_time = time.time()
elapsed_time = end_time - start_time

print(f"\n✅ GridSearchCV completed in {elapsed_time:.2f} seconds")

## 2.3. GridSearch Results

In [None]:
# Report the winning parameter combination and its cross-validation score.
print("=== GRIDSEARCH RESULTS ====")
print("\nBest parameters found:")
for param, value in grid_search.best_params_.items():
    print(f"  {param}: {value}")

cv_score = grid_search.best_score_
print(f"\nBest cross-validation score: {cv_score:.4f} ({cv_score*100:.2f}%)")

# Score the best estimator found by the search on the held-out test split.
best_model = grid_search.best_estimator_
y_pred_grid = best_model.predict(X_test)
grid_accuracy = accuracy_score(y_test, y_pred_grid)

print(f"Test set accuracy with best model: {grid_accuracy:.4f} ({grid_accuracy*100:.2f}%)")

In [None]:
# Side-by-side report of the baseline and the grid-searched model.
print("=== COMPARISON: BEFORE vs AFTER GridSearch ====")

# Read the "before" hyperparameters from the fitted baseline classifier
# instead of repeating them as literals, so the report always matches the
# model that produced `baseline_accuracy`.
print("\nBEFORE GridSearch (default parameters):")
baseline_knn_params = baseline_pipeline.named_steps['knn'].get_params()
for name in ('n_neighbors', 'weights', 'algorithm', 'metric', 'p'):
    print(f"  {name}: {baseline_knn_params[name]}")
print(f"  Accuracy: {baseline_accuracy:.4f} ({baseline_accuracy*100:.2f}%)")

print("\nAFTER GridSearch (optimized parameters):")
for param, value in grid_search.best_params_.items():
    # Strip the pipeline step prefix so names line up with the list above.
    param_name = param.replace('knn__', '')
    print(f"  {param_name}: {value}")
print(f"  Accuracy: {grid_accuracy:.4f} ({grid_accuracy*100:.2f}%)")

# Test-accuracy difference (tuned minus baseline).
improvement = grid_accuracy - baseline_accuracy
print(f"\nImprovement: {improvement:.4f} ({improvement*100:.2f} percentage points)")

if improvement > 0:
    print("✅ GridSearch improved the model performance!")
elif improvement == 0:
    print("➖ No improvement from GridSearch")
else:
    print("⚠️ GridSearch resulted in lower performance (may indicate overfitting)")

In [None]:
# Per-class metrics for the grid-searched model on the test split.
report_grid = classification_report(
    y_test, y_pred_grid, target_names=label_encoder.classes_
)
print(
    "=== DETAILED PERFORMANCE ANALYSIS - GridSearch ====",
    "\nClassification Report (Best Model):",
    report_grid,
    sep="\n",
)

In [None]:
# Confusion matrix for the grid-searched model, passing true labels first and predictions second.
cm_grid = confusion_matrix(y_test, y_pred_grid)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm_grid,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
    ax=ax,
)
ax.set_title("Confusion Matrix - KNN with GridSearch")
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
fig.tight_layout()
plt.show()

# 🎛️ SKENARIO 3: SMOTE + GRIDSEARCH + KNN

Implementasi KNN dengan kombinasi SMOTE untuk mengatasi ketidakseimbangan kelas dan GridSearchCV untuk optimasi hyperparameter dalam klasifikasi gangguan tidur (`Sleep Disorder`).

## 3.1. SMOTE Application untuk Melihat Efek Balancing

In [None]:
# Demonstration only: preprocess the training split and oversample it with
# SMOTE outside any pipeline, to show how the class counts change.
# NOTE: this refits the shared `preprocessor` object on X_train.
X_train_processed = preprocessor.fit_transform(X_train)

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_processed, y_train)

# Both distributions are indexed by encoded class label, so class names and
# counts are looked up by label rather than by row position. The report stays
# aligned even if a class is missing from one distribution.
print("=== COMPARISON BEFORE vs AFTER SMOTE ====")
print("\nBEFORE SMOTE:")
for label, count in train_dist.items():
    print(f"{label_encoder.classes_[label]}: {count} samples")
print(f"Total: {len(y_train)} samples")

print("\nAFTER SMOTE:")
train_dist_after = pd.Series(y_train_smote).value_counts().sort_index()
for label, count in train_dist_after.items():
    print(f"{label_encoder.classes_[label]}: {count} samples")
print(f"Total: {len(y_train_smote)} samples")

# Synthetic samples added per class; a class absent from a distribution
# counts as 0.
print("\nINCREASE PER CLASS:")
for label, class_name in enumerate(label_encoder.classes_):
    before = train_dist.get(label, 0)
    after = train_dist_after.get(label, 0)
    increase = after - before
    print(f"{class_name}: +{increase} samples (from {before} to {after})")

In [None]:
# Side-by-side bar charts of the training class counts before and after SMOTE.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

panels = [
    (ax1, train_dist, 'Class Distribution BEFORE SMOTE'),
    (ax2, train_dist_after, 'Class Distribution AFTER SMOTE'),
]
frames = []
for axis, distribution, title in panels:
    # Map each encoded label in the index back to its class name.
    frame = pd.DataFrame({
        'Class': [label_encoder.classes_[i] for i in distribution.index],
        'Count': distribution.values,
    })
    sns.barplot(data=frame, x='Class', y='Count', ax=axis)
    axis.set_title(title)
    axis.tick_params(axis='x', rotation=45)
    frames.append(frame)

# Keep the plotted tables available under their original names.
before_data, after_data = frames

plt.tight_layout()
plt.show()

## 3.2. Baseline Model dengan SMOTE (Before GridSearch)

In [None]:
# Reference model for scenario 3: preprocessing, SMOTE oversampling, then a
# KNN classifier with default hyperparameters.
baseline_smote_pipeline = ImbPipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('knn', KNeighborsClassifier()),
])

# Fit on the training split and score on the test split.
print("Training baseline SMOTE + KNN model...")
baseline_smote_pipeline.fit(X_train, y_train)
baseline_smote_pred = baseline_smote_pipeline.predict(X_test)
baseline_smote_accuracy = accuracy_score(y_test, baseline_smote_pred)

# Print the subset of the classifier's parameters that the grid search tunes.
print("=== BASELINE MODEL (BEFORE GridSearch) ====")
print("Default hyperparameters:")
knn_params_smote = baseline_smote_pipeline.named_steps['knn'].get_params()
important_params = ['n_neighbors', 'weights', 'algorithm', 'metric', 'p']
for param in important_params:
    if param not in knn_params_smote:
        continue
    print(f"  {param}: {knn_params_smote[param]}")

print(f"\nBaseline accuracy with SMOTE: {baseline_smote_accuracy:.4f} ({baseline_smote_accuracy*100:.2f}%)")

## 3.3. GridSearchCV untuk Optimasi Hyperparameter dengan SMOTE

In [None]:
# Pipeline tuned in scenario 3: preprocessing, SMOTE, then KNN. SMOTE is a
# pipeline step, so each cross-validation fit runs it together with the
# other steps.
grid_smote_pipeline = ImbPipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('knn', KNeighborsClassifier()),
])

print("Starting GridSearchCV with SMOTE + KNN...")
print("This may take several minutes...")

# Time the search, including construction of the search object.
start_time_smote = time.time()

# Same grid, folds and scoring as scenario 2. The search runs sequentially
# (n_jobs=1); the original author's stated reason is to avoid pickling
# issues with ImbPipeline.
grid_search_smote = GridSearchCV(
    estimator=grid_smote_pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=1,
    verbose=1,
)
grid_search_smote.fit(X_train, y_train)

end_time_smote = time.time()
elapsed_time_smote = end_time_smote - start_time_smote

print(f"\n✅ GridSearchCV dengan SMOTE completed in {elapsed_time_smote:.2f} seconds")

## 3.4. GridSearch + SMOTE Results

In [None]:
# Report the winning parameter combination and its cross-validation score.
print("=== GRIDSEARCH + SMOTE RESULTS ====")
print("\nBest parameters found:")
for param, value in grid_search_smote.best_params_.items():
    print(f"  {param}: {value}")

cv_score_smote = grid_search_smote.best_score_
print(f"\nBest cross-validation score: {cv_score_smote:.4f} ({cv_score_smote*100:.2f}%)")

# Score the best estimator found by the search on the held-out test split.
best_model_smote = grid_search_smote.best_estimator_
y_pred_grid_smote = best_model_smote.predict(X_test)
grid_smote_accuracy = accuracy_score(y_test, y_pred_grid_smote)

print(f"Test set accuracy with best SMOTE + GridSearch model: {grid_smote_accuracy:.4f} ({grid_smote_accuracy*100:.2f}%)")

In [None]:
# Side-by-side report of the SMOTE baseline and the grid-searched SMOTE model.
print("=== COMPARISON: BEFORE vs AFTER GridSearch (dengan SMOTE) ====")

# Read the "before" hyperparameters from the fitted SMOTE baseline classifier
# instead of repeating them as literals, so the report always matches the
# model that produced `baseline_smote_accuracy`.
print("\nBEFORE GridSearch (SMOTE + default KNN parameters):")
baseline_smote_knn_params = baseline_smote_pipeline.named_steps['knn'].get_params()
for name in ('n_neighbors', 'weights', 'algorithm', 'metric', 'p'):
    print(f"  {name}: {baseline_smote_knn_params[name]}")
print(f"  Accuracy: {baseline_smote_accuracy:.4f} ({baseline_smote_accuracy*100:.2f}%)")

print("\nAFTER GridSearch (SMOTE + optimized KNN parameters):")
for param, value in grid_search_smote.best_params_.items():
    # Strip the pipeline step prefix so names line up with the list above.
    param_name = param.replace('knn__', '')
    print(f"  {param_name}: {value}")
print(f"  Accuracy: {grid_smote_accuracy:.4f} ({grid_smote_accuracy*100:.2f}%)")

# Test-accuracy difference (tuned minus baseline).
improvement_smote = grid_smote_accuracy - baseline_smote_accuracy
print(f"\nImprovement: {improvement_smote:.4f} ({improvement_smote*100:.2f} percentage points)")

if improvement_smote > 0:
    print("✅ GridSearch improved SMOTE + KNN model performance!")
elif improvement_smote == 0:
    print("➖ No improvement from GridSearch")
else:
    print("⚠️ GridSearch resulted in lower performance (may indicate overfitting)")

In [None]:
# Per-class metrics for the grid-searched SMOTE model on the test split.
report_grid_smote = classification_report(
    y_test, y_pred_grid_smote, target_names=label_encoder.classes_
)
print(
    "=== DETAILED PERFORMANCE ANALYSIS - SMOTE + GridSearch ====",
    "\nClassification Report (Best SMOTE + GridSearch Model):",
    report_grid_smote,
    sep="\n",
)

In [None]:
# Confusion matrix for the grid-searched SMOTE model, passing true labels first and predictions second.
cm_grid_smote = confusion_matrix(y_test, y_pred_grid_smote)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm_grid_smote,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
    ax=ax,
)
ax.set_title("Confusion Matrix - SMOTE + GridSearch + KNN")
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
fig.tight_layout()
plt.show()

# 📊 FINAL COMPARISON - SEMUA SKENARIO

Perbandingan lengkap antara ketiga skenario KNN yang telah diimplementasikan.

## Comparison Akurasi Semua Model

In [None]:
# Test accuracy of every model variant, printed as a numbered list.
print("=== COMPARISON: SEMUA SKENARIO KNN ====")
scenario_scores = [
    ("KNN Murni (k=5)", accuracy_murni),
    (f"KNN Murni Best K (k={best_k_murni})", best_accuracy_murni),
    ("KNN + GridSearch", grid_accuracy),
    ("KNN + SMOTE", baseline_smote_accuracy),
    ("SMOTE + GridSearch + KNN", grid_smote_accuracy),
]
for position, (scenario, score) in enumerate(scenario_scores, start=1):
    print(f"{position}. {scenario}: {score:.4f} ({score*100:.2f}%)")

# Pairwise accuracy gaps, printed in signed percentage points.
print("\nImprovement Analysis:")
accuracy_gaps = [
    ("KNN Murni Best K vs KNN Murni (k=5)", best_accuracy_murni - accuracy_murni),
    ("KNN + GridSearch vs KNN Murni", grid_accuracy - accuracy_murni),
    ("KNN + SMOTE vs KNN Murni", baseline_smote_accuracy - accuracy_murni),
    ("SMOTE + GridSearch vs KNN Murni", grid_smote_accuracy - accuracy_murni),
    ("SMOTE + GridSearch vs KNN + GridSearch", grid_smote_accuracy - grid_accuracy),
]
for comparison, gap in accuracy_gaps:
    print(f"{comparison}: {gap*100:+.2f} pp")

In [None]:
# Bar chart of test accuracy for every model variant.
models = [
    'KNN Murni\n(k=5)',
    f'KNN Murni\n(k={best_k_murni})',
    'KNN +\nGridSearch',
    'KNN +\nSMOTE',
    'SMOTE +\nGridSearch +\nKNN',
]
accuracies = [
    accuracy_murni,
    best_accuracy_murni,
    grid_accuracy,
    baseline_smote_accuracy,
    grid_smote_accuracy,
]
colors = ['lightblue', 'blue', 'orange', 'lightgreen', 'darkgreen']

fig, ax = plt.subplots(figsize=(14, 8))
bars = ax.bar(models, accuracies, color=colors, alpha=0.7)
ax.set_title('Performance Comparison - All KNN Scenarios', fontsize=16, fontweight='bold')
ax.set_ylabel('Accuracy', fontsize=12)
ax.set_ylim(0, 1)

# Write each accuracy just above its bar, centred horizontally.
for rect, value in zip(bars, accuracies):
    ax.text(
        rect.get_x() + rect.get_width()/2,
        rect.get_height() + 0.01,
        f'{value:.4f}',
        ha='center',
        va='bottom',
        fontweight='bold',
        fontsize=10,
    )

plt.xticks(rotation=45, ha='right')
ax.grid(True, alpha=0.3, axis='y')
fig.tight_layout()
plt.show()

## Confusion Matrix Comparison

In [None]:
# 2x3 grid with one confusion matrix per model variant; the sixth cell is
# left empty.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Confusion Matrix Comparison - All KNN Scenarios', fontsize=16, fontweight='bold')

# The best-k variant has no stored predictions, so refit the plain-KNN
# pipeline with that k and predict again.
knn_pipeline.set_params(knn__n_neighbors=best_k_murni).fit(X_train, y_train)
y_pred_best_k = knn_pipeline.predict(X_test)
cm_best_k = confusion_matrix(y_test, y_pred_best_k)

# The SMOTE-only variant reuses the predictions of the SMOTE baseline.
cm_smote_only = confusion_matrix(y_test, baseline_smote_pred)

panels = [
    (axes[0,0], cm_murni, f'KNN Murni (k=5)\nAccuracy: {accuracy_murni:.4f}'),
    (axes[0,1], cm_best_k, f'KNN Murni (k={best_k_murni})\nAccuracy: {best_accuracy_murni:.4f}'),
    (axes[0,2], cm_grid, f'KNN + GridSearch\nAccuracy: {grid_accuracy:.4f}'),
    (axes[1,0], cm_smote_only, f'KNN + SMOTE\nAccuracy: {baseline_smote_accuracy:.4f}'),
    (axes[1,1], cm_grid_smote, f'SMOTE + GridSearch + KNN\nAccuracy: {grid_smote_accuracy:.4f}'),
]
for axis, matrix, title in panels:
    sns.heatmap(
        matrix,
        annot=True,
        fmt="d",
        cmap="Blues",
        ax=axis,
        xticklabels=label_encoder.classes_,
        yticklabels=label_encoder.classes_,
    )
    axis.set_title(title)

# Nothing to draw in the sixth cell.
axes[1,2].axis('off')

plt.tight_layout()
plt.show()

## Summary dan Analisis Hyperparameter

In [None]:
# Consolidated text report: dataset facts, per-scenario accuracy, the best
# model, tuned hyperparameters, effect sizes and grid-search timings.
print("=" * 80)
print("FINAL SUMMARY - FULL KNN SCENARIOS ANALYSIS")
print("=" * 80)

print("\n📊 DATASET INFORMATION:")
print(f"   Total samples: {len(dataset)}")
print(f"   Features used: {len(available_features)}")
print(f"   Classes: {len(label_encoder.classes_)} ({', '.join(label_encoder.classes_)})")
print(f"   Train/Test split: {len(X_train)}/{len(X_test)} samples")

print("\n🎯 SCENARIO RESULTS:")
print(f"   1. KNN Murni (k=5): {accuracy_murni:.4f} ({accuracy_murni*100:.2f}%)")
print(f"   2. KNN Murni Best K (k={best_k_murni}): {best_accuracy_murni:.4f} ({best_accuracy_murni*100:.2f}%)")
print(f"   3. KNN + GridSearch: {grid_accuracy:.4f} ({grid_accuracy*100:.2f}%)")
print(f"   4. KNN + SMOTE: {baseline_smote_accuracy:.4f} ({baseline_smote_accuracy*100:.2f}%)")
print(f"   5. SMOTE + GridSearch + KNN: {grid_smote_accuracy:.4f} ({grid_smote_accuracy*100:.2f}%)")

# Pick the variant with the highest test accuracy; on ties np.argmax keeps
# the first one in list order.
all_accuracies = [accuracy_murni, best_accuracy_murni, grid_accuracy, baseline_smote_accuracy, grid_smote_accuracy]
all_models = ['KNN Murni (k=5)', f'KNN Murni (k={best_k_murni})', 'KNN + GridSearch', 'KNN + SMOTE', 'SMOTE + GridSearch + KNN']
best_idx = np.argmax(all_accuracies)
best_model_name = all_models[best_idx]
best_accuracy = all_accuracies[best_idx]

print(f"\n🏆 BEST PERFORMING MODEL: {best_model_name}")
print(f"   Accuracy: {best_accuracy:.4f} ({best_accuracy*100:.2f}%)")

print("\n⚙️ OPTIMAL HYPERPARAMETERS:")
print("   KNN + GridSearch:")
for param, value in grid_search.best_params_.items():
    # Strip the pipeline step prefix for readability.
    param_name = param.replace('knn__', '')
    print(f"     {param_name}: {value}")

print("   SMOTE + GridSearch + KNN:")
for param, value in grid_search_smote.best_params_.items():
    param_name = param.replace('knn__', '')
    print(f"     {param_name}: {value}")

# Each effect is the test-accuracy difference against plain KNN (k=5).
print("\n📈 KEY INSIGHTS:")
smote_effect = baseline_smote_accuracy - accuracy_murni
gridsearch_effect = grid_accuracy - accuracy_murni
combined_effect = grid_smote_accuracy - accuracy_murni

print(f"   • SMOTE effect alone: {smote_effect*100:+.2f} percentage points")
print(f"   • GridSearch effect alone: {gridsearch_effect*100:+.2f} percentage points")
print(f"   • Combined SMOTE + GridSearch effect: {combined_effect*100:+.2f} percentage points")
# Signed format, consistent with the three lines above.
print(f"   • Best manual k-tuning: k={best_k_murni} ({(best_accuracy_murni-accuracy_murni)*100:+.2f} pp)")

if combined_effect > max(smote_effect, gridsearch_effect):
    print("   ✅ Combining SMOTE + GridSearch provides the best results!")
elif gridsearch_effect > smote_effect:
    print("   ✅ GridSearch optimization is more effective than SMOTE balancing")
elif smote_effect > gridsearch_effect:
    print("   ✅ SMOTE balancing is more effective than GridSearch optimization")
else:
    # Equal effects: neither technique can be called more effective.
    print("   ➖ SMOTE balancing and GridSearch optimization have the same effect")

# Timings are printed only if the corresponding search cell has been run.
print("\n🔍 COMPUTATIONAL COMPLEXITY:")
if 'elapsed_time' in locals():
    print(f"   • GridSearch time: {elapsed_time:.2f} seconds")
if 'elapsed_time_smote' in locals():
    print(f"   • SMOTE + GridSearch time: {elapsed_time_smote:.2f} seconds")
print(f"   • Total combinations tested: {np.prod([len(v) for v in param_grid.values()])}")

print("\n" + "=" * 80)