# KNN + SMOTE - Stress Level Detection

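Implementasi KNN dengan SMOTE untuk mengatasi ketidakseimbangan kelas dalam klasifikasi tingkat stres. Catatan: pada kode di bawah, target yang diprediksi adalah kolom `Sleep Disorder`, sedangkan `Stress Level` digunakan sebagai salah satu fitur.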

In [None]:
# Install the third-party packages used by this notebook. The leading "!" makes
# the notebook run each line as a shell command; imbalanced-learn and
# scikit-learn are upgraded, the other packages are installed only if missing.
!pip install --upgrade imbalanced-learn
!pip install --upgrade scikit-learn
!pip install pandas
!pip install matplotlib
!pip install seaborn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Reached only when every import in this cell succeeded.
print("✅ Library berhasil diimport")

## 1. Data Loading dan Preprocessing

In [None]:
FILE_PATH = './dataset/fix dataset 1031.csv'

# The file is semicolon-delimited. decimal='.' is also the pandas default, so
# values written with a decimal comma are NOT converted at load time; they stay
# as strings until they are cleaned explicitly.
df = pd.read_csv(FILE_PATH, sep=';', decimal='.')
# Work on a copy so the raw frame `df` stays untouched.
dataset = df.copy()

# Count the rows whose first column is NaN.
first_column = dataset.columns[0]
missing_in_first_column = dataset[first_column].isna().sum()
print("📊 DATASET INFORMATION:")
print("Jumlah baris yang memiliki nilai NaN pada kolom pertama:", missing_in_first_column)

# Clean the data by dropping the rows whose first column is NaN.
dataset = dataset.dropna(subset=[first_column])

print("Dataset shape:", dataset.shape)
# `display` is provided by the notebook environment, not imported here.
display(dataset.head())

In [None]:
def _to_numeric_series(series):
    """Return *series* as numbers, accepting ',' as the decimal separator.

    String values have ',' replaced by '.' before conversion; anything that
    still cannot be parsed becomes NaN (errors='coerce').
    """
    cleaned = series.map(lambda v: v.replace(',', '.') if isinstance(v, str) else v)
    return pd.to_numeric(cleaned, errors='coerce')


# Rows without a recorded sleep disorder are assigned the 'Normal' class.
dataset['Sleep Disorder'] = dataset['Sleep Disorder'].fillna('Normal')

# Split "systolic/diastolic" into two numeric columns.
if 'Blood Pressure' in dataset.columns:
    # n=1 + reindex guarantees exactly two parts per row, so the assignment
    # cannot fail when a value has no '/' (second part becomes NaN) or more
    # than one '/' (the unparsable remainder is coerced to NaN).
    bp_parts = (
        dataset['Blood Pressure']
        .str.split('/', n=1, expand=True)
        .reindex(columns=[0, 1])
    )
    # Decimal commas must be normalised BEFORE the numeric conversion;
    # converting first would already have turned such values into NaN.
    dataset['Systolic'] = _to_numeric_series(bp_parts[0])
    dataset['Diastolic'] = _to_numeric_series(bp_parts[1])
    dataset = dataset.drop('Blood Pressure', axis=1)

# Normalise the remaining numeric columns (the conversion is idempotent for
# Systolic/Diastolic, which are already numeric at this point).
kolom_numerik = ["Sleep Duration", "Heart Rate", "Daily Steps", "Systolic", "Diastolic"]
for col in kolom_numerik:
    if col in dataset.columns:
        dataset[col] = _to_numeric_series(dataset[col])

print("✅ Data preprocessing selesai")

## 2. Target Encoding dan Feature Selection

In [None]:
# Encode the 'Sleep Disorder' labels as integers; the fitted encoder is kept
# so the integer codes can be mapped back to class names.
label_encoder = LabelEncoder()
target_encoded = label_encoder.fit_transform(dataset['Sleep Disorder'])

print("Target classes:", label_encoder.classes_)
print("Encoded values:", np.unique(target_encoded))

# Class distribution of the full dataset, before any resampling.
print("\n=== DISTRIBUSI KELAS BEFORE SMOTE ====")
# sort_index() orders the counts by encoded label, which is the same order as
# label_encoder.classes_, so the two can be paired positionally.
class_counts_before = pd.Series(target_encoded).value_counts().sort_index()
for class_name, n_samples in zip(label_encoder.classes_, class_counts_before):
    print(f"{class_name}: {n_samples} samples")

total_before = target_encoded.shape[0]
print(f"\nTotal samples before SMOTE: {total_before}")

In [None]:
# Candidate predictor columns.
feature_columns = [
    "Gender",
    "Age",
    "Occupation",
    "Sleep Duration",
    "Quality of Sleep",
    "Physical Activity Level",
    "Stress Level",
    "BMI Category",
    "Systolic",
    "Diastolic",
]

# Keep only the candidates that actually exist in the dataset, in the order
# listed above. Missing candidates are skipped without an error.
existing_columns = set(dataset.columns)
available_features = [name for name in feature_columns if name in existing_columns]
features = dataset.loc[:, available_features]

print("Selected features:", available_features)
print("Features shape:", features.shape)

## 3. Data Splitting

In [None]:
# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# class proportions the same in both parts; random_state fixes the shuffle.
split_options = dict(test_size=0.2, random_state=42, stratify=target_encoded)
X_train, X_test, y_train, y_test = train_test_split(features, target_encoded, **split_options)

print("=== DATA SPLIT ====")
print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

# Class counts of the training labels, ordered by encoded label so they line
# up positionally with label_encoder.classes_.
print("\nDistribusi y_train BEFORE SMOTE:")
train_dist_before = pd.Series(y_train).value_counts().sort_index()
for class_name, n_samples in zip(label_encoder.classes_, train_dist_before):
    print(f"{class_name}: {n_samples} samples")

## 4. Pipeline Setup dengan SMOTE

In [None]:
# Split the selected features by dtype so each group gets its own preprocessing.
# select_dtypes(include='number') matches every numeric dtype (int32, float32,
# nullable Int64, ...), not only 'int64' and 'float64'.
numerical_features = features.select_dtypes(include='number').columns.tolist()
# Every remaining selected column is treated as categorical. Testing for
# dtype == 'object' would miss pandas string dtypes, and a column that lands in
# neither list is dropped silently by ColumnTransformer (remainder='drop').
categorical_features = [col for col in available_features if col not in numerical_features]

print("Numerical features:", numerical_features)
print("Categorical features:", categorical_features)

# Numeric columns: fill gaps with the column mean, then standardise so that no
# single feature dominates the KNN distance because of its scale.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' encodes categories unseen during fit as
# all zeros instead of raising at predict time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group to its transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features)
])

print("✅ Preprocessor pipeline created")

## 5. SMOTE Application

In [None]:
def _print_class_counts(counts, total):
    """Print one '<class>: <n> samples' line per class, followed by the total."""
    for class_name, n_samples in zip(label_encoder.classes_, counts):
        print(f"{class_name}: {n_samples} samples")
    print(f"Total: {total} samples")


def _count_at(counts, position):
    """Return the count stored at *position*, or 0 if the Series is shorter."""
    return counts.iloc[position] if position < len(counts) else 0


# Preprocess the training split on its own so the effect of SMOTE can be
# inspected directly (this also fits `preprocessor` on X_train).
X_train_processed = preprocessor.fit_transform(X_train)

# Oversample the minority classes of the training data only.
# NOTE(review): if the preprocessor one-hot encodes categorical columns, plain
# SMOTE interpolates between the 0/1 indicators; imblearn's SMOTENC targets
# mixed numeric/categorical data - confirm which behaviour is intended.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_processed, y_train)

print("=== COMPARISON BEFORE vs AFTER SMOTE ====")
print("\nBEFORE SMOTE:")
_print_class_counts(train_dist_before, len(y_train))

print("\nAFTER SMOTE:")
train_dist_after = pd.Series(y_train_smote).value_counts().sort_index()
_print_class_counts(train_dist_after, len(y_train_smote))

# Number of synthetic samples added per class.
print("\nINCREASE PER CLASS:")
for position, class_name in enumerate(label_encoder.classes_):
    before = _count_at(train_dist_before, position)
    after = _count_at(train_dist_after, position)
    increase = after - before
    print(f"{class_name}: +{increase} samples (from {before} to {after})")

In [None]:
def _distribution_frame(counts):
    """Build a Class/Count table from a Series indexed by encoded label."""
    return pd.DataFrame({
        'Class': [label_encoder.classes_[code] for code in counts.index],
        'Count': counts.values
    })


# Side-by-side bar charts of the training class counts before and after SMOTE.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

before_data = _distribution_frame(train_dist_before)
after_data = _distribution_frame(train_dist_after)

panels = (
    (ax1, before_data, 'Class Distribution BEFORE SMOTE'),
    (ax2, after_data, 'Class Distribution AFTER SMOTE'),
)
for panel_axis, panel_data, panel_title in panels:
    sns.barplot(data=panel_data, x='Class', y='Count', ax=panel_axis)
    panel_axis.set_title(panel_title)
    # Rotate the class names so longer labels do not overlap.
    panel_axis.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 6. KNN + SMOTE Training

In [None]:
# Build the three stages separately, then chain them. The imblearn pipeline is
# used because it accepts a resampling step (SMOTE) between preprocessing and
# the classifier.
smote_step = SMOTE(random_state=42)
knn_step = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
knn_smote_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', smote_step),
    ('knn', knn_step),
])

# Fit preprocessing, resampling and the classifier on the training split.
print("Training KNN + SMOTE model...")
knn_smote_pipeline.fit(X_train, y_train)
print("✅ Model training completed")

## 7. Model Evaluation

In [None]:
# Predict the held-out test split with the fitted pipeline.
y_pred_smote = knn_smote_pipeline.predict(X_test)

# Overall accuracy plus the per-class precision/recall/F1 table.
accuracy_smote = accuracy_score(y_test, y_pred_smote)
report_smote = classification_report(
    y_test,
    y_pred_smote,
    target_names=label_encoder.classes_,
)

print("=== HASIL EVALUASI KNN + SMOTE ====")
print(f"Akurasi: {accuracy_smote:.4f} ({accuracy_smote*100:.2f}%)")
print("\nClassification Report:")
print(report_smote)

In [None]:
# Confusion matrix of the test predictions (rows: true class, columns: predicted).
cm_smote = confusion_matrix(y_test, y_pred_smote)

class_names = label_encoder.classes_
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm_smote,
    annot=True,
    fmt="d",  # counts are integers
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=cm_ax,
)
cm_ax.set_title("Confusion Matrix - KNN + SMOTE")
cm_ax.set_xlabel("Predicted Label")
cm_ax.set_ylabel("True Label")
cm_fig.tight_layout()
plt.show()

## 8. Testing Different K Values dengan SMOTE

In [None]:
# Choose k by cross-validation on the TRAINING split only.
# Selecting k by its accuracy on X_test and then reporting the accuracy of that
# same k on X_test gives an optimistically biased estimate; the test split must
# stay untouched until the final evaluation.
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold, cross_val_score

k_values = range(1, 21)
accuracies_smote = []

# One fixed set of stratified folds so every k is scored on identical splits.
cv_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print("Testing different k values with SMOTE...")
for k in k_values:
    # Work on a clone so the already-fitted `knn_smote_pipeline` keeps its own
    # k and fitted state. Inside the imblearn pipeline SMOTE runs only while
    # fitting, so each validation fold is scored on real, unresampled samples.
    candidate = clone(knn_smote_pipeline).set_params(knn__n_neighbors=k)
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=cv_folds, scoring='accuracy')
    acc = fold_scores.mean()
    accuracies_smote.append(acc)
    print(f"k={k}: CV Accuracy = {acc:.4f}")

# Best k = highest mean cross-validated accuracy (the smallest k wins ties,
# because argmax returns the first maximum).
best_k_smote = k_values[np.argmax(accuracies_smote)]
best_accuracy_smote = max(accuracies_smote)

print(f"\n=== BEST K VALUE untuk KNN + SMOTE ====")
print(f"Best k: {best_k_smote}")
print(f"Best CV accuracy: {best_accuracy_smote:.4f} ({best_accuracy_smote*100:.2f}%)")

In [None]:
# Line chart of the accuracy recorded for each k, with the selected k marked.
k_fig, k_ax = plt.subplots(figsize=(10, 6))
k_ax.plot(k_values, accuracies_smote, marker='o', linewidth=2, markersize=6, color='green')
k_ax.axvline(
    x=best_k_smote,
    color='red',
    linestyle='--',
    alpha=0.7,
    label=f'Best k={best_k_smote}',
)
k_ax.set_title('Accuracy vs. Number of Neighbors (k) - KNN + SMOTE')
k_ax.set_xlabel('k (Number of Neighbors)')
k_ax.set_ylabel('Accuracy')
k_ax.grid(True, alpha=0.3)
k_ax.legend()
# One tick per tested k.
k_ax.set_xticks(k_values)
k_fig.tight_layout()
plt.show()

## 9. Final Model dengan Best K

In [None]:
# Rebuild the pipeline with the selected k and fit it on the training split.
final_steps = [
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('knn', KNeighborsClassifier(n_neighbors=best_k_smote)),
]
final_knn_smote = ImbPipeline(steps=final_steps)
final_knn_smote.fit(X_train, y_train)

# Score the final model on the held-out test split.
final_pred_smote = final_knn_smote.predict(X_test)
final_accuracy_smote = accuracy_score(y_test, final_pred_smote)
final_report_smote = classification_report(
    y_test,
    final_pred_smote,
    target_names=label_encoder.classes_,
)

print("=== FINAL MODEL PERFORMANCE (KNN + SMOTE) ====")
print(f"Final model dengan k={best_k_smote}")
print(f"Final accuracy: {final_accuracy_smote:.4f} ({final_accuracy_smote*100:.2f}%)")
print("\nFinal Classification Report:")
print(final_report_smote)

## 10. Comparison with Original Data

In [None]:
# Baseline: the same preprocessing and the same k, but without the SMOTE step,
# so the difference in test accuracy can be attributed to oversampling.
baseline_steps = [
    ('preprocessor', preprocessor),
    ('knn', KNeighborsClassifier(n_neighbors=best_k_smote)),
]
knn_original = Pipeline(steps=baseline_steps)
knn_original.fit(X_train, y_train)

pred_original = knn_original.predict(X_test)
accuracy_original = accuracy_score(y_test, pred_original)

# A positive value means the SMOTE model scored higher than the baseline.
accuracy_gain = final_accuracy_smote - accuracy_original

print("=== COMPARISON: KNN vs KNN + SMOTE ====")
print(f"KNN Original (k={best_k_smote}): {accuracy_original:.4f} ({accuracy_original*100:.2f}%)")
print(f"KNN + SMOTE (k={best_k_smote}): {final_accuracy_smote:.4f} ({final_accuracy_smote*100:.2f}%)")
print(f"Improvement: {(accuracy_gain)*100:.2f} percentage points")

## Summary

**KNN + SMOTE Results:**
- **Data before SMOTE**: Class distribution as shown above
- **Data after SMOTE**: Balanced class distribution
- **Best k value**: Determined through systematic testing
- **Performance improvement**: Comparison with original KNN
- **SMOTE Impact**: Addresses class imbalance by generating synthetic samples