# KNN Murni - Stress Level Detection

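Implementasi KNN dasar tanpa optimasi tambahan. Catatan: target yang diklasifikasikan pada notebook ini adalah kolom `Sleep Disorder`; `Stress Level` dipakai sebagai salah satu fitur.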

In [None]:
!pip install --upgrade imbalanced-learn
!pip install scikit-learn
!pip install pandas
!pip install matplotlib
!pip install seaborn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("✅ Library berhasil diimport")

## 1. Data Loading dan Preprocessing

In [None]:
FILE_PATH = './dataset/fix dataset 1031.csv'

# Load the semicolon-delimited CSV. '.' is already pandas' default decimal
# mark, so it is not passed explicitly; comma decimals are normalised in the
# preprocessing cell further down.
df = pd.read_csv(FILE_PATH, sep=';')
# Work on a copy so the raw frame `df` stays untouched.
dataset = df.copy()

# Report how many rows have no value in the first column, then drop them.
print("📊 DATASET INFORMATION:")
first_column = dataset.columns[0]
missing_in_first = dataset[first_column].isna().sum()
print("Jumlah baris yang memiliki nilai NaN pada kolom pertama:", missing_in_first)

dataset = dataset.dropna(subset=[first_column])

print("Dataset shape:", dataset.shape)
display(dataset.head())

In [None]:
# Count NaN values per column and show only the columns that have any.
print("Missing values per column:")
nan_counts = dataset.isna().sum()
display(nan_counts.loc[nan_counts.gt(0)])

In [None]:
def _comma_to_dot(value):
    """Return *value* with decimal commas replaced by dots; non-strings pass through."""
    return value.replace(',', '.') if isinstance(value, str) else value


# A missing 'Sleep Disorder' entry is treated as the 'Normal' class.
dataset['Sleep Disorder'] = dataset['Sleep Disorder'].fillna('Normal')

# Turn the "systolic/diastolic" text column into two numeric columns.
if 'Blood Pressure' in dataset.columns:
    dataset[['Systolic', 'Diastolic']] = dataset['Blood Pressure'].str.split('/', expand=True)
    for pressure_col in ('Systolic', 'Diastolic'):
        # Unparseable parts become NaN instead of raising.
        dataset[pressure_col] = pd.to_numeric(dataset[pressure_col], errors='coerce')
    dataset = dataset.drop(columns='Blood Pressure')

# Normalise comma decimals and coerce the listed columns to numeric.
kolom_numerik = ["Sleep Duration", "Heart Rate", "Daily Steps", "Systolic", "Diastolic"]
for col in kolom_numerik:
    if col not in dataset.columns:
        continue
    dataset[col] = pd.to_numeric(dataset[col].apply(_comma_to_dot), errors='coerce')

print("✅ Data preprocessing selesai")
display(dataset.head())

## 2. Target Encoding dan Feature Selection

In [None]:
# Encode the 'Sleep Disorder' target as integers.
label_encoder = LabelEncoder()
target_encoded = label_encoder.fit_transform(dataset['Sleep Disorder'])

print("Target classes:", label_encoder.classes_)
print("Encoded values:", np.unique(target_encoded))

# Per-class sample counts, ordered by encoded label.
print("\n=== DISTRIBUSI KELAS ORIGINAL ====")
class_counts = pd.Series(target_encoded).value_counts().sort_index()
for encoded_label, count in class_counts.items():
    # Look the name up by encoded value rather than by loop position.
    print(f"{label_encoder.classes_[encoded_label]}: {count} samples")

# Bar chart of the same distribution, using the original string labels.
plt.figure(figsize=(8, 5))
sns.countplot(x='Sleep Disorder', data=dataset)
plt.title('Distribusi Kelas Target (Original)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Candidate feature columns for the model.
feature_columns = [
    "Gender",
    "Age",
    "Occupation",
    "Sleep Duration",
    "Quality of Sleep",
    "Physical Activity Level",
    "Stress Level",
    "BMI Category",
    "Systolic",
    "Diastolic",
]

# Keep only the candidates that actually exist in the dataset; missing ones
# are skipped without an error.
available_features = [name for name in feature_columns if name in dataset.columns]
features = dataset.loc[:, available_features]

print("Selected features:", available_features)
print("Features shape:", features.shape)
display(features.head())

## 3. Data Splitting

In [None]:
# Stratified 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target_encoded,
    test_size=0.2,
    random_state=42,
    stratify=target_encoded,
)

print("=== DATA SPLIT ====")
print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")

# Per-class counts in each split, ordered by encoded label.
train_dist = pd.Series(y_train).value_counts().sort_index()
test_dist = pd.Series(y_test).value_counts().sort_index()

for heading, distribution in (
    ("\nDistribusi y_train:", train_dist),
    ("\nDistribusi y_test:", test_dist),
):
    print(heading)
    # Resolve the class name from the encoded label (the Series index), not
    # from the loop position: a class absent from a split would otherwise
    # shift every following name by one.
    for encoded_label, count in distribution.items():
        print(f"{label_encoder.classes_[encoded_label]}: {count} samples")

## 4. Pipeline Setup

In [None]:
# Partition the selected features into numeric and non-numeric columns.
# `is_numeric_dtype` accepts every numeric dtype (not only int64/float64), and
# everything else is treated as categorical. This way no column falls between
# the two lists and gets silently dropped by the ColumnTransformer, whose
# default for unlisted columns is to drop them.
numerical_features = [
    col for col in available_features
    if pd.api.types.is_numeric_dtype(features[col])
]
categorical_features = [
    col for col in available_features if col not in numerical_features
]

print("Numerical features:", numerical_features)
print("Categorical features:", categorical_features)

# Numeric columns: fill gaps with the column mean, then standardise.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Apply each transformer to its own column group.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features)
])

print("✅ Preprocessor pipeline created")

## 5. KNN Murni Training

In [None]:
# Baseline model: preprocessing followed by a KNN classifier left at its
# default hyperparameters.
knn_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('knn', KNeighborsClassifier()),
])

# Fit preprocessing and classifier on the training split only.
print("Training KNN model...")
knn_pipeline.fit(X_train, y_train)
print("✅ Model training completed")

## 6. Model Evaluation

In [None]:
# Predict on the held-out split and score the baseline model.
y_pred = knn_pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
baseline_report = classification_report(
    y_test, y_pred, target_names=label_encoder.classes_
)

print("=== HASIL EVALUASI KNN MURNI ====")
print(f"Akurasi: {accuracy:.4f} ({accuracy*100:.2f}%)")
print("\nClassification Report:")
print(baseline_report)

In [None]:
# Confusion matrix of the baseline predictions, drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)
class_names = label_encoder.classes_

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title("Confusion Matrix - KNN Murni")
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
fig.tight_layout()
plt.show()

## 7. Testing Different K Values

In [None]:
from sklearn.base import clone

# Evaluate k = 1..20 and record the test accuracy for each value.
# NOTE(review): k is selected by test-set accuracy, so "best accuracy" is an
# optimistic estimate; consider cross-validation on the training split.
k_values = range(1, 21)
accuracies = []

print("Testing different k values...")
for k in k_values:
    # Fit an unfitted copy for each k so the baseline `knn_pipeline` keeps its
    # own hyperparameters and fitted state (mutating it in place would leave
    # it at k=20 after this loop).
    candidate = clone(knn_pipeline).set_params(knn__n_neighbors=k)
    candidate.fit(X_train, y_train)
    acc = accuracy_score(y_test, candidate.predict(X_test))
    accuracies.append(acc)
    print(f"k={k}: Accuracy = {acc:.4f}")

# Pick the first k that reaches the highest accuracy.
best_index = int(np.argmax(accuracies))
best_k = k_values[best_index]
best_accuracy = accuracies[best_index]

print("\n=== BEST K VALUE ====")
print(f"Best k: {best_k}")
print(f"Best accuracy: {best_accuracy:.4f} ({best_accuracy*100:.2f}%)")

In [None]:
# Line chart of test accuracy per k, with the selected k marked in red.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_values, accuracies, marker='o', linewidth=2, markersize=6)
ax.axvline(x=best_k, color='red', linestyle='--', alpha=0.7, label=f'Best k={best_k}')
ax.set_title('Accuracy vs. Number of Neighbors (k) - KNN Murni')
ax.set_xlabel('k (Number of Neighbors)')
ax.set_ylabel('Accuracy')
ax.grid(True, alpha=0.3)
ax.legend()
# One tick per evaluated k.
ax.set_xticks(list(k_values))
fig.tight_layout()
plt.show()

## 8. Final Model dengan Best K

In [None]:
# Train the final model with the k selected in the sweep. The classifier must
# receive `best_k` explicitly; a bare KNeighborsClassifier() would fall back
# to its default and contradict the "k=best_k" report printed below.
final_knn = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('knn', KNeighborsClassifier(n_neighbors=best_k))
])

final_knn.fit(X_train, y_train)
final_pred = final_knn.predict(X_test)
final_accuracy = accuracy_score(y_test, final_pred)

print("=== FINAL MODEL PERFORMANCE ====")
print(f"Final model dengan k={best_k}")
print(f"Final accuracy: {final_accuracy:.4f} ({final_accuracy*100:.2f}%)")
print("\nFinal Classification Report:")
print(classification_report(y_test, final_pred, target_names=label_encoder.classes_))

## Summary

**KNN Murni Results:**
- Dataset menggunakan data asli tanpa balancing
- Best k value yang ditemukan melalui testing manual
- Akurasi final model dengan hyperparameter terbaik

In [None]:
# pip install --upgrade imbalanced-learn

In [None]:
# pip install pandas matplotlib seaborn

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix


import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import pandas as pd

FILE_PATH = './dataset/fix dataset 1031.csv'

# Read the semicolon-delimited CSV; keep the raw frame in `df` and work on a
# copy in `dataset`.
df = pd.read_csv(FILE_PATH, sep=';')
dataset = df.copy()

# Heading for the preview
print("Dataset Original")
print("==================================")

# Rich table preview (requires Jupyter/Colab's `display`)
display(dataset.head())


In [None]:
# Table of NaN counts for every column of the raw frame.
nan_count = df.isna().sum()
nan_count_df = nan_count.rename_axis('Kolom').reset_index(name='Jumlah NaN')
print("Jumlah data NaN per kolom:")
display(nan_count_df)

# Same table restricted to the columns that contain at least one NaN.
columns_with_nan = df.isna().any()
nan_only = df.loc[:, columns_with_nan].isna().sum()
nan_only_df = nan_only.rename_axis('Kolom').reset_index(name='Jumlah NaN')
print("Kolom yang memiliki nilai NaN:")
display(nan_only_df)

In [None]:
# Replace missing 'Sleep Disorder' labels with the 'Normal' category.
print("Step 1: Mengisi kolom yang NaN menjadi Kategori Normal")
print("==================================")
dataset = dataset.fillna({"Sleep Disorder": "Normal"})
display(dataset.head())


In [None]:
# Fit the encoder on the target column, then map the labels to integers.
label_encoder = LabelEncoder().fit(dataset["Sleep Disorder"])
target_encoded = label_encoder.transform(dataset["Sleep Disorder"])

In [None]:
# Original (string) target column, kept for the before/after comparison.
target = dataset["Sleep Disorder"]

print("\n--- Sebelum Encoding Target ---")
print("Tipe data target:", target.dtype)
print("Nilai unik target:", target.unique())

# `label_encoder` and `target_encoded` are created in the previous cell and
# are only inspected here.
print("\n--- Setelah Encoding Target ---")
print("Tipe data target (setelah encoding):", target_encoded.dtype)
print("Nilai unik target (setelah encoding):", pd.unique(target_encoded))

# Class name -> integer code, in encoder order.
class_names = list(label_encoder.classes_)
print("Mapping LabelEncoder:", class_names, "->", list(range(len(class_names))))

In [None]:
from IPython.display import display
import pandas as pd  # imported here so the cell also runs on its own

# Split 'Blood Pressure' ("systolic/diastolic") into two numeric columns.
if 'Blood Pressure' not in dataset.columns:
    # Nothing to split, e.g. because this cell has already been run once.
    print("'Blood Pressure' column not found in the dataset. It might have been processed already.")
    display(dataset.head())
else:
    dataset[['Systolic', 'Diastolic']] = dataset['Blood Pressure'].str.split('/', expand=True)
    for pressure_col in ('Systolic', 'Diastolic'):
        # Unparseable parts become NaN instead of raising.
        dataset[pressure_col] = pd.to_numeric(dataset[pressure_col], errors='coerce')
    # The original text column is no longer needed.
    dataset = dataset.drop(columns='Blood Pressure')

    # Show the result as a table
    print("Dataset setelah memisahkan 'Blood Pressure':")
    print("=============================================")
    display(dataset.head())


In [None]:
# Numeric columns that may hold comma decimals (e.g. "6,5") and must be
# converted before modelling.
kolom_numerik = ["Sleep Duration", "Heart Rate", "Daily Steps", "Systolic", "Diastolic"]

# The list above was defined but never applied in this workflow, so columns
# with comma decimals stayed non-numeric. Apply the same normalisation as the
# first workflow: swap decimal commas for dots in string values, then coerce
# to numeric (unparseable values become NaN and are imputed later).
for col in kolom_numerik:
    if col in dataset.columns:
        dataset[col] = pd.to_numeric(
            dataset[col].apply(lambda x: x.replace(',', '.') if isinstance(x, str) else x),
            errors='coerce',
        )


In [None]:
# Separate the model inputs (features) from the label column (target).
features = dataset.loc[:, [
    "Gender",
    "Age",
    "Occupation",
    "Sleep Duration",
    "Quality of Sleep",
    "Physical Activity Level",
    "Stress Level",
    "BMI Category",
    "Systolic",
    "Diastolic",
]]
target = dataset["Sleep Disorder"]

print("\nStep 2: Memisahkan fitur dan target")
print("==================================")
print("Features (head):")
display(features.head(7))
print("\nTarget (head):")
display(target.head(7))


In [None]:
# Column of the raw frame `df` to inspect; change this to check another one.
numerical_column = 'Physical Activity Level'

# The IQR rule only applies to numeric data, so bail out otherwise.
if not pd.api.types.is_numeric_dtype(df[numerical_column]):
    print(f"Kolom '{numerical_column}' tidak bersifat numerik. Tidak dapat menghitung outlier menggunakan metode IQR.")
else:
    column_values = df[numerical_column]
    Q1, Q3 = column_values.quantile([0.25, 0.75])
    IQR = Q3 - Q1

    # Fences at 1.5 * IQR beyond the quartiles.
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Rows whose value lies outside the fences.
    outside_fences = (column_values < lower_bound) | (column_values > upper_bound)
    outliers = df[outside_fences]

    print(f"Jumlah outlier pada kolom '{numerical_column}': {len(outliers)}")
    print(outliers)

In [None]:
# 80/20 train/test split with a fixed seed. Stratify on the encoded target so
# both splits keep the class proportions, matching the split used in the
# first workflow of this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target_encoded,
    test_size=0.2,
    random_state=42,
    stratify=target_encoded,
)

print("\n--- Data Latih dan Uji ---")
print("Jumlah data latih:", len(X_train))
print("Jumlah data uji:", len(X_test))

# Per-class counts (encoded labels) in each split.
print("Distribusi y_train:")
display(pd.Series(y_train).value_counts())

print("\nDistribusi y_test:")
display(pd.Series(y_test).value_counts())

In [None]:
# Hard-coded split of the feature columns into numeric and categorical groups.
numerical_features = [
    "Age",
    "Sleep Duration",
    "Quality of Sleep",
    "Physical Activity Level",
    "Stress Level",
    "Systolic",
    "Diastolic",
]
categorical_features = ["Gender", "Occupation", "BMI Category"]

# Numeric columns: mean imputation followed by standardisation.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: most-frequent imputation followed by one-hot encoding;
# categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

In [None]:
# Show the training features and the encoded training labels, each preceded
# by its caption.
display("Training Features (X_train):\n")
display(X_train)
display("\nTraining Target (y_train):\n")
display(y_train)