# K-Nearest Neighbors (KNN) Classification – Lecture Notebook

This notebook demonstrates how to use **K-Nearest Neighbors (KNN)** for a **binary classification**
task using the IBM Employee Attrition dataset.

We will walk through:
1. Preparing predictors and target variables  
2. Scaling features for distance-based learning  
3. Training KNN classifiers with different values of *k*  
4. Visualizing decision boundaries (2D projection)  
5. Evaluating model performance  

> **Key idea:**  
> KNN classifies observations based on *distance in predictor space*.  
> Feature scaling and encoding choices directly affect which points are considered “neighbors”.


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Notebook-wide plotting defaults: seaborn white-grid theme, wider default figure.
sns.set_theme(style="whitegrid")
plt.rcParams.update({"figure.figsize": (10, 6)})

In [None]:
# Read the attrition data and discard the columns judged to have no significant
# contribution to the analysis.
unused_columns = ["EmployeeCount", "EmployeeNumber", "StandardHours"]
df = pd.read_csv("ibm_attrition.csv").drop(columns=unused_columns)

print(df.shape)

df.head()

In [None]:
# Class balance of the target column.
attrition_counts = df["Attrition"].value_counts()
attrition_counts

In [None]:
# Encode the target ("Attrition") as integers: "No" -> 0, "Yes" -> 1.
label_map = {"No": 0, "Yes": 1}

# Series.map() returns NaN for any value that is not a key of label_map. Re-running
# this cell on an already-encoded column (values 0/1) would therefore silently wipe
# the target, so the mapping is applied only when the column is not yet encoded.
if not df["Attrition"].isin(list(label_map.values())).all():
    unexpected = set(df["Attrition"].dropna().unique()) - set(label_map)
    if unexpected or df["Attrition"].isna().any():
        # Fail loudly instead of letting unknown or missing labels become NaN targets.
        raise ValueError(
            "Attrition contains missing or unexpected labels: "
            f"{sorted(map(str, unexpected))}"
        )
    df["Attrition"] = df["Attrition"].map(label_map)


# Or use LabelEncoder() (needs `from sklearn.preprocessing import LabelEncoder`):
# target_encoder = LabelEncoder()
# df["Attrition"] = target_encoder.fit_transform(df["Attrition"])

# Sanity check to see if we still get the expected counts.
df["Attrition"].value_counts()

In [None]:
# Split the column names by dtype. This notebook drops the text (object) columns
# instead of encoding them, leaving only numeric predictors.
categorical_cols = list(df.select_dtypes(include="object").columns)
numerical_cols = list(df.select_dtypes(include=["int64", "float64"]).columns)

print("categorical columns that will be dropped:")
print(categorical_cols)
df = df.drop(columns=categorical_cols)

print(f"New df shape: {df.shape}")
df.head()

In [None]:
# Separate the target column from the predictors.
target_col = "Attrition"
X = df.drop(columns=[target_col])
y = df[target_col]

In [None]:
# Hold out 25% of the rows as a test set. The split is done BEFORE scaling so that no
# information from the test set leaks into the training process, and it is stratified
# on y so both splits keep (almost) the same class proportions.
split_options = {"test_size": 0.25, "random_state": 2026, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

In [None]:
# Scaling training and test set. Note that we ONLY use the training set for calculating
# the scaling parameters; the test set is transformed with those same parameters.

train_mean = X_train.mean(axis=0)
train_std = X_train.std(axis=0)  # pandas default is the sample std (ddof=1)

# A constant column has std 0 and would turn into NaN/inf after the division below.
assert (train_std > 0).all(), "zero-variance column(s) in X_train cannot be standardized"

X_train_scaled = (X_train - train_mean) / train_std
X_test_scaled = (X_test - train_mean) / train_std

# Alternatively you can use the StandardScaler() class. It is almost, but not exactly,
# the same: it divides by the population std (ddof=0) rather than the sample std
# (ddof=1), so every scaled value differs by the constant factor sqrt((n - 1) / n).
# By default it also returns NumPy arrays instead of DataFrames, so column selection
# by name would need the arrays wrapped back into DataFrames.
# scaler = StandardScaler()
# scaler.fit(X_train)
# X_train_scaled = scaler.transform(X_train)
# # You could also call .fit_transform() method once instead of two steps above.
# X_test_scaled = scaler.transform(X_test)


# We use np.isclose() and np.allclose() methods to account for computation precision.
# The actual values are very close to 0 but not exactly 0.
print(np.allclose(X_train_scaled.mean(axis=0), 0))
print(np.allclose(X_train_scaled.std(axis=0), 1))
print("\n")

# You can see that unlike training data, test data is not exactly standardized to mean of 0 and std of 1.
# This is expected as we used the training dataset for calculating standardization parameters.
print(np.allclose(X_test_scaled.mean(axis=0), 0))
print(np.allclose(X_test_scaled.std(axis=0), 1))
print("\n")

# The use of stratification during splitting results in train and test splits with almost
# equal distribution on target (y).
print(f"Average for y_train: {y_train.mean():.3f} and for y_test: {y_test.mean():.3f}")
print(np.isclose(y_train.mean(), y_test.mean(), atol=0.01))

In [None]:
# A correlation heatmap is a good way of spotting multicollinearity between features,
# which helps with feature selection.
# Only the training split is used here, to avoid leaking information from the test set.

# Reassemble the training set from the X and y pieces produced by the split.
train_df = pd.concat((X_train, y_train), axis="columns")

# Pearson correlation is location and scale invariant, so computing it on the
# non-scaled or on the scaled data gives the same result.
corr = train_df.corr().round(2)

# The heatmap shows feature-to-feature correlations only (target column excluded).
feature_corr = train_df.drop(columns="Attrition").corr().round(1)
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(feature_corr, annot=True, square=True, ax=ax)

In [None]:
# Collect the pairs of columns whose absolute correlation reaches the threshold.
multicol_threshold = 0.5

# Keep only the strict upper triangle so each pair appears once and the diagonal
# (every column with itself) is excluded.
upper_triangle = np.triu(np.ones(corr.shape, dtype=bool), k=1)
pairs = corr.where(upper_triangle).stack().reset_index()
pairs.columns = ["feat1", "feat2", "corr"]

is_collinear = pairs["corr"].abs() >= multicol_threshold
pairs = pairs.loc[is_collinear].sort_values(by="feat1")

pairs

In [None]:
# Keep the columns whose absolute correlation with the target (y) reaches the
# threshold. The "Attrition" row itself is part of the result.
high_cor_threshold = 0.1

target_corr = corr["Attrition"]
highly_corr = target_corr[target_corr.abs() >= high_cor_threshold]
highly_corr

We select the final features based on the previous steps. We want features that are not highly correlated with each other (to limit multicollinearity) but that are potentially related to the target.

In [None]:
# Features excluded from the final set.
# NOTE(review): presumably chosen from the collinear pairs table — confirm.
to_remove = ["TotalWorkingYears", "YearsInCurrentRole", "YearsAtCompany"]

# Features added on top of those selected by correlation with the target.
extras = [
    "JobSatisfaction",
    "PerformanceRating",
    "RelationshipSatisfaction",
    "WorkLifeBalance",
]

candidates = [f for f in highly_corr.index.to_list() if f not in to_remove]
candidates.extend(extras)

# An extra may already have passed the correlation threshold. Listing a column twice
# would select it twice from the scaled frames and double its weight in the KNN
# distance, so duplicates are dropped while keeping first-seen order.
retained_features = list(dict.fromkeys(candidates))
retained_features

In [None]:
# A pairplot shows how the two classes are distributed within each retained feature.
# "Attrition" is still part of retained_features here and is used as the hue.
pairplot_data = train_df[retained_features]
sns.pairplot(pairplot_data, hue="Attrition", height=2)

In [None]:
# Drop the target from the feature list before selecting model inputs.
# A filtered copy is used instead of list.remove(), which raises ValueError when this
# cell is run a second time because "Attrition" is already gone.
retained_features = [f for f in retained_features if f != "Attrition"]

# Same columns from both scaled splits, so train and test inputs line up.
X_train_retained = X_train_scaled[retained_features]
X_test_retained = X_test_scaled[retained_features]

In [None]:
# Fit one KNN model per candidate k and record accuracy on both splits.
k_values = [3, 5, 7, 11, 15, 25, 50]
train_accuracies, test_accuracies = [], []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train_retained, y_train)

    train_pred, test_pred = (
        knn.predict(X_train_retained),
        knn.predict(X_test_retained),
    )

    train_accuracies += [accuracy_score(y_train, train_pred)]
    test_accuracies += [accuracy_score(y_test, test_pred)]

    # The predict + accuracy_score pair can be collapsed into a single .score() call:
    # train_accuracies.append(knn.score(X_train_retained, y_train))
    # test_accuracies.append(knn.score(X_test_retained, y_test))

In [None]:
# Raw accuracy lists, in the same order as k_values.
(train_accuracies, test_accuracies)

In [None]:
# Accuracy on each split as a function of k.
accuracy_curves = (
    ("Train Accuracy", train_accuracies),
    ("Test Accuracy", test_accuracies),
)
for curve_label, accuracies in accuracy_curves:
    plt.plot(k_values, accuracies, marker="o", label=curve_label)

plt.xlabel("Number of Neighbors (K)")
plt.ylabel("Accuracy")
plt.title("KNN Accuracy vs K")
plt.legend()
plt.show()

In [None]:
def plot_decision_boundary(ax, X, y, k, feature_names, class_labels):
    """Fit a k-nearest-neighbours classifier on two features and draw its decision regions.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    X : array-like with two columns (a NumPy array or a DataFrame); column 0 goes on
        the x-axis and column 1 on the y-axis.
    y : class label for each row of X; also used to colour the scatter points.
    k : number of neighbours passed to KNeighborsClassifier.
    feature_names : two names used for the x- and y-axis labels.
    class_labels : legend labels, one per class in sorted class order. The legend is
        skipped when this is None or its length does not match the number of classes
        present in y.
    """
    # Positional slicing below needs an array; this also accepts DataFrames.
    X = np.asarray(X)

    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X, y)

    # Pad the plotted area by one unit around the data on every side.
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1

    xx, yy = np.meshgrid(
        np.linspace(x_min, x_max, 300),
        np.linspace(y_min, y_max, 300),
    )

    # Predict a class for every grid point, then restore the grid shape for contourf.
    Z = knn.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

    ax.contourf(xx, yy, Z, alpha=0.3, cmap="coolwarm")
    points = ax.scatter(
        X[:, 0],
        X[:, 1],
        c=y,
        cmap="coolwarm",
        edgecolor="k",
        s=40,
    )

    # One legend handle per distinct class value, so the colours are readable.
    handles, _ = points.legend_elements()
    if class_labels is not None and len(class_labels) == len(handles):
        ax.legend(handles, class_labels, loc="upper right")

    ax.set_xlabel(f"{feature_names[0]} (Scaled)")
    ax.set_ylabel(f"{feature_names[1]} (Scaled)")
    ax.set_title(f"KNN Decision Boundary (k={k})")

In [None]:
# A separate name is used for this list so the k_values / accuracy lists produced by
# the accuracy sweep stay in sync if that plot cell is re-run after this one.
boundary_k_values = [1, 3, 5, 7, 15, 25, 50]
selected_features = ["JobInvolvement", "Age"]

# Use the training split only (for both the scaler and the classifier), consistent
# with the rest of the notebook, so that no test rows take part in fitting.
X_2d = X_train[selected_features]

scaler_2d = StandardScaler()
X_2d_scaled = scaler_2d.fit_transform(X_2d)

# Two panels per row; an odd count leaves one spare axis, hidden below.
n_rows = (len(boundary_k_values) + 1) // 2
fig, axes = plt.subplots(n_rows, 2, figsize=(12, 4 * n_rows))
axes = axes.flatten()

for ax, k in zip(axes, boundary_k_values):
    plot_decision_boundary(
        ax=ax,
        X=X_2d_scaled,
        y=y_train,
        k=k,
        feature_names=selected_features,
        class_labels=["No Attrition", "Attrition"],
    )

# Hide unused axes
for ax in axes[len(boundary_k_values):]:
    ax.axis("off")

plt.tight_layout()
plt.show()

In [None]:
# Refit KNN with k=7 on the retained features and predict the test split.
final_knn = KNeighborsClassifier(n_neighbors=7).fit(X_train_retained, y_train)
y_pred = final_knn.predict(X_test_retained)

cm = confusion_matrix(y_test, y_pred)

# Same tick labels on both axes; the axis titles below name rows as actual and
# columns as predicted.
class_names = ["No Attrition", "Attrition"]
heatmap_options = {
    "annot": True,
    "fmt": "d",
    "cmap": "Blues",
    "xticklabels": class_names,
    "yticklabels": class_names,
}
sns.heatmap(cm, **heatmap_options)

plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Per-class precision, recall and F1 for the final model on the test split.
report_text = classification_report(
    y_test,
    y_pred,
    target_names=["No Attrition", "Attrition"],
)
print(report_text)

<!-- train_df = pd.concat() -->
