In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.datasets import load_iris




In [2]:
# --- When to Use KNN Imputation ---
# KNN Imputation is a method to fill in missing values in a dataset.
# It works by finding the 'k' most similar data points (neighbors) in the dataset
# for a row with a missing value and then using the values from those neighbors
# to impute the missing one.
#
# When to use it:
# - It's effective when the data is missing at random (MAR) or missing completely at random (MCAR).
# - It can be more accurate than simple mean/median/mode imputation, especially if the
#   features are correlated.
#
# Requirements:
# - The data must be numeric. KNN calculates distances between data points, which requires
#   numerical features. If you have categorical data, you must encode it first.
# - It can be computationally expensive on large datasets because it needs to calculate
#   the distance between all data points.



In [3]:
# 1. Load a sample dataset from scikit-learn
# We'll use the Iris dataset for this demonstration.
iris = load_iris()
# Convert to a pandas DataFrame for easier manipulation.
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
print("Original DataFrame head:")
print(df.head())
print("\n")



Original DataFrame head:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2




In [4]:
# 2. Artificially introduce missing values for demonstration
# We will randomly set some values in 'sepal width (cm)' and 'petal length (cm)' to NaN.
df_missing = df.copy()
missing_indices_sepal = np.random.choice(df.index, size=20, replace=False)
missing_indices_petal = np.random.choice(df.index, size=20, replace=False)
df_missing.loc[missing_indices_sepal, 'sepal width (cm)'] = np.nan
df_missing.loc[missing_indices_petal, 'petal length (cm)'] = np.nan

print("DataFrame with missing values (head):")
print(df_missing.head())
print("\nNumber of missing values per column:")
print(df_missing.isnull().sum())
print("\n")




DataFrame with missing values (head):
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                NaN               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2

Number of missing values per column:
sepal length (cm)     0
sepal width (cm)     20
petal length (cm)    20
petal width (cm)      0
dtype: int64




In [8]:
# 3. Hyperparameter Tuning (Improved)
# The most important hyperparameter for KNNImputer is `n_neighbors` (k).
# We will test several 'k' values on a subset of our missing data to see which
# 'k' produces imputed values closest to the original, true values.

# Combine all indices where we introduced NaNs
all_missing_indices = np.union1d(missing_indices_sepal, missing_indices_petal)

k_values = [1, 3, 5, 7, 9, 11, 15]
tuning_results = []

# Iterate over each k value to test
for k in k_values:
    imputer = KNNImputer(n_neighbors=k)
    df_imputed_temp = pd.DataFrame(imputer.fit_transform(df_missing), columns=df.columns)
    
    # For each missing index, find the imputed value and compare it to the original
    for idx in all_missing_indices:
        # Check if the sepal width was missing for this index
        if idx in missing_indices_sepal:
            original_val = df.loc[idx, 'sepal width (cm)']
            imputed_val = df_imputed_temp.loc[idx, 'sepal width (cm)']
            error = abs(original_val - imputed_val)
            tuning_results.append({
                'k': k,
                'index': idx,
                'feature': 'sepal width (cm)',
                'original_value': original_val,
                'imputed_value': imputed_val,
                'error': error
            })
        # Check if the petal length was missing for this index
        if idx in missing_indices_petal:
            original_val = df.loc[idx, 'petal length (cm)']
            imputed_val = df_imputed_temp.loc[idx, 'petal length (cm)']
            error = abs(original_val - imputed_val)
            tuning_results.append({
                'k': k,
                'index': idx,
                'feature': 'petal length (cm)',
                'original_value': original_val,
                'imputed_value': imputed_val,
                'error': error
            })

# Display the detailed results of our manual grid search
tuning_df = pd.DataFrame(tuning_results)
print("--- Hyperparameter Tuning Results (Detailed) ---")
print(tuning_df.head(10))
print("\n")

# Calculate the Mean Absolute Error (MAE) for each k to find the best one
mae_by_k = tuning_df.groupby('k')['error'].mean().reset_index()
print("--- Mean Absolute Error by k ---")
print(mae_by_k)
print("\n")

# Choose the k with the lowest mean absolute error
best_k = mae_by_k.sort_values('error').iloc[0]['k'].astype('int')
print(f"Chosen best k based on lowest MAE: {best_k}\n")




--- Hyperparameter Tuning Results (Detailed) ---
   k  index            feature  original_value  imputed_value  error
0  1      1  petal length (cm)             1.4            1.5    0.1
1  1     16  petal length (cm)             1.3            1.7    0.4
2  1     32  petal length (cm)             1.5            1.4    0.1
3  1     38   sepal width (cm)             3.0            3.2    0.2
4  1     40   sepal width (cm)             3.5            3.4    0.1
5  1     40  petal length (cm)             1.3            1.6    0.3
6  1     41   sepal width (cm)             2.3            3.4    1.1
7  1     42  petal length (cm)             1.3            1.3    0.0
8  1     47   sepal width (cm)             3.2            3.1    0.1
9  1     47  petal length (cm)             1.4            1.5    0.1


--- Mean Absolute Error by k ---
    k     error
0   1  0.282500
1   3  0.214167
2   5  0.212000
3   7  0.216429
4   9  0.213611
5  11  0.221591
6  15  0.218333


Chosen best k based on lowe

In [9]:
# 4. Apply the K-Nearest Neighbors (KNN) Imputation with the Best Hyperparameter
# Initialize the KNNImputer with our chosen number of neighbors.
imputer = KNNImputer(n_neighbors=best_k)

# Apply the imputer to the dataset.
df_imputed_array = imputer.fit_transform(df_missing)

# Convert the resulting NumPy array back to a pandas DataFrame.
df_imputed = pd.DataFrame(data=df_imputed_array, columns=df.columns)




In [10]:
# 5. Show the result of applying the technique
# We will print the head of the DataFrame before and after imputation to see the changes.
print(f"--- Results using k={best_k} ---")
print("\nDataFrame with missing values (head):")
print(df_missing.head())
print("\nDataFrame after KNN Imputation (head):")
print(df_imputed.head())

# Verify that there are no more missing values
print("\nNumber of missing values after imputation:")
print(df_imputed.isnull().sum())
print("\n")



--- Results using k=5 ---

DataFrame with missing values (head):
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                NaN               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2

DataFrame after KNN Imputation (head):
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5               1.40               0.2
1                4.9               3.0               1.52               0.2
2                4.7               3.2               1.30               0.2
3                4.6               3.1               1.50               0.2
4                5.0               3.6               1.40  

In [11]:
# 6. Compare Original vs. Imputed Values
# Create a DataFrame to compare the original values with the imputed ones.
comparison_data = []

# For sepal width
for idx in missing_indices_sepal:
    original_val = df.loc[idx, 'sepal width (cm)']
    imputed_val = df_imputed.loc[idx, 'sepal width (cm)']
    comparison_data.append({
        'index': idx,
        'feature': 'sepal width (cm)',
        'original_value': original_val,
        'imputed_value': imputed_val
    })

# For petal length
for idx in missing_indices_petal:
    original_val = df.loc[idx, 'petal length (cm)']
    imputed_val = df_imputed.loc[idx, 'petal length (cm)']
    comparison_data.append({
        'index': idx,
        'feature': 'petal length (cm)',
        'original_value': original_val,
        'imputed_value': imputed_val
    })

comparison_df = pd.DataFrame(comparison_data)
print("--- Final Comparison of Original vs. Imputed Values ---")
print(comparison_df.head(10))

--- Final Comparison of Original vs. Imputed Values ---
   index           feature  original_value  imputed_value
0     63  sepal width (cm)             2.9           2.70
1     70  sepal width (cm)             3.2           2.74
2    113  sepal width (cm)             2.5           2.86
3    145  sepal width (cm)             3.0           3.12
4    101  sepal width (cm)             2.7           2.84
5     40  sepal width (cm)             3.5           3.50
6    143  sepal width (cm)             3.2           3.18
7     65  sepal width (cm)             3.1           2.98
8     90  sepal width (cm)             2.6           2.86
9     38  sepal width (cm)             3.0           3.12
