### 02_Preprocessing: Missing-Value Handling & Dataset Splits

In this notebook we take the raw Pima dataset and produce three versions for modeling:
- **dfA**: drop any row with a physiologically impossible zero in `Glucose`, `BloodPressure`, `SkinThickness`, `Insulin`, or `BMI`
- **dfB_filled**: replace those zeros with NaN, then **median**-impute
- **dfC_filled**: replace those zeros with NaN, then **KNN**-impute


In [1]:
# 1) Imports
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer

In [15]:
# Read the raw diabetes CSV into `df`, then preview its first five rows
df = pd.read_csv(filepath_or_buffer='diabetes.csv')
display(df.head(n=5))

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [10]:
# Count, per column, how many entries are 0 (the dataset's stand-in for "missing")
missing_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
print("Zeros Per Column:")
print(df[missing_cols].eq(0).sum())

Zeros Per Column:
Glucose            5
BloodPressure     35
SkinThickness    227
Insulin          374
BMI               11
dtype: int64


In [17]:
# Columns in which a recorded 0 is treated as a missing value
missing_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Full feature list: the missing_cols sit between 'Pregnancies' and the
# last two features, in the same order as above
feature_cols = ['Pregnancies', *missing_cols, 'DiabetesPedigreeFunction', 'Age']

# Name of the label column
target_col = 'Outcome'

### Creating the dataframe for Trial A (dropping every row that has a zero, i.e. a missing value, in any of the key columns)

In [22]:
# Trial A: retain only the rows in which every missing_cols entry is non-zero
dfA_mask = (df[missing_cols] != 0).all(axis=1)
dfA = (
    df.loc[dfA_mask, [*feature_cols, target_col]]
    .reset_index(drop=True)
)

# Tally kept vs. dropped rows relative to the full df
n_keep = dfA_mask.sum()
n_dropped = len(dfA_mask) - n_keep

print(f"Trial A Dropped Rows: {n_dropped}")
print(f"Trial A Kept Rows: {n_keep}")
print(f"dfA (drop zeros) shape: {dfA.shape}")

Trial A Dropped Rows: 376
Trial A Kept Rows: 392
dfA (drop zeros) shape: (392, 9)


### Creating dataframe for Trial B (Median imputation on zeros).

In [25]:
# Trial B: Median imputation on zeros -> NaN -> median fill
# Work on a copy so the raw `df` stays untouched and this cell can be re-run.
dfB_filled = df.copy()
# Recode 0 as NaN in missing_cols so the imputer treats those entries as missing.
dfB_filled[missing_cols] = dfB_filled[missing_cols].replace(0, np.nan)

# Set up Imputer
med_imp = SimpleImputer(strategy='median')

# Only impute feature columns, keep Outcome unchanged.
# The imputer must be fed dfB_filled (the frame with zeros recoded as NaN);
# no variable named `dfB` is defined in this notebook, so referencing it
# fails with NameError on a fresh kernel.
# NOTE(review): the imputer is fit on all rows; if a train/test split is made
# downstream, consider fitting on the training portion only - confirm.
dfB_filled[feature_cols] = med_imp.fit_transform(dfB_filled[feature_cols])
print(f"dfB_filled (median impute) shape: {dfB_filled.shape}")

dfB_filled (median impute) shape: (768, 9)


### Creating dataframe for Trial C (KNN Imputation on zeros). 

In [26]:
# Trial C: KNN Imputation on zeros -> NaN -> KNN fill
# Work on a copy so the raw `df` stays untouched and this cell can be re-run.
dfC_filled = df.copy()
# Recode 0 as NaN in missing_cols so the imputer treats those entries as missing.
dfC_filled[missing_cols] = dfC_filled[missing_cols].replace(0, np.nan)

# Set up imputer (each missing entry is filled from its 5 nearest neighbours)
knn_imp = KNNImputer(n_neighbors=5)

# Only impute feature columns, keep Outcome unchanged.
# The imputer must be fed dfC_filled (the frame with zeros recoded as NaN);
# no variable named `dfC` is defined in this notebook, so referencing it
# fails with NameError on a fresh kernel.
# NOTE(review): the imputer is fit on all rows; if a train/test split is made
# downstream, consider fitting on the training portion only - confirm.
dfC_filled[feature_cols] = knn_imp.fit_transform(dfC_filled[feature_cols])
print(f"dfC_filled (KNN impute) shape: {dfC_filled.shape}")

dfC_filled (KNN impute) shape: (768, 9)


In [27]:
# Sanity check: each trial dataframe should contain no zeros in missing_cols
# and no NaNs in feature_cols
for name, d in zip(('A', 'B', 'C'), (dfA, dfB_filled, dfC_filled)):
    zeros = d[missing_cols].eq(0).to_numpy().sum()
    nans = d[feature_cols].isna().to_numpy().sum()
    print(f"Trial {name}: zeros remaining={zeros}, NaNs remaining={nans}")

Trial A: zeros remaining=0, NaNs remaining=0
Trial B: zeros remaining=0, NaNs remaining=0
Trial C: zeros remaining=0, NaNs remaining=0
