In [1]:
import pandas as pd
import numpy as np

In [2]:
# Toy dataset: two numeric columns, each containing missing entries (np.nan).
ages = [22, 25, np.nan, 30, 28, np.nan, 24]
salaries = [40000, 50000, 45000, np.nan, 55000, 60000, 48000]

data = {'Age': ages, 'Salary': salaries}
df = pd.DataFrame(data)
df

Unnamed: 0,Age,Salary
0,22.0,40000.0
1,25.0,50000.0
2,,45000.0
3,30.0,
4,28.0,55000.0
5,,60000.0
6,24.0,48000.0


In [3]:
# Count the missing values in each column (isna is the same check as isnull).
df.isna().sum()

Age       2
Salary    1
dtype: int64

# Using Mean

In [4]:
# Replace the NaNs in each numeric column with that column's own mean.
for column in ('Age', 'Salary'):
    column_mean = df[column].mean()  # mean() skips NaN by default
    df[column] = df[column].fillna(column_mean)

In [5]:
# Show the frame after the mean fill: the cells that were NaN now hold the column means.
df

Unnamed: 0,Age,Salary
0,22.0,40000.0
1,25.0,50000.0
2,25.8,45000.0
3,30.0,49666.666667
4,28.0,55000.0
5,25.8,60000.0
6,24.0,48000.0


# Using Median

In [6]:
# Same layout as the mean example, except the last age is an extreme value.
data = dict(
    Age=[22, 25, np.nan, 30, 28, np.nan, 100],  # Notice the outlier '100'
    Salary=[40000, 50000, 45000, np.nan, 55000, 60000, 48000],
)
df = pd.DataFrame(data)

In [7]:
# Show the new frame: Age now contains the outlier 100 alongside its two NaNs.
df

Unnamed: 0,Age,Salary
0,22.0,40000.0
1,25.0,50000.0
2,,45000.0
3,30.0,
4,28.0,55000.0
5,,60000.0
6,100.0,48000.0


In [8]:
# The median is used here because a single extreme value (Age == 100)
# would pull the mean upward, while the median stays at a typical value.
age_median = df['Age'].median()
salary_median = df['Salary'].median()

df['Age'] = df['Age'].fillna(age_median)
df['Salary'] = df['Salary'].fillna(salary_median)

In [9]:
# Show the frame after the median fill: the cells that were NaN now hold the column medians.
df

Unnamed: 0,Age,Salary
0,22.0,40000.0
1,25.0,50000.0
2,28.0,45000.0
3,30.0,49000.0
4,28.0,55000.0
5,28.0,60000.0
6,100.0,48000.0


# Using Mode

In [10]:
# Categorical example: both columns hold strings and contain missing entries.
genders = ['Male', 'Female', np.nan, 'Female', 'Male', 'Female', np.nan]
departments = ['HR', 'IT', 'IT', np.nan, 'HR', 'IT', 'HR']

data = {'Gender': genders, 'Department': departments}
df = pd.DataFrame(data)
df

Unnamed: 0,Gender,Department
0,Male,HR
1,Female,IT
2,,IT
3,Female,
4,Male,HR
5,Female,IT
6,,HR


In [11]:
# Most frequent value(s) of every column. A column with tied modes produces
# several rows; columns with fewer modes are padded with NaN.
df.mode()

Unnamed: 0,Gender,Department
0,Female,HR
1,,IT


In [12]:
# mode() on a single column returns a Series (there may be ties), not a scalar.
df['Gender'].mode()

0    Female
Name: Gender, dtype: object

In [13]:
# [0] takes the first mode as a plain scalar, which is what fillna() needs.
df['Gender'].mode()[0]

'Female'

In [14]:
# Replace the NaNs in each categorical column with that column's most frequent value.
# 'Department' has tied modes here (HR and IT each appear 3 times);
# [0] takes the first one that mode() returns.
for column in ('Gender', 'Department'):
    most_frequent = df[column].mode()[0]
    df[column] = df[column].fillna(most_frequent)

In [15]:
# Show the frame after the mode fill: no NaN cells remain in either column.
df

Unnamed: 0,Gender,Department
0,Male,HR
1,Female,IT
2,Female,IT
3,Female,HR
4,Male,HR
5,Female,IT
6,Female,HR


# Removing Null Values

In [16]:
# Small frame for the dropna() examples: 'Name' is complete,
# while 'Age' and 'Score' each contain NaN.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    Age=[25, np.nan, 30, np.nan, 22],
    Score=[85, 90, np.nan, 88, 92],
)
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,Score
0,Alice,25.0,85.0
1,Bob,,90.0
2,Charlie,30.0,
3,David,,88.0
4,Eve,22.0,92.0


In [17]:
# Drop every row that contains at least one NaN. This returns a new frame;
# df itself is not modified because the result is not assigned back.
df.dropna()

Unnamed: 0,Name,Age,Score
0,Alice,25.0,85.0
4,Eve,22.0,92.0


In [26]:
# Drop every column that contains at least one NaN (axis=1 selects columns).
# The result is stored under its own name instead of overwriting `df`:
# the following cells still need the 'Age' and 'Score' columns, and
# reassigning `df` here would make `dropna(subset=['Age'])` raise a KeyError
# when the notebook is run top to bottom.
df_without_nan_columns = df.dropna(axis=1)
df_without_nan_columns

In [20]:
# thresh is the minimum number of non-NaN values a row must have to be kept.
df.dropna(thresh=2)  # Keeps rows with at least 2 non-NaN values

Unnamed: 0,Name,Age,Score
0,Alice,25.0,85.0
1,Bob,,90.0
2,Charlie,30.0,
3,David,,88.0
4,Eve,22.0,92.0


In [21]:
# Drop only the rows whose 'Age' is NaN; NaNs in other columns are ignored.
df.dropna(subset=['Age'])

Unnamed: 0,Name,Age,Score
0,Alice,25.0,85.0
2,Charlie,30.0,
4,Eve,22.0,92.0


# Using KNN Imputer (with sklearn)


In [22]:
# Sample data for the KNN example: both numeric columns contain NaN.
ages = [25, 27, np.nan, 35, 29, np.nan, 40]
salaries = [50000, 54000, 58000, np.nan, 62000, 60000, np.nan]

data = {'Age': ages, 'Salary': salaries}
df = pd.DataFrame(data)
df

Unnamed: 0,Age,Salary
0,25.0,50000.0
1,27.0,54000.0
2,,58000.0
3,35.0,
4,29.0,62000.0
5,,60000.0
6,40.0,


In [23]:
from sklearn.impute import KNNImputer

In [24]:
# Fill each missing value using the 2 nearest neighbouring rows.
imputer = KNNImputer(n_neighbors=2)

# fit_transform returns a plain NumPy array, so the column labels are
# re-attached when wrapping the result in a DataFrame.
imputed_values = imputer.fit_transform(df)
df_imputed = pd.DataFrame(imputed_values, columns=df.columns)

df_imputed.round(2)

Unnamed: 0,Age,Salary
0,25.0,50000.0
1,27.0,54000.0
2,28.0,58000.0
3,35.0,58000.0
4,29.0,62000.0
5,28.0,60000.0
6,40.0,58000.0


# MICE  -  Multiple Imputation by Chained Equations
Also called Multivariate Imputation.

It fills missing values iteratively: each feature with missing entries is predicted from the other features, and this cycle is repeated several times so that the imputed values are refined on every pass.

✅ Pros of MICE:
- Learns from relationships between features
- Good for multivariate datasets
- Works better than mean/median when features are correlated

⚠️ Cons:
- Slower than KNN/mean/median
- Can be tricky with lots of missing data
- Doesn’t handle categorical data directly (you'll need encoding)

In [25]:

from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge

# Sample DataFrame: three numeric features, each missing exactly one value.
data = {
    'Age': [25, np.nan, 30, 35],
    'Salary': [50000, 54000, np.nan, 62000],
    'Experience': [2, 3, 5, np.nan]
}
df = pd.DataFrame(data)

# sep="\n" puts the frame on its own line. Passing the frame after a label
# that ends in a newline makes print() insert its default separator (a space)
# in front of the header row, shifting it out of line with the data rows.
print("Original DataFrame:", df, sep="\n")

# MICE Imputer: BayesianRidge is the regressor used to predict missing values,
# max_iter caps the number of imputation rounds, and random_state makes the
# result reproducible.
imputer = IterativeImputer(estimator=BayesianRidge(), max_iter=10, random_state=0)
# fit_transform returns a NumPy array; re-attach the original column labels.
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

print("\nAfter MICE Imputation:", df_imputed.round(2), sep="\n")


Original DataFrame:
     Age   Salary  Experience
0  25.0  50000.0         2.0
1   NaN  54000.0         3.0
2  30.0      NaN         5.0
3  35.0  62000.0         NaN

After MICE Imputation:
      Age    Salary  Experience
0  25.00  50000.00        2.00
1  28.32  54000.00        3.00
2  30.00  55994.05        5.00
3  35.00  62000.00        7.33
