<a href="https://colab.research.google.com/github/asheta66/Data-Mining-2/blob/main/W3Lab1_Data_Cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
# Univariate and Multivariate Imputation Techniques in Machine Learning

## Introduction
Imputing missing values is a crucial step in machine learning as many models do not support missing values. Removing missing data can introduce bias, so various imputation techniques are used.

## Creating a Sample DataFrame
Let's create a DataFrame with missing values to apply the imputation methods.
"""

import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Creating a sample dataset with missing values
data = {
    'Age': [25, np.nan, 35, 40, np.nan, 50, 29, np.nan],
    'Salary': [50000, 54000, np.nan, 70000, 72000, np.nan, 48000, 51000],
    'Experience': [2, 5, np.nan, 10, 12, 7, np.nan, 3]
}
df = pd.DataFrame(data)
print("Original DataFrame with Missing Values:")
df


Original DataFrame with Missing Values:


Unnamed: 0,Age,Salary,Experience
0,25.0,50000.0,2.0
1,,54000.0,5.0
2,35.0,,
3,40.0,70000.0,10.0
4,,72000.0,12.0
5,50.0,,7.0
6,29.0,48000.0,
7,,51000.0,3.0


In [None]:
"""
## Univariate Imputation
Univariate imputation considers only a single column to fill in missing values.
"""

"""
### Mean Imputation
Replaces missing values with the mean of the column.
"""
df.loc[:, 'Age'] = df['Age'].fillna(df['Age'].mean())
print("Mean Imputed Data:\n")
df

Mean Imputed Data:



Unnamed: 0,Age,Salary,Experience
0,25.0,50000.0,2.0
1,35.8,54000.0,5.0
2,35.0,,
3,40.0,70000.0,10.0
4,35.8,72000.0,12.0
5,50.0,,7.0
6,29.0,48000.0,
7,35.8,51000.0,3.0


In [None]:
"""
### Median Imputation
Replaces missing values with the median of the column.
"""
df.loc[:, 'Salary'] = df['Salary'].fillna(df['Salary'].median())
print("Median Imputed Data:\n")
df

Median Imputed Data:



Unnamed: 0,Age,Salary,Experience
0,25.0,50000.0,2.0
1,35.8,54000.0,5.0
2,35.0,52500.0,
3,40.0,70000.0,10.0
4,35.8,72000.0,12.0
5,50.0,52500.0,7.0
6,29.0,48000.0,
7,35.8,51000.0,3.0


In [None]:
"""
### Mode Imputation
Replaces missing values with the most frequent value.
"""
df.loc[:, 'Experience'] = df['Experience'].fillna(df['Experience'].mode()[0])
print("Mode Imputed Data:\n")
df

Mode Imputed Data:



Unnamed: 0,Age,Salary,Experience
0,25.0,50000.0,2.0
1,35.8,54000.0,5.0
2,35.0,52500.0,2.0
3,40.0,70000.0,10.0
4,35.8,72000.0,12.0
5,50.0,52500.0,7.0
6,29.0,48000.0,2.0
7,35.8,51000.0,3.0


In [None]:
"""
### Forward and Backward Fill
Uses previous or next known values to fill missing data.
"""
df.loc[:, 'Age'] = df['Age'].ffill()  # Forward Fill
df.loc[:, 'Salary'] = df['Salary'].bfill()  # Backward Fill
print("Forward/Backward Filled Data:\n")
df


Forward/Backward Filled Data:



Unnamed: 0,Age,Salary,Experience
0,25.0,50000.0,2.0
1,35.8,54000.0,5.0
2,35.0,52500.0,2.0
3,40.0,70000.0,10.0
4,35.8,72000.0,12.0
5,50.0,52500.0,7.0
6,29.0,48000.0,2.0
7,35.8,51000.0,3.0


In [None]:
"""
### Moving Average Imputation
Replaces missing values using the rolling window mean.
"""
df.loc[:, 'Experience'] = df['Experience'].fillna(df['Experience'].rolling(window=3, min_periods=1).mean())
print("Moving Average Imputed Data:\n")
df

Moving Average Imputed Data:



Unnamed: 0,Age,Salary,Experience
0,25.0,50000.0,2.0
1,35.8,54000.0,5.0
2,35.0,52500.0,2.0
3,40.0,70000.0,10.0
4,35.8,72000.0,12.0
5,50.0,52500.0,7.0
6,29.0,48000.0,2.0
7,35.8,51000.0,3.0


In [None]:
"""
## Multivariate Imputation
Multivariate imputation considers other features to predict missing values.

### KNN Imputer
Uses the average of k-nearest neighbors to fill missing values.
"""
knn_imputer = KNNImputer(n_neighbors=3)
df_knn = pd.DataFrame(knn_imputer.fit_transform(df), columns=df.columns)
print("KNN Imputed Data:\n")
df_knn

KNN Imputed Data:



Unnamed: 0,Age,Salary,Experience
0,25.0,50000.0,2.0
1,35.8,54000.0,5.0
2,35.0,52500.0,2.0
3,40.0,70000.0,10.0
4,35.8,72000.0,12.0
5,50.0,52500.0,7.0
6,29.0,48000.0,2.0
7,35.8,51000.0,3.0


In [None]:
"""
### Iterative Imputer
Predicts missing values using regression models.
"""
imputer = IterativeImputer()
df_iterative = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
print("Iterative Imputed Data:\n")
df_iterative

Iterative Imputed Data:



Unnamed: 0,Age,Salary,Experience
0,25.0,50000.0,2.0
1,35.8,54000.0,5.0
2,35.0,52500.0,2.0
3,40.0,70000.0,10.0
4,35.8,72000.0,12.0
5,50.0,52500.0,7.0
6,29.0,48000.0,2.0
7,35.8,51000.0,3.0
