# Missing Data Handling in Python
This notebook demonstrates several techniques for handling missing data in Python, with worked examples: identifying missing values, dropping them, filling them, and imputing them with scikit-learn.

In [14]:
!pip install pandas
!pip install matplotlib.pyplot
import matplotlib.pyplot as plt

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 25.1 -> 25.1.1
[notice] To update, run: C:\Users\panna\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


Defaulting to user installation because normal site-packages is not writeable


ERROR: Could not find a version that satisfies the requirement matplotlib.pyplot (from versions: none)

[notice] A new release of pip is available: 25.1 -> 25.1.1
[notice] To update, run: C:\Users\panna\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip
ERROR: No matching distribution found for matplotlib.pyplot


ModuleNotFoundError: No module named 'matplotlib'

In [9]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Display configuration.
# Show every column when a DataFrame is rendered (None removes the column cap).
pd.set_option('display.max_columns', None)
# `sns.set` is only a retained alias; `set_theme` is the documented interface.
sns.set_theme(style="whitegrid")


ModuleNotFoundError: No module named 'matplotlib'

## Creating a Sample Dataset

In [10]:

# Small five-row table with deliberate gaps: NaN in the numeric columns
# (Age, Income), NaN in Gender, and a Python None in Department.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Edward'],
    Age=[25, np.nan, 30, 22, np.nan],
    Gender=['F', 'M', np.nan, 'M', 'M'],
    Income=[50000, 60000, np.nan, np.nan, 70000],
    Department=['HR', 'Finance', 'HR', 'Finance', None],
)

# Each dict key becomes a column; the row index is the default 0..4.
df = pd.DataFrame.from_dict(data)
df


Unnamed: 0,Name,Age,Gender,Income,Department
0,Alice,25.0,F,50000.0,HR
1,Bob,,M,60000.0,Finance
2,Charlie,30.0,,,HR
3,David,22.0,M,,Finance
4,Edward,,M,70000.0,


## Identifying Missing Data

In [11]:

# Boolean mask with the same shape as df: True wherever a value is missing.
df.isna()


Unnamed: 0,Name,Age,Gender,Income,Department
0,False,False,False,False,False
1,False,True,False,False,False
2,False,False,True,True,False
3,False,False,False,True,False
4,False,True,False,False,True


In [12]:

# Number of missing entries in each column.
df.isna().sum(axis=0)


Name          0
Age           2
Gender        1
Income        2
Department    1
dtype: int64

In [13]:

# Visualise the missing-value mask: one cell per (row, column) of df,
# coloured by whether that entry is missing.
fig, ax = plt.subplots()
sns.heatmap(df.isnull(), cbar=False, cmap='viridis', ax=ax)
ax.set_title("Missing Data Heatmap")
plt.show()


NameError: name 'sns' is not defined

## Handling Missing Data by Dropping

In [None]:

# Keep only the rows that are complete, i.e. discard a row as soon as
# at least one of its values is missing.
df_drop_any = df.dropna(axis=0, how='any')
df_drop_any


In [None]:

# Discard a row only when every one of its values is missing;
# partially filled rows are kept.
df_drop_all = df.dropna(axis=0, how='all')
df_drop_all


In [None]:

# Work column-wise instead: discard every column that contains
# at least one missing value.
df_drop_cols = df.dropna(axis='columns', how='any')
df_drop_cols


## Handling Missing Data by Filling

In [None]:

# Per-column replacement values: a constant 0 for Age and the mean of the
# observed incomes for Income. Columns not listed here are left untouched.
fill_values = {
    'Age': 0,
    'Income': df['Income'].mean(),
}
df_fill_value = df.fillna(value=fill_values)
df_fill_value


In [None]:

# Forward fill: propagate the last observed value down each column.
# A gap in the first row has no earlier value to copy and stays missing.
# `DataFrame.ffill()` replaces `fillna(method='ffill')`, whose `method`
# argument is deprecated in pandas 2.1+.
df_ffill = df.ffill()
df_ffill


In [None]:

# Backward fill: pull the next observed value up each column.
# A gap in the last row has no later value to copy and stays missing.
# `DataFrame.bfill()` replaces `fillna(method='bfill')`, whose `method`
# argument is deprecated in pandas 2.1+.
df_bfill = df.bfill()
df_bfill


## Imputation using Scikit-learn

In [None]:

from sklearn.impute import SimpleImputer

# Impute into a copy so that `df` keeps its missing values. Writing the
# result back into `df` would make this cell non-idempotent and would change
# what the earlier cells display when they are re-run.
df_imputed = df.copy()

num_cols = ['Age', 'Income']
cat_cols = ['Gender', 'Department']

# Numerical columns: replace each missing value with the column mean.
num_imputer = SimpleImputer(strategy='mean')
df_imputed[num_cols] = num_imputer.fit_transform(df_imputed[num_cols])

# Categorical columns: replace each missing value with the most frequent one.
# SimpleImputer looks for np.nan by default, and a Python None stored in an
# object column may not be recognised as that marker, so every missing entry
# is normalised to np.nan before fitting.
cat_imputer = SimpleImputer(strategy='most_frequent')
cat_values = df_imputed[cat_cols].astype(object).where(df_imputed[cat_cols].notna(), np.nan)
df_imputed[cat_cols] = cat_imputer.fit_transform(cat_values)

df_imputed


## Advanced Imputation: KNN

In [None]:

from sklearn.impute import KNNImputer

# Stand-alone numeric frame that still contains gaps, built from the same
# Age / Income values as the sample dataset.
df_knn = pd.DataFrame(
    {
        'Age': [25, np.nan, 30, 22, np.nan],
        'Income': [50000, 60000, np.nan, np.nan, 70000],
    }
)

# Impute with a 2-neighbour KNN imputer. fit_transform returns a plain array,
# so the column labels are re-attached when rebuilding the DataFrame.
knn_imputer = KNNImputer(n_neighbors=2)
knn_values = knn_imputer.fit_transform(df_knn)
df_knn_imputed = pd.DataFrame(knn_values, columns=df_knn.columns)
df_knn_imputed
