# Pandas Data Cleaning – Complete Step-by-Step Notebook

This notebook demonstrates **end-to-end data cleaning using Pandas** for Data Analysis.

---

In [1]:
import pandas as pd
import numpy as np

## 1. Load Sample Dataset

In [2]:

# Small in-memory sample of orders. It deliberately contains the problems the
# later sections deal with: missing entries (None), a date string that cannot
# be parsed ("invalid"), and a final row that repeats the one before it.
data = dict(
    Order_ID=[1001, 1002, 1003, 1004, 1005, 1005],
    Customer=["Alice", "Bob", "Charlie", None, "Eve", "Eve"],
    Category=[
        "Electronics", "Electronics",
        "Furniture", "Furniture",
        "Electronics", "Electronics",
    ],
    Price=[800, 500, None, 250, 800, 800],
    Quantity=[2, 1, 4, 1, None, None],
    Order_Date=[
        "2024-01-01", "2024-01-02", "2024-01-03",
        "invalid", "2024-01-05", "2024-01-05",
    ],
)

# One column per key, one row per list position.
df = pd.DataFrame(data=data)
df


Unnamed: 0,Order_ID,Customer,Category,Price,Quantity,Order_Date
0,1001,Alice,Electronics,800.0,2.0,2024-01-01
1,1002,Bob,Electronics,500.0,1.0,2024-01-02
2,1003,Charlie,Furniture,,4.0,2024-01-03
3,1004,,Furniture,250.0,1.0,invalid
4,1005,Eve,Electronics,800.0,,2024-01-05
5,1005,Eve,Electronics,800.0,,2024-01-05


## 2. Initial Inspection

In [3]:

# First look at the raw frame before any cleaning.
# A notebook cell only renders its final expression, so the preview is handed
# to display() (available by default in a Jupyter/IPython kernel); as a bare
# statement in the middle of the cell, the result of df.head() would be discarded.
display(df.head())

# info() prints the column dtypes and non-null counts to stdout.
df.info()

# Final expression of the cell: summary statistics for every column,
# numeric and non-numeric alike.
df.describe(include='all')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Order_ID    6 non-null      int64  
 1   Customer    5 non-null      object 
 2   Category    6 non-null      object 
 3   Price       5 non-null      float64
 4   Quantity    4 non-null      float64
 5   Order_Date  6 non-null      object 
dtypes: float64(2), int64(1), object(3)
memory usage: 420.0+ bytes


Unnamed: 0,Order_ID,Customer,Category,Price,Quantity,Order_Date
count,6.0,5,6,5.0,4.0,6
unique,,4,2,,,5
top,,Eve,Electronics,,,2024-01-05
freq,,2,4,,,2
mean,1003.333333,,,630.0,2.0,
std,1.632993,,,248.997992,1.414214,
min,1001.0,,,250.0,1.0,
25%,1002.25,,,500.0,1.0,
50%,1003.5,,,800.0,1.5,
75%,1004.75,,,800.0,2.5,


## 3. Missing Values Detection

In [13]:
# Percentage of missing entries per column:
# count of nulls, divided by the number of rows, scaled to 0-100.
df.isnull().sum().div(len(df)).mul(100)


Order_ID      0.0
Customer      0.0
Category      0.0
Price         0.0
Quantity      0.0
Order_Date    0.0
dtype: float64

## 4. Remove Missing Data

In [11]:

# Two ways of discarding incomplete data. Both results are bound to new names,
# so df itself is left as it was.
df_drop_rows = df.dropna(axis=0)          # remove every row that has a missing value
df_drop_cols = df.dropna(axis='columns')  # remove every column that has a missing value


## 5. Fill Missing Values

In [12]:

# Replacement value for each column that has gaps, computed up front from the
# values that are present:
#   Price    -> median
#   Quantity -> mean
#   Customer -> most frequent value (first entry returned by mode())
# NOTE(review): filling a missing customer name with the most frequent one
# attributes that order to an existing customer - acceptable for a demo,
# confirm before reusing this on real data.
fill_values = {
    'Price': df['Price'].median(),
    'Quantity': df['Quantity'].mean(),
    'Customer': df['Customer'].mode()[0],
}

# Apply each replacement to its own column.
for column, value in fill_values.items():
    df[column] = df[column].fillna(value)

df


Unnamed: 0,Order_ID,Customer,Category,Price,Quantity,Order_Date
0,1001,Alice,Electronics,800.0,2.0,2024-01-01
1,1002,Bob,Electronics,500.0,1.0,2024-01-02
2,1003,Charlie,Furniture,800.0,4.0,2024-01-03
3,1004,Eve,Furniture,250.0,1.0,invalid
4,1005,Eve,Electronics,800.0,2.0,2024-01-05
5,1005,Eve,Electronics,800.0,2.0,2024-01-05


## 6. Group-wise Fill

In [14]:

def _fill_with_group_mean(prices):
    """Return the group's prices with missing entries replaced by the group mean."""
    return prices.fillna(prices.mean())


# Fill gaps in Price separately within each Category.
# NOTE(review): the previous section already replaces every missing Price with
# the overall median, so when the cells run in order there is nothing left for
# this step to fill.
df['Price'] = df.groupby('Category')['Price'].transform(_fill_with_group_mean)


## 7. Data Type Fixing

In [None]:

# Convert the text dates to datetimes. With errors='coerce', a value that
# cannot be parsed (such as the "invalid" entry in the sample data) becomes
# NaT instead of raising.
df['Order_Date'] = df['Order_Date'].pipe(pd.to_datetime, errors='coerce')

# Show the resulting column dtypes.
df.dtypes


## 8. Duplicate Handling

In [None]:

# Count the rows that exactly repeat an earlier row. The count is stored
# because a bare expression in the middle of a cell is never rendered.
duplicate_count = int(df.duplicated().sum())

# Remove the repeated rows, keeping the first occurrence of each.
df = df.drop_duplicates()

# Final expression of the cell: how many duplicate rows were found.
duplicate_count


## 9. Text Cleaning

In [None]:

# Normalize customer names: trim surrounding whitespace, then title-case.
# NOTE(review): this runs after the duplicate-handling section, so rows that
# differ only by case or stray whitespace in Customer would not have been
# recognized as duplicates there.
customer_names = df['Customer']
trimmed_names = customer_names.str.strip()
df['Customer'] = trimmed_names.str.title()


## 10. Column Renaming

In [None]:

# Rename the two order columns to snake_case. rename() returns a new frame that
# is bound back to df, rather than mutating the existing object with inplace=True.
# Columns that are not listed in the mapping keep their current names.
df = df.rename(columns={'Order_ID': 'order_id', 'Order_Date': 'order_date'})

# Show the resulting column labels.
df.columns


## 11. Final Validation

In [None]:

# Remaining missing values per column. display() (available by default in a
# Jupyter/IPython kernel) is needed because only the last expression of a cell
# is rendered; as a bare statement this result would be discarded.
# NOTE(review): the date conversion uses errors='coerce', so the "invalid"
# sample date is expected to show up here as one remaining null.
display(df.isnull().sum())

# info() prints the final dtypes and non-null counts to stdout.
df.info()

# Final expression of the cell: the cleaned frame itself.
df


## 12. Export Clean Data

In [None]:

# Write the cleaned frame to a CSV file; index=False leaves the row index out
# of the file.
output_path = "clean_data.csv"
df.to_csv(output_path, index=False)
