In [None]:
# Data Cleaning Techniques

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [2]:
# Sample Dataset

In [3]:
# Toy records that deliberately contain one missing Age, one missing Salary,
# two rows that repeat earlier rows, a lower-case "female" label, and one row
# whose Age and Salary are far larger than the rest.
data = dict(
    Age=[25, 30, None, 45, 30, 25, 100],
    Salary=[50000, 60000, 55000, None, 60000, 50000, 1000000],
    Gender=["Male", "Female", "female", "Male", "Female", "Male", "Male"],
    City=["Mumbai", "Delhi", "Mumbai", "Delhi", "Delhi", "Mumbai", "Mumbai"],
)

# Each key becomes a column; the None entries show up as NaN in the frame.
df = pd.DataFrame(data=data)
df


Unnamed: 0,Age,Salary,Gender,City
0,25.0,50000.0,Male,Mumbai
1,30.0,60000.0,Female,Delhi
2,,55000.0,female,Mumbai
3,45.0,,Male,Delhi
4,30.0,60000.0,Female,Delhi
5,25.0,50000.0,Male,Mumbai
6,100.0,1000000.0,Male,Mumbai


In [4]:
# Missing Data Handling

In [5]:
# Count the missing entries in each column (summing the null mask down the rows).
df.isnull().sum(axis=0)

Age       1
Salary    1
Gender    0
City      0
dtype: int64

In [6]:
# Fill values are computed from all rows currently in the frame:
# the column mean for Age and the column median for Salary.
age_fill = df["Age"].mean()
salary_fill = df["Salary"].median()

df["Age"] = df["Age"].fillna(age_fill)
df["Salary"] = df["Salary"].fillna(salary_fill)


In [7]:
# Flag every row that exactly repeats an earlier row; the first occurrence stays False.
df.duplicated(keep="first")

0    False
1    False
2    False
3    False
4     True
5     True
6    False
dtype: bool

In [8]:
# Keep only the first occurrence of each fully identical row.
# `df` is rebound to the returned frame instead of using inplace=True, the same
# way the outlier-filtering step reassigns `df`. The surviving rows keep their
# original index labels (the index is not reset here).
df = df.drop_duplicates()
df

Unnamed: 0,Age,Salary,Gender,City
0,25.0,50000.0,Male,Mumbai
1,30.0,60000.0,Female,Delhi
2,42.5,55000.0,female,Mumbai
3,45.0,57500.0,Male,Delhi
6,100.0,1000000.0,Male,Mumbai


In [9]:
# Outlier Detection

In [10]:
# 1.5 x IQR rule on Salary: keep rows whose Salary lies within
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR], bounds included.
salary = df["Salary"]
Q1 = salary.quantile(0.25)
Q3 = salary.quantile(0.75)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# .copy() gives an independent frame so later column assignments do not
# operate on a view of the unfiltered data.
df = df.loc[salary.between(lower_fence, upper_fence)].copy()
df


Unnamed: 0,Age,Salary,Gender,City
0,25.0,50000.0,Male,Mumbai
1,30.0,60000.0,Female,Delhi
2,42.5,55000.0,female,Mumbai
3,45.0,57500.0,Male,Delhi


In [11]:
# Inconsistent Formatting

In [12]:
# Normalise the Gender labels: trim surrounding whitespace, then upper-case the
# first letter and lower-case the rest, so variants such as "female" or
# " Female " all collapse to "Female". Stripping is a no-op on values that
# carry no extra whitespace.
df["Gender"] = df["Gender"].str.strip().str.capitalize()
df


Unnamed: 0,Age,Salary,Gender,City
0,25.0,50000.0,Male,Mumbai
1,30.0,60000.0,Female,Delhi
2,42.5,55000.0,Female,Mumbai
3,45.0,57500.0,Male,Delhi


In [13]:
# Encoding Categorical Variables

In [14]:
# Map each distinct Gender label to an integer code and store the codes in a
# new column; the fitted encoder stays available as `le`.
le = LabelEncoder()
gender_codes = le.fit_transform(df["Gender"])
df["Gender_encoded"] = gender_codes
df


Unnamed: 0,Age,Salary,Gender,City,Gender_encoded
0,25.0,50000.0,Male,Mumbai,1
1,30.0,60000.0,Female,Delhi,0
2,42.5,55000.0,Female,Mumbai,0
3,45.0,57500.0,Male,Delhi,1


In [15]:
# Scaling Numerical Features

In [16]:
# Standardise Age and Salary with StandardScaler and write the results to new
# *_scaled columns, leaving the original columns untouched.
numeric_columns = ["Age", "Salary"]
scaled_columns = ["Age_scaled", "Salary_scaled"]

scaler = StandardScaler()
df[scaled_columns] = scaler.fit_transform(df[numeric_columns])
df


Unnamed: 0,Age,Salary,Gender,City,Gender_encoded,Age_scaled,Salary_scaled
0,25.0,50000.0,Male,Mumbai,1,-1.27064,-1.521278
1,30.0,60000.0,Female,Delhi,0,-0.672692,1.183216
2,42.5,55000.0,Female,Mumbai,0,0.822179,-0.169031
3,45.0,57500.0,Male,Delhi,1,1.121153,0.507093


In [17]:
# Feature Engineering Basics

In [18]:
# Ratio feature: Salary divided by Age, computed row by row.
salary_col, age_col = df["Salary"], df["Age"]
df["Salary_per_Age"] = salary_col / age_col
df


Unnamed: 0,Age,Salary,Gender,City,Gender_encoded,Age_scaled,Salary_scaled,Salary_per_Age
0,25.0,50000.0,Male,Mumbai,1,-1.27064,-1.521278,2000.0
1,30.0,60000.0,Female,Delhi,0,-0.672692,1.183216,2000.0
2,42.5,55000.0,Female,Mumbai,0,0.822179,-0.169031,1294.117647
3,45.0,57500.0,Male,Delhi,1,1.121153,0.507093,1277.777778
