# Data Preprocessing in Machine Learning
This notebook demonstrates key steps in data preprocessing using Python.

## 1. Load Sample Dataset

In [1]:

import pandas as pd
import numpy as np
from sklearn.datasets import load_iris

# Load iris dataset
iris = load_iris(as_frame=True)
df = iris.frame

# Introduce issues
df.loc[5:7, "sepal length (cm)"] = np.nan  # missing values
df = pd.concat([df, df.iloc[[0]]], ignore_index=True)  # duplicate
df["random_noise"] = np.random.randint(100, 200, df.shape[0])  # noisy feature
df["category"] = np.where(df["target"] == 0, "A", "B")  # categorical feature

df.head()


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,random_noise,category
0,5.1,3.5,1.4,0.2,0,147,A
1,4.9,3.0,1.4,0.2,0,172,A
2,4.7,3.2,1.3,0.2,0,153,A
3,4.6,3.1,1.5,0.2,0,180,A
4,5.0,3.6,1.4,0.2,0,114,A


## 2. Handling Missing Values

In [2]:

from sklearn.impute import SimpleImputer

# Impute numerical missing values with mean
imputer = SimpleImputer(strategy="mean")
df["sepal length (cm)"] = imputer.fit_transform(df[["sepal length (cm)"]])

df.head(10)


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,random_noise,category
0,5.1,3.5,1.4,0.2,0,147,A
1,4.9,3.0,1.4,0.2,0,172,A
2,4.7,3.2,1.3,0.2,0,153,A
3,4.6,3.1,1.5,0.2,0,180,A
4,5.0,3.6,1.4,0.2,0,114,A
5,5.855405,3.9,1.7,0.4,0,119,A
6,5.855405,3.4,1.4,0.3,0,141,A
7,5.855405,3.4,1.5,0.2,0,124,A
8,4.4,2.9,1.4,0.2,0,131,A
9,4.9,3.1,1.5,0.1,0,111,A


## 3. Removing Duplicates

In [3]:

print("Before removing duplicates:", df.shape)
df = df.drop_duplicates()
print("After removing duplicates:", df.shape)


Before removing duplicates: (151, 7)
After removing duplicates: (151, 7)


## 4. Removing Outliers (Z-Score Method)

In [4]:

from scipy import stats

# Remove outliers for sepal width
z_scores = np.abs(stats.zscore(df["sepal width (cm)"]))
df = df[(z_scores < 3)]

df.shape


(150, 7)

## 5. Scaling Features

In [5]:

from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler = StandardScaler()
df[["sepal length (cm)", "sepal width (cm)", "petal length (cm)", "petal width (cm)"]] = scaler.fit_transform(
    df[["sepal length (cm)", "sepal width (cm)", "petal length (cm)", "petal width (cm)"]]
)

df.head()


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,random_noise,category
0,-0.925424,1.063547,-1.339182,-1.31097,0,147,A
1,-1.170102,-0.121684,-1.339182,-1.31097,0,172,A
2,-1.414781,0.352409,-1.395991,-1.31097,0,153,A
3,-1.53712,0.115362,-1.282373,-1.31097,0,180,A
4,-1.047763,1.300593,-1.339182,-1.31097,0,114,A


## 6. Encoding Categorical Data

In [6]:

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Label Encoding
le = LabelEncoder()
df["category_label"] = le.fit_transform(df["category"])

# One Hot Encoding
df = pd.get_dummies(df, columns=["category"])

df.head()


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,random_noise,category_label,category_A,category_B
0,-0.925424,1.063547,-1.339182,-1.31097,0,147,0,True,False
1,-1.170102,-0.121684,-1.339182,-1.31097,0,172,0,True,False
2,-1.414781,0.352409,-1.395991,-1.31097,0,153,0,True,False
3,-1.53712,0.115362,-1.282373,-1.31097,0,180,0,True,False
4,-1.047763,1.300593,-1.339182,-1.31097,0,114,0,True,False


## 7. Feature Selection

In [7]:

from sklearn.feature_selection import SelectKBest, f_classif

X = df.drop(columns=["target"])
y = df["target"]

selector = SelectKBest(score_func=f_classif, k=3)
X_new = selector.fit_transform(X, y)

print("Selected features shape:", X_new.shape)


Selected features shape: (150, 3)


  f = msb / msw


## 8. Feature Construction

In [8]:

# Create interaction feature
df["sepal_area"] = df["sepal length (cm)"] * df["sepal width (cm)"]
df.head()


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,random_noise,category_label,category_A,category_B,sepal_area
0,-0.925424,1.063547,-1.339182,-1.31097,0,147,0,True,False,-0.984232
1,-1.170102,-0.121684,-1.339182,-1.31097,0,172,0,True,False,0.142382
2,-1.414781,0.352409,-1.395991,-1.31097,0,153,0,True,False,-0.498581
3,-1.53712,0.115362,-1.282373,-1.31097,0,180,0,True,False,-0.177326
4,-1.047763,1.300593,-1.339182,-1.31097,0,114,0,True,False,-1.362714


## 9. Data Augmentation (Text Example)

In [9]:

import random

sentences = [
    "Machine learning is powerful.",
    "Data preprocessing improves models.",
    "Outliers can reduce accuracy.",
]

# Simple augmentation: synonym replacement / shuffle words
augmented = []
for s in sentences:
    words = s.split()
    random.shuffle(words)
    augmented.append(" ".join(words))

print("Original:", sentences)
print("Augmented:", augmented)


Original: ['Machine learning is powerful.', 'Data preprocessing improves models.', 'Outliers can reduce accuracy.']
Augmented: ['powerful. Machine learning is', 'Data preprocessing models. improves', 'Outliers accuracy. can reduce']


## Conclusion
We demonstrated how to preprocess data step by step: handling missing values, removing duplicates/outliers, scaling, encoding, feature selection, feature construction, and data augmentation.