# Data Preprocessing in Machine Learning

---

## 1. Data Preprocessing Steps

**Pipeline:** Raw Data → Cleaning → Splitting → Normalization → Feature Engineering → Feature Selection → Imbalance Handling → Ready for Modeling

In [None]:
import pandas as pd

# Read the raw CSV into a DataFrame (the path below looks like a placeholder —
# point it at the real file)
csv_path = 'path/to/your/data.csv'
data = pd.read_csv(csv_path)

# Preview the top rows; as the cell's last expression it is shown as the output
data.head()

: 


---
## 2. Removing Noise, Fixing Errors, and Ensuring Consistency

### 2.1 Handling Missing Data (Imputation vs. Deletion)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load data
data = pd.read_csv('path/to/your/data.csv')

# Count the missing values in each column
missing_data = data.isnull().sum()
print(missing_data)

# Visualize missing data: the heatmap is drawn from the boolean "is null" mask,
# so missing cells stand out from present ones
sns.heatmap(data.isnull(), cbar=False, cmap='viridis')
plt.title('Missing Data Heatmap')
plt.show()

# Imputation: fill gaps in numeric columns with that column's mean.
# numeric_only=True restricts the mean to numeric columns; without it, recent
# pandas versions raise a TypeError as soon as the frame holds a text column
# (e.g. 'city'). Non-numeric columns keep their missing values and need a
# separate strategy (for example the most frequent value).
data_imputed = data.fillna(data.mean(numeric_only=True))

# Deletion: drop every row that has at least one missing value
data_dropped = data.dropna()

# Compare original, imputed, and dropped data
print("Original Data Shape:", data.shape)
print("Imputed Data Shape:", data_imputed.shape)
print("Dropped Data Shape:", data_dropped.shape)



### 2.2 Removing Duplicates


In [None]:
# Drop rows that exactly repeat an earlier row; the first occurrence is kept
data = data.drop_duplicates(keep='first')
print("Data Shape after Removing Duplicates:", data.shape)



### 2.3 Fixing Inconsistent Entries


In [None]:
# Map known abbreviations onto one canonical spelling per city
city_aliases = {'NYC': 'New York City', 'SF': 'San Francisco'}
data = data.assign(city=data['city'].replace(city_aliases))

# List the distinct city labels that remain after the mapping
print(data['city'].unique())


---

## 3. Data Splitting

### 3.1 Using train_test_split from sklearn


In [None]:
from sklearn.model_selection import train_test_split

# Separate the label column from the predictor columns
y = data['target']
X = data.drop(columns='target')

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Training Set Shape:", X_train.shape)
print("Testing Set Shape:", X_test.shape)



---

## 4. Normalization and Standardization

### 4.1 Normalization


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Normalize data: rescale each numeric column to the [0, 1] range.
# MinMaxScaler cannot process text columns (e.g. 'city'), so only the numeric
# columns are passed to it.
# NOTE(review): the scaler is fitted on the full dataset here for illustration;
# in a modeling pipeline, fit it on the training split only and reuse it on the
# test split so no test-set statistics leak into training.
numeric_columns = data.select_dtypes(include='number').columns
scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(data[numeric_columns])

# Position of the plotted feature inside the scaled array, so both panels show
# the same column (array column 0 is not necessarily 'numeric_column')
numeric_column_idx = numeric_columns.get_loc('numeric_column')

# Visualize before and after normalization
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
sns.histplot(data['numeric_column'], bins=30, kde=True)
plt.title('Before Normalization')
plt.subplot(1, 2, 2)
sns.histplot(normalized_data[:, numeric_column_idx], bins=30, kde=True)
plt.title('After Normalization')
plt.show()



### 4.2 Standardization


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize data: center each numeric column on zero mean with unit variance.
# StandardScaler cannot process text columns (e.g. 'city'), so only the numeric
# columns are passed to it.
# NOTE(review): the scaler is fitted on the full dataset here for illustration;
# in a modeling pipeline, fit it on the training split only and reuse it on the
# test split so no test-set statistics leak into training.
numeric_columns = data.select_dtypes(include='number').columns
scaler = StandardScaler()
standardized_data = scaler.fit_transform(data[numeric_columns])

# Position of the plotted feature inside the scaled array, so both panels show
# the same column (array column 0 is not necessarily 'numeric_column')
numeric_column_idx = numeric_columns.get_loc('numeric_column')

# Visualize before and after standardization
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
sns.histplot(data['numeric_column'], bins=30, kde=True)
plt.title('Before Standardization')
plt.subplot(1, 2, 2)
sns.histplot(standardized_data[:, numeric_column_idx], bins=30, kde=True)
plt.title('After Standardization')
plt.show()



---

## 5. Feature Engineering

### 5.1 Combining Features


In [None]:
# Example: derive BMI as weight divided by the square of height
# NOTE(review): the usual BMI formula expects kilograms and metres — confirm
# the units of the 'weight' and 'height' columns
height_squared = data['height'] ** 2
data = data.assign(BMI=data['weight'] / height_squared)
print(data[['weight', 'height', 'BMI']].head())



### 5.2 Encoding Categorical Variables


In [None]:
# One-hot encoding: replace the categorical column with one indicator column
# per category value
columns_to_encode = ['categorical_column']
data = pd.get_dummies(data, columns=columns_to_encode)
print(data.head())



### 5.3 Log Transformation for Skewed Data


In [None]:
# Log transformation: compress the long tail of a skewed feature.
# np.log1p(x) computes log(1 + x) and stays accurate for values close to zero,
# where forming `x + 1` first would lose precision. The column must hold values
# greater than -1 for the result to be defined.
data['log_column'] = np.log1p(data['skewed_column'])

# Visualize before and after log transformation
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
sns.histplot(data['skewed_column'], bins=30, kde=True)
plt.title('Before Log Transformation')
plt.subplot(1, 2, 2)
sns.histplot(data['log_column'], bins=30, kde=True)
plt.title('After Log Transformation')
plt.show()



---

## 6. Feature Selection

### 6.1 Statistical Methods: Correlation Matrix


In [None]:
# Correlation matrix: pairwise correlation between the numeric features.
# numeric_only=True skips text columns (e.g. 'city'); without it, recent pandas
# versions raise an error when a column cannot be converted to a number.
plt.figure(figsize=(12, 8))
correlation_matrix = data.corr(numeric_only=True)
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()



### 6.2 Algorithmic Methods: Recursive Feature Elimination (RFE)


In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# RFE: fit the estimator repeatedly, discarding features until 10 remain
model = LogisticRegression()
rfe = RFE(estimator=model, n_features_to_select=10)

# Fit the selector, then reduce X to the columns it kept
rfe.fit(X, y)
selected_features = rfe.transform(X)
print("Selected Features Shape:", selected_features.shape)



### 6.3 Examples using Python Tools (SelectKBest, RFE)


In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# SelectKBest: score every feature with the ANOVA F-test and keep the top 10
selector = SelectKBest(score_func=f_classif, k=10)

# Fit the selector, then reduce X to the columns it kept
selector.fit(X, y)
selected_features = selector.transform(X)
print("Selected Features Shape:", selected_features.shape)



---

## 7. Handling Imbalanced Data

### 7.1 Sampling Techniques: Oversampling (SMOTE) and Undersampling


In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# SMOTE: oversample by synthesizing new minority-class samples.
# A fixed seed makes the resampling repeatable across runs.
# NOTE(review): resampling is applied to the full X / y here for illustration;
# in a modeling pipeline, resample the training split only so the test set
# keeps the real class distribution.
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

# Undersampling: randomly drop majority-class samples
undersampler = RandomUnderSampler(random_state=42)
X_under, y_under = undersampler.fit_resample(X, y)

# Each technique keeps its own result above so neither overwrites the other.
# X_resampled / y_resampled stay bound to the undersampled data, which is what
# these names held at the end of this cell before.
X_resampled, y_resampled = X_under, y_under

# Visualize class distribution before resampling and after each technique
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
sns.countplot(x=y, ax=axes[0])
axes[0].set_title('Before Resampling')
sns.countplot(x=y_smote, ax=axes[1])
axes[1].set_title('After SMOTE Oversampling')
sns.countplot(x=y_under, ax=axes[2])
axes[2].set_title('After Random Undersampling')
plt.tight_layout()
plt.show()



---