## Automating Data Cleaning in Python

    Task: Basic Pipeline with Scaling
1. Objective: Create a pipeline that scales numerical features in a dataset.
2. Steps:
    - Load a sample dataset with Pandas.
    - Define a pipeline using Pipeline from sklearn.pipeline .
    - Use StandardScaler to scale features.

In [1]:
# Write your code from here
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import unittest

# --- Helper Functions ---
def check_empty_dataframe(df):
    """Check if the DataFrame is empty."""
    if df.empty:
        raise ValueError("The input dataset is empty")

def create_pipeline(scaling=True, imputation=True):
    """Create a preprocessing pipeline that includes scaling and/or imputation."""
    steps = []
    if imputation:
        steps.append(('imputer', SimpleImputer(strategy='mean')))
    if scaling:
        steps.append(('scaler', StandardScaler()))
    return Pipeline(steps)

# --- Example Dataset ---
# Load Iris dataset
from sklearn.datasets import load_iris
data = load_iris()
df = pd.DataFrame(data.data, columns=data.feature_names)

# Introduce missing values for demonstration
df.iloc[0, 0] = np.nan  # Missing value in Feature1

# --- Data Preprocessing ---
# Check if dataset is empty before processing
check_empty_dataframe(df)

# Create the preprocessing pipeline (with both scaling and imputation)
pipeline = create_pipeline(scaling=True, imputation=True)

# Fit and transform the dataset using the pipeline
processed_data = pipeline.fit_transform(df)

# Convert the processed data back to DataFrame
processed_df = pd.DataFrame(processed_data, columns=df.columns)

# Show the processed dataset
print("Processed Data (Imputed and Scaled):")
print(processed_df.head())

# --- Unit Tests ---
class TestDataPipeline(unittest.TestCase):
    def setUp(self):
        """Prepare a sample dataset with missing values for testing."""
        self.data = load_iris()
        self.df = pd.DataFrame(self.data.data, columns=self.data.feature_names)
        self.df.iloc[0, 0] = np.nan  # Introduce a missing value

    def test_scaling_pipeline(self):
        """Test if the scaling pipeline works correctly."""
        pipeline = create_pipeline(scaling=True, imputation=False)
        scaled_data = pipeline.fit_transform(self.df)
        self.assertEqual(scaled_data.shape, self.df.shape)  # Check if the shape remains the same

    def test_imputation_pipeline(self):
        """Test if the imputation pipeline works correctly."""
        pipeline = create_pipeline(scaling=False, imputation=True)
        imputed_data = pipeline.fit_transform(self.df)
        self.assertFalse(np.any(np.isnan(imputed_data)))  # Check if NaNs are filled

    def test_empty_dataframe(self):
        """Test if an error is raised when an empty dataframe is provided."""
        empty_df = pd.DataFrame()
        with self.assertRaises(ValueError):
            check_empty_dataframe(empty_df)

# Run the unit tests
if __name__ == '__main__':
    unittest.main()

Processed Data (Imputed and Scaled):
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0           0.000000          1.019004          -1.340227         -1.315444
1          -1.152203         -0.131979          -1.340227         -1.315444
2          -1.395201          0.328414          -1.397064         -1.315444
3          -1.516700          0.098217          -1.283389         -1.315444
4          -1.030704          1.249201          -1.340227         -1.315444


usage: ipykernel_launcher.py [-h] [-v] [-q] [--locals] [-f] [-c] [-b]
                             [-k TESTNAMEPATTERNS]
                             [tests ...]
ipykernel_launcher.py: error: argument -f/--failfast: ignored explicit argument '/home/vscode/.local/share/jupyter/runtime/kernel-v3afbdb9fd34af9c3e2bb4069e45a0b38baf1edef7.json'


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


    Task: Pipeline with Imputation
1. Objective: Automate data cleaning by handling missing values.
2. Steps:
    - Load a dataset with missing values.
    - Define a pipeline to use SimpleImputer for filling missing values.

In [2]:
# Write your code from here
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Step 1: Load a dataset with missing values
# Creating a sample dataset with missing values for illustration
data = {'Feature1': [1, 2, 3, None, 5],
        'Feature2': [None, 2, None, 4, 5]}

df = pd.DataFrame(data)

# Step 2: Define a pipeline
# Here, we will apply SimpleImputer to fill missing values with the mean
pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
])

# Step 3: Fit and transform the data using the pipeline
imputed_data = pipeline.fit_transform(df)

# Step 4: Show the imputed data
imputed_df = pd.DataFrame(imputed_data, columns=df.columns)
print(imputed_df)

   Feature1  Feature2
0      1.00  3.666667
1      2.00  2.000000
2      3.00  3.666667
3      2.75  4.000000
4      5.00  5.000000
