    Task: Complete Pipeline for a Dataset
1. Objective: Build a complex pipeline with multiple transformations.
2. Steps:
    - Load a sample dataset.
    - Define a transformation pipeline with both imputation and scaling.

In [2]:
pip install pytest


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
# Task: Imputation Function

import pytest
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Test for Imputation
def test_imputation():
    # Sample data with missing values
    data = {'A': [1, 2, np.nan, 4], 'B': [5, np.nan, 7, 8]}
    df = pd.DataFrame(data)
    
    imputer = SimpleImputer(strategy='mean')
    imputed_data = imputer.fit_transform(df)
    
    # Assert that missing values are replaced
    assert not np.any(np.isnan(imputed_data)), "Missing values exist after imputation!"

# Test for Scaling
def test_scaling():
    # Sample data
    data = {'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8]}
    df = pd.DataFrame(data)
    
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(df)
    
    # Assert that the scaled data has mean=0 and std=1 for each column
    assert np.abs(np.mean(scaled_data[:, 0])) < 0.1, "Column A mean is not close to 0"  # Column A
    assert np.abs(np.mean(scaled_data[:, 1])) < 0.1, "Column B mean is not close to 0"  # Column B
    assert np.abs(np.std(scaled_data[:, 0]) - 1) < 0.1, "Column A std is not close to 1"
    assert np.abs(np.std(scaled_data[:, 1]) - 1) < 0.1, "Column B std is not close to 1"






# Scaling Function









# Combined Transformation Function









In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Imputation Function
def impute_data(df, strategy='mean'):
    for column in df.columns:
        if df[column].isna().sum() == len(df):
            df[column] = 0  # Or replace with df[column].median() or another placeholder
    imputer = SimpleImputer(strategy=strategy)
    return pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# Scaling Function
def scale_data(df):
    scaler = StandardScaler()
    return pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

# DataFrame Validation Function
def validate_dataframe(df):
    if df.empty:
        raise ValueError("DataFrame is empty")
    if not np.issubdtype(df.dtypes[0], np.number):  # Check if data is numeric
        raise TypeError("DataFrame contains non-numeric data. Only numeric data is allowed.")

# Combined Transformation Function
def transform_data(df, imputation_strategy='mean'):
    validate_dataframe(df)
    df = impute_data(df, strategy=imputation_strategy)
    df = scale_data(df)
    return df

# Example Usage
data = {'A': [1, 2, np.nan, 4], 'B': [5, np.nan, 7, 8]}
df = pd.DataFrame(data)

transformed_df = transform_data(df, imputation_strategy='median')
print(transformed_df)

          A         B
0 -1.147079 -1.605910
1 -0.229416  0.229416
2 -0.229416  0.229416
3  1.605910  1.147079


  if not np.issubdtype(df.dtypes[0], np.number):  # Check if data is numeric
