### Handling Missing Values - Imputation within ML Pipelines
**Description**: Implement a machine learning pipeline that includes imputation and a classifier.

In [1]:
# write your code from here
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import unittest

# ---------------------------- Sample Data ----------------------------
data = {
    'Age': [25, np.nan, 35, 40, 29, np.nan, 45],
    'Income': [50000, 60000, 52000, 58000, np.nan, 61000, 54000],
    'Gender': ['Male', 'Female', 'Female', np.nan, 'Male', 'Female', 'Male'],
    'Purchased': [1, 0, 1, 0, 1, 0, 1]
}

df = pd.DataFrame(data)

# ---------------------------- Feature & Target ----------------------------
X = df.drop("Purchased", axis=1)
y = df["Purchased"]

# ---------------------------- Splitting Data ----------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# ---------------------------- Column Types ----------------------------
numeric_features = ['Age', 'Income']
categorical_features = ['Gender']

# ---------------------------- Pipelines ----------------------------

# Numeric pipeline: Imputation + Scaling
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical pipeline: Imputation + Encoding
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Combine into ColumnTransformer
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, numeric_features),
    ('cat', categorical_pipeline, categorical_features)
])

# Full ML pipeline with classifier
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# ---------------------------- Fit and Predict ----------------------------
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred))

# ---------------------------- Unit Tests ----------------------------
class TestPipeline(unittest.TestCase):
    def test_pipeline_prediction_shape(self):
        pred = pipeline.predict(X_test)
        self.assertEqual(pred.shape[0], X_test.shape[0])

    def test_pipeline_handles_missing(self):
        test_data = pd.DataFrame({
            'Age': [np.nan, 35],
            'Income': [60000, np.nan],
            'Gender': ['Female', np.nan]
        })
        pred = pipeline.predict(test_data)
        self.assertEqual(len(pred), 2)

    def test_pipeline_output_range(self):
        pred = pipeline.predict(X_test)
        self.assertTrue(set(pred).issubset({0, 1}))

unittest.TextTestRunner().run(unittest.TestLoader().loadTestsFromTestCase(TestPipeline))


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
...
----------------------------------------------------------------------
Ran 3 tests in 0.046s

OK


Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.33      1.00      0.50         1

    accuracy                           0.33         3
   macro avg       0.17      0.50      0.25         3
weighted avg       0.11      0.33      0.17         3



<unittest.runner.TextTestResult run=3 errors=0 failures=0>