### Scaling & Feature Selection in a Pipeline
**Description**: Create a pipeline that includes feature scaling, variance threshold selection, and a classification model.

In [1]:
# write your code from here
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import unittest

# ----------------- Generate Synthetic Dataset -----------------
# Synthetic data: 300 samples x 6 features (4 informative, 1 redundant,
# none repeated); the fixed seed keeps every run reproducible.
X, y = make_classification(
    n_samples=300,
    n_features=6,
    n_informative=4,
    n_redundant=1,
    n_repeated=0,
    random_state=42,
)

# Wrap the feature matrix in a frame with named columns and attach the labels
# as the 'target' column.
df = pd.DataFrame(
    X, columns=[f'feature_{idx}' for idx in range(X.shape[1])]
).assign(target=y)

# ----------------- Error Handling -----------------
def validate_dataset(df, target_col):
    """Check that *df* can be used for classification on *target_col*.

    The checks run in order and the first failure raises; nothing is
    returned when every check passes.

    Args:
        df: Frame holding the features and the target column.
        target_col: Name of the column containing the class labels.

    Raises:
        ValueError: If the frame is empty, or if the target column holds
            fewer than two distinct values (as counted by ``nunique``).
        KeyError: If ``target_col`` is not among the frame's columns.
    """
    if df.empty:
        raise ValueError("DataFrame is empty.")

    target_present = target_col in df.columns
    if not target_present:
        raise KeyError(f"Target column '{target_col}' not found.")

    distinct_classes = df[target_col].nunique()
    if distinct_classes < 2:
        raise ValueError("Target column must contain at least 2 classes.")

# Fail fast: report the validation problem, then re-raise so the script does
# not go on to split and fit on a dataset already known to be invalid.
# Only the exception types validate_dataset raises are caught here.
try:
    validate_dataset(df, 'target')
except (ValueError, KeyError) as e:
    print(f"Validation Error: {e}")
    raise

# ----------------- Split Data -----------------
# Hold out 20% of the rows for evaluation; the seed fixes the split.
X = df.drop(columns='target')
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ----------------- Create Pipeline -----------------
# Steps run in order: scale -> drop low-variance columns -> classify.
# NOTE(review): StandardScaler rescales every non-constant column to unit
# variance on the training data, so a 0.1 threshold applied afterwards can
# only remove constant columns. If variance-based filtering on the raw
# features is intended, the selector should come before the scaler - confirm.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('variance_thresh', VarianceThreshold(threshold=0.1)),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# ----------------- Fit & Predict -----------------
# All steps are fitted on the training split only; the test split is just
# transformed and predicted.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# ----------------- Evaluate -----------------
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

# ----------------- Unit Tests -----------------
class TestPipeline(unittest.TestCase):
    """Smoke tests for the module-level ``pipeline`` on the train/test split."""

    @classmethod
    def setUpClass(cls):
        # Fit and predict once here so no test depends on execution order,
        # on module-level fitting, or on the module-level `y_pred`.
        pipeline.fit(X_train, y_train)
        cls.predictions = pipeline.predict(X_test)

    def test_pipeline_fits(self):
        # `predict` exists on an unfitted estimator too, so checking for it
        # proves nothing; `classes_` is only set by fitting.
        classifier = pipeline.named_steps['classifier']
        self.assertTrue(hasattr(classifier, "classes_"))

    def test_pipeline_predicts(self):
        # One prediction per test row.
        self.assertEqual(len(self.predictions), len(y_test))

    def test_accuracy_non_zero(self):
        # Score the predictions produced by this test class, not a global.
        acc = accuracy_score(y_test, self.predictions)
        self.assertGreater(acc, 0.0, "Accuracy should be greater than 0.")

# Collect TestPipeline's tests into a suite, then run it with a text runner.
# The runner call stays the final expression so its result object is the
# value of the statement.
_pipeline_suite = unittest.TestLoader().loadTestsFromTestCase(TestPipeline)
unittest.TextTestRunner().run(_pipeline_suite)


.

Model Accuracy: 0.9167


..
----------------------------------------------------------------------
Ran 3 tests in 0.307s

OK


<unittest.runner.TextTestResult run=3 errors=0 failures=0>