In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import joblib
import os
import unittest

# Generate synthetic dataset
def generate_data(n_samples=100, seed=42):
    """Build a small synthetic classification frame containing missing values.

    Columns:
        feature1: standard-normal draws; every 10th row (0, 10, 20, ...) is
            overwritten with NaN.
        feature2: drawn from {1, 2, NaN}.
        feature3: drawn from {'A', 'B', 'C'}.
        target: drawn from {0, 1}.

    Args:
        n_samples: Number of rows to generate.
        seed: Seed for the private random generator; the same seed always
            yields the same frame.

    Returns:
        A pandas DataFrame with ``n_samples`` rows and the four columns above.
    """
    # A private RandomState produces the same stream as np.random.seed(seed)
    # followed by the module-level np.random functions, but leaves the
    # process-wide generator untouched for any other code in the session.
    rng = np.random.RandomState(seed)
    df = pd.DataFrame({
        'feature1': rng.randn(n_samples),
        'feature2': rng.choice([1, 2, np.nan], n_samples),
        'feature3': rng.choice(['A', 'B', 'C'], n_samples),
        'target': rng.choice([0, 1], n_samples),
    })
    df.loc[::10, 'feature1'] = np.nan  # introduce NaNs
    return df

# Create pipeline
def create_pipeline():
    """Assemble an unfitted impute -> scale -> classify pipeline.

    Returns:
        A ``Pipeline`` with three named steps: ``'imputer'`` (mean
        imputation), ``'scaler'`` (``StandardScaler``) and ``'classifier'``
        (``LogisticRegression`` with default settings).
    """
    steps = [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression()),
    ]
    return Pipeline(steps=steps)

# Train and save pipeline
def train_and_save_pipeline(X_train, y_train, model_path='pipeline.pkl'):
    """Fit a fresh pipeline on the training data and persist it with joblib.

    Args:
        X_train: Training features (DataFrame, ndarray or nested list).
        y_train: Training labels (Series, ndarray or list).
        model_path: Destination file for the serialized pipeline.

    Returns:
        The fitted pipeline, i.e. the same object that was written to
        ``model_path``.

    Raises:
        ValueError: If ``X_train`` or ``y_train`` contains no elements.
    """
    # np.size reads ``.size`` when the object has one (DataFrame, Series,
    # ndarray) and converts otherwise, so pandas inputs are judged exactly as
    # the ``.empty`` attribute would, while arrays and lists no longer fail
    # with AttributeError before reaching this guard.
    if np.size(X_train) == 0 or np.size(y_train) == 0:
        raise ValueError("Training data or labels are empty.")
    pipeline = create_pipeline()
    pipeline.fit(X_train, y_train)
    joblib.dump(pipeline, model_path)
    return pipeline

# Load pipeline and predict
def load_and_predict(X, model_path='pipeline.pkl'):
    """Load a previously saved pipeline and return its predictions for ``X``.

    Args:
        X: Feature rows to score.
        model_path: Location of the file written by joblib.

    Returns:
        Whatever the loaded pipeline's ``predict`` returns for ``X``.

    Raises:
        FileNotFoundError: If nothing exists at ``model_path``.
    """
    model_is_present = os.path.exists(model_path)
    if model_is_present:
        # NOTE(review): joblib.load deserializes arbitrary objects; only
        # point this at files produced by trusted code.
        fitted = joblib.load(model_path)
        return fitted.predict(X)
    raise FileNotFoundError("Saved model not found.")

# Unit tests
class TestPipeline(unittest.TestCase):
    """Round-trip tests for training, saving, loading and predicting."""

    def setUp(self):
        """Train on a fresh synthetic split and save the pipeline to disk."""
        self.df = generate_data()
        self.X = self.df[['feature1', 'feature2']]
        self.y = self.df['target']
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.2, random_state=42
        )
        self.model_path = 'test_pipeline.pkl'
        # Registered before the file is written so the artifact is removed
        # after every test, even when training itself raises.
        self.addCleanup(self._remove_model_file)
        train_and_save_pipeline(self.X_train, self.y_train, self.model_path)

    def _remove_model_file(self):
        """Delete the model file written by setUp, if it exists."""
        if os.path.exists(self.model_path):
            os.remove(self.model_path)

    def test_pipeline_training(self):
        """The saved artifact loads back as an object exposing ``predict``."""
        pipeline = joblib.load(self.model_path)
        self.assertTrue(hasattr(pipeline, 'predict'))

    def test_pipeline_prediction(self):
        """One prediction per test row, each drawn from the known labels."""
        preds = load_and_predict(self.X_test, self.model_path)
        self.assertEqual(len(preds), len(self.X_test))
        self.assertTrue(set(preds) <= {0, 1})

    def test_invalid_input(self):
        """Empty features and labels are rejected with ValueError."""
        with self.assertRaises(ValueError):
            train_and_save_pipeline(pd.DataFrame(), pd.Series(dtype='int'), self.model_path)

# Main execution
if __name__ == '__main__':
    # Build the synthetic frame, then separate the two numeric predictors
    # from the label column.
    df = generate_data()
    X, y = df[['feature1', 'feature2']], df['target']

    # Hold out 20% of the rows; the fixed random_state keeps the split
    # reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
    )

    # Fit on the training rows; this also writes the fitted pipeline to the
    # default model path.
    pipeline = train_and_save_pipeline(X_train=X_train, y_train=y_train)

    # Read the pipeline back from the default model path and score the
    # held-out rows with it.
    predictions = load_and_predict(X=X_test)

    # Report the share of held-out rows that were classified correctly.
    print("Accuracy:", accuracy_score(y_true=y_test, y_pred=predictions))

    # argv=[''] stops unittest from parsing the host process's own command
    # line; exit=False stops it from calling sys.exit() once the tests finish.
    unittest.main(argv=[''], exit=False)


...
----------------------------------------------------------------------
Ran 3 tests in 0.053s

OK


Accuracy: 0.55
