In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import joblib

# Build the small, fixed demo dataset
def create_dataset(seed=42):
    """Return the six-row demo frame with missing values.

    The rows are hard-coded, so ``seed`` does not change the returned
    data; it is only used to seed NumPy's global random generator as a
    side effect.

    Args:
        seed: Value passed to ``np.random.seed``.

    Returns:
        A DataFrame with columns ``age``, ``income``, ``gender`` and
        ``target``; the first three contain NaN entries.
    """
    np.random.seed(seed)
    missing = np.nan
    columns = {
        'age': [25, missing, 28, 35, missing, 40],
        'income': [50000, 60000, missing, 80000, 75000, missing],
        'gender': ['male', 'female', 'female', 'male', missing, 'female'],
        'target': [0, 1, 0, 1, 0, 1],
    }
    return pd.DataFrame(columns)

# Shared preprocessing pipeline
def get_preprocessing_pipeline():
    """Build the unfitted column-wise preprocessor used by the model.

    Numeric columns (``age``, ``income``) are mean-imputed and then
    standardized; the categorical column (``gender``) is imputed with its
    most frequent value and one-hot encoded, ignoring categories that
    were not seen during fitting.

    Returns:
        A new, unfitted ``ColumnTransformer``.
    """
    numeric_branch = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ])
    categorical_branch = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

    return ColumnTransformer(transformers=[
        ('num', numeric_branch, ['age', 'income']),
        ('cat', categorical_branch, ['gender']),
    ])

# Train pipeline with model
def train_pipeline(data):
    """Fit the preprocessing + random-forest pipeline and report its accuracy.

    The frame is split 70/30 with a fixed seed; the pipeline is fitted on
    the larger part and scored on the held-out part.

    Args:
        data: DataFrame containing a ``target`` column. The remaining
            columns are passed to the pipeline as features.

    Returns:
        The fitted ``Pipeline`` (preprocessor followed by the classifier).
    """
    X = data.drop(columns='target')
    y = data['target']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    clf_pipeline = Pipeline(steps=[
        ('preprocessor', get_preprocessing_pipeline()),
        ('classifier', RandomForestClassifier(random_state=42)),
    ])
    clf_pipeline.fit(X_train, y_train)

    # The score is computed on the held-out split, not on the rows used for
    # fitting, so it is reported as test accuracy.
    acc = accuracy_score(y_test, clf_pipeline.predict(X_test))
    print(f"Test Accuracy: {acc:.2f}")
    return clf_pipeline

# Save and load functions
def save_model(model, path='model_pipeline.pkl'):
    """Serialize ``model`` to ``path`` with joblib.

    Args:
        model: Object to persist (here, the fitted pipeline).
        path: Destination file name.
    """
    joblib.dump(value=model, filename=path)

def load_model(path='model_pipeline.pkl'):
    """Load and return the object stored at ``path`` with joblib.

    NOTE(review): joblib persistence is pickle-based, so this should only
    be pointed at files from a trusted source.

    Args:
        path: File name to read from.

    Returns:
        The deserialized object.
    """
    restored = joblib.load(path)
    return restored

# Demonstration
# Build the demo frame, fit the pipeline (train_pipeline prints an accuracy
# score for its internal hold-out split), and persist the fitted pipeline to
# the default path 'model_pipeline.pkl'.
train_data = create_dataset()
model_pipeline = train_pipeline(train_data)
save_model(model_pipeline)

# Simulate inference
# NOTE: create_dataset() returns hard-coded rows, so seed=99 yields the same
# feature values as the training frame; only the 'target' column is removed.
inference_data = create_dataset(seed=99).drop(columns='target')
# Reload from disk instead of reusing model_pipeline so the save/load round
# trip is exercised.
loaded_pipeline = load_model()
predictions = loaded_pipeline.predict(inference_data)
print("Predictions on inference data:", predictions)


Training Accuracy: 0.50
Predictions on inference data: [0 0 0 1 0 1]
