In [3]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import joblib

from custom_transformers import CombinedAttributesAdder  # your class here

# Read the raw housing table from the CSV in the current working directory.
housing = pd.read_csv("housing.csv")

# Split off the regression target; what remains in `housing` are the features.
housing_labels = housing["median_house_value"].copy()
housing = housing.drop(columns="median_house_value")

# Partition the feature columns by dtype so each group can be routed to its
# own preprocessing branch: numeric dtypes vs. object (string-like) dtypes.
num_attribs = list(housing.select_dtypes(include=[np.number]).columns)
cat_attribs = list(housing.select_dtypes(include=[object]).columns)

# Numerical pipeline: fill missing values with the per-column median, run the
# project-defined CombinedAttributesAdder, then standardize every column.
# NOTE(review): CombinedAttributesAdder lives in custom_transformers and is not
# visible here; it presumably appends derived feature columns -- confirm.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

# Full preprocessing step: numeric columns go through `num_pipeline`,
# categorical columns are one-hot encoded.
# handle_unknown="ignore" makes the encoder emit an all-zero row for a category
# value it did not see during fit instead of raising. This pipeline is
# persisted for later use, so prediction on new data must not crash on a
# previously unseen category. Output on the training data is unchanged.
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_attribs),
])

# Final pipeline: preprocessing followed by an ordinary least-squares
# regressor, so a single object handles raw DataFrame -> prediction.
final_pipeline = Pipeline([
    ("preprocessor", full_pipeline),
    ("lin_reg", LinearRegression())
])

# Fit preprocessing and the regressor together on the full feature table.
final_pipeline.fit(X=housing, y=housing_labels)

# Persist the fitted pipeline so it can be reloaded later with joblib.load.
# The project module `custom_transformers` must be importable wherever the
# file is loaded, because the pickle references CombinedAttributesAdder.
joblib.dump(value=final_pipeline, filename="pipeline.pkl")
print("Pipeline saved successfully!")


Pipeline saved successfully!
