In [None]:
import catboost
import numpy as np
import pandas as pd
from sklearn import (compose, impute, metrics, model_selection, pipeline,
                     preprocessing)
from sklearn.base import BaseEstimator, TransformerMixin

import creds
import data

# Retrieve and prepare data

In [None]:
# Load the raw records for one retrieval day. The dict is handed unchanged to
# data.retrieve_data_from_MongoDB (presumably used as a query filter — confirm
# against the data module).
df = data.retrieve_data_from_MongoDB(
    {"day_of_retrieval": "2024-02-03"},
)

In [None]:
# Run the project's preprocessing/split helper on the raw frame and unpack its
# two results: X and y are later passed to cross_val_score as features/target.
(X, y) = data.preprocess_and_split_data(df)

# Assess preprocessors

In [None]:
# Convert zip_code with pd.to_numeric so the column is selected by the
# "number" dtype filter used to build the feature lists below.
X = X.assign(zip_code=pd.to_numeric(X["zip_code"]))

In [None]:
# Baseline run: minimal preprocessing + a small CatBoost model, 10-fold CV.

# Column names grouped by dtype (numeric vs. object)
NUMERICAL_FEATURES = X.select_dtypes(include="number").columns.to_list()
CATEGORICAL_FEATURES = X.select_dtypes(include="object").columns.to_list()

# Numeric columns: fill missing values with the column median
numeric_transformer = pipeline.make_pipeline(
    impute.SimpleImputer(strategy="median"),
)

# Object columns: one-hot encode (categories unseen during fit are ignored),
# then run a median imputer over the encoded output
categorical_transformer = pipeline.make_pipeline(
    preprocessing.OneHotEncoder(handle_unknown="ignore"),
    impute.SimpleImputer(strategy="median"),
)

# Route each column group through its own transformer
preprocessor = compose.make_column_transformer(
    (numeric_transformer, NUMERICAL_FEATURES),
    (categorical_transformer, CATEGORICAL_FEATURES),
)

# Full estimator: preprocessing followed by a 10-iteration CatBoost regressor
model_pipeline = pipeline.make_pipeline(
    preprocessor,
    catboost.CatBoostRegressor(iterations=10, silent=True),
)

# 10-fold cross-validation; each score is a negated RMSE (closer to 0 is better)
scores = model_selection.cross_val_score(
    model_pipeline,
    X,
    y,
    scoring="neg_root_mean_squared_error",
    cv=10,
)
scores

In [None]:
# Custom transformer to enable target encoding within a pipeline


class TargetEncoderWrapper(BaseEstimator, TransformerMixin):
    """Pipeline-friendly wrapper around ``preprocessing.TargetEncoder``.

    Parameters
    ----------
    **te_args
        Keyword arguments forwarded unchanged to
        ``preprocessing.TargetEncoder`` every time the wrapper is fitted.

    Attributes
    ----------
    te_args : dict
        The keyword arguments given at construction (or via ``set_params``).
    encoder : TargetEncoder or None
        The fitted underlying encoder; ``None`` until ``fit`` or
        ``fit_transform`` has been called.
    """

    def __init__(self, **te_args):
        self.te_args = te_args
        self.encoder = None

    def get_params(self, deep=True):
        """Return the forwarded encoder arguments as this estimator's parameters.

        ``BaseEstimator`` discovers parameters from the ``__init__`` signature
        and skips ``**kwargs``. Without this override ``clone()`` (used by
        ``cross_val_score``) would rebuild the wrapper with no arguments and
        silently drop settings such as ``target_type``.
        """
        return dict(self.te_args)

    def set_params(self, **params):
        """Update the arguments forwarded to the underlying encoder."""
        self.te_args = {**self.te_args, **params}
        return self

    def fit(self, X, y):
        """Fit a fresh ``TargetEncoder`` on ``X`` and ``y`` and return ``self``."""
        self.encoder = preprocessing.TargetEncoder(**self.te_args)
        self.encoder.fit(X, y)
        return self

    def fit_transform(self, X, y):
        """Fit the encoder and return the cross-fitted encoding of ``X``.

        ``TargetEncoder.fit_transform`` uses an internal cross-fitting scheme,
        so it is not equivalent to ``fit(X, y).transform(X)``. Pipelines call
        ``fit_transform`` on intermediate steps; delegating here keeps that
        protection against target leakage instead of bypassing it through the
        default ``TransformerMixin.fit_transform``.
        """
        self.encoder = preprocessing.TargetEncoder(**self.te_args)
        return self.encoder.fit_transform(X, y)

    def transform(self, X):
        """Encode ``X`` with the previously fitted encoder."""
        return self.encoder.transform(X)


# Candidate preprocessing components to compare
encoders = [
    preprocessing.OneHotEncoder(handle_unknown="ignore", sparse_output=False),
    preprocessing.OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=999),
    TargetEncoderWrapper(target_type="continuous"),
]
imputers = [impute.SimpleImputer(strategy="median"), impute.KNNImputer()]
scalers = [preprocessing.StandardScaler(), preprocessing.MinMaxScaler()]

# Column names grouped by dtype (numeric vs. object)
NUMERICAL_FEATURES = X.select_dtypes(include="number").columns.to_list()
CATEGORICAL_FEATURES = X.select_dtypes(include="object").columns.to_list()

results = []

# Evaluate every (imputer, encoder, scaler) combination; the imputer varies
# slowest and the scaler fastest, matching a three-level nested loop.
for imputer, encoder, scaler in (
    (i, e, s) for i in imputers for e in encoders for s in scalers
):
    # Numeric columns: impute then scale. Object columns: encode then impute.
    numeric_transformer = pipeline.make_pipeline(imputer, scaler)
    categorical_transformer = pipeline.make_pipeline(encoder, imputer)

    # Route each column group through its own transformer
    preprocessor = compose.make_column_transformer(
        (numeric_transformer, NUMERICAL_FEATURES),
        (categorical_transformer, CATEGORICAL_FEATURES),
    )

    # Preprocessing followed by a 100-iteration CatBoost regressor
    model_pipeline = pipeline.make_pipeline(
        preprocessor,
        catboost.CatBoostRegressor(iterations=100, silent=True),
    )

    # 10-fold cross-validation; each score is a negated RMSE
    scores = model_selection.cross_val_score(
        model_pipeline,
        X,
        y,
        scoring="neg_root_mean_squared_error",
        cv=10,
    )

    # One record per combination: component reprs plus per-fold and summary scores
    result = dict(
        imputer=str(imputer),
        encoder=str(encoder),
        scaler=str(scaler),
        scores=scores.tolist(),
        mean_score=scores.mean(),
        std_score=scores.std(),
    )
    results.append(result)

# Highest mean score first (scores are negated errors, so higher is better)
pd.DataFrame(results).sort_values("mean_score", ascending=False)

Best performance (mean RMSE ± standard deviation over the 10 cross-validation folds):
* SimpleImputer, OneHotEncoder, MinMaxScaler: 0.099271 ± 0.004467
* SimpleImputer, OneHotEncoder, StandardScaler: 0.099285 ± 0.004499
* SimpleImputer, TargetEncoderWrapper, MinMaxScaler: 0.099382 ± 0.003905

In [None]:
# Rebuild the preprocessing for the combination kept after the comparison:
# median imputation, one-hot encoding and min-max scaling.

# Numeric columns: median imputation followed by min-max scaling
numeric_transformer = pipeline.make_pipeline(
    impute.SimpleImputer(strategy="median"),
    preprocessing.MinMaxScaler(),
)

# Object columns: dense one-hot encoding (unknown categories ignored),
# followed by a median imputer over the encoded output
categorical_transformer = pipeline.make_pipeline(
    preprocessing.OneHotEncoder(handle_unknown="ignore", sparse_output=False),
    impute.SimpleImputer(strategy="median"),
)

# Route each column group through its own transformer
preprocessor = compose.make_column_transformer(
    (numeric_transformer, NUMERICAL_FEATURES),
    (categorical_transformer, CATEGORICAL_FEATURES),
)
# Ask for pandas DataFrames as transform output (set_output returns the same object)
preprocessor.set_output(transform="pandas")

# Fit on the full feature frame and display the transformed result
preprocessor.fit_transform(X)