In [None]:
import itertools

import catboost
import numpy as np
import pandas as pd
from sklearn import (compose, feature_selection, impute, metrics,
                     model_selection, pipeline, preprocessing)
from sklearn.base import BaseEstimator, TransformerMixin
from tqdm import tqdm

import creds
import data

# Retrieve and prepare data

In [None]:
# Fetch the raw dataset through the project's `data` helper. The dict is passed through
# unchanged (presumably a MongoDB filter on the retrieval day — confirm in data.py).
df = data.retrieve_data_from_MongoDB({"day_of_retrieval": "2024-02-03"})

In [None]:
# Turn the raw frame into the feature matrix X and the target y
# (the preprocessing itself lives in data.preprocess_and_split_data).
X, y = data.preprocess_and_split_data(df)

# Assess preprocessors

In [None]:
# Cast zip_code to a numeric dtype so it is picked up as a numerical feature below
X = X.assign(zip_code=pd.to_numeric(X["zip_code"]))

# Column names per dtype family, reused by the preprocessing pipelines
NUMERICAL_FEATURES = list(X.select_dtypes(include="number").columns)
CATEGORICAL_FEATURES = list(X.select_dtypes(include="object").columns)

In [None]:
# # Define transformers for preprocessing
# numeric_transformer = pipeline.make_pipeline(impute.SimpleImputer(strategy="median"))

# categorical_transformer = pipeline.make_pipeline(
#     preprocessing.OneHotEncoder(handle_unknown="ignore"),
#     impute.SimpleImputer(strategy="median"),
# )

# # Create a ColumnTransformer to handle both numerical and categorical features
# preprocessor = compose.make_column_transformer(
#     (numeric_transformer, NUMERICAL_FEATURES),
#     (categorical_transformer, CATEGORICAL_FEATURES),
# )

# # Create a pipeline that includes the preprocessor and the model
# model_pipeline = pipeline.make_pipeline(
#     preprocessor, catboost.CatBoostRegressor(iterations=10, silent=True)
# )

# scores = model_selection.cross_val_score(
#     estimator=model_pipeline, X=X, y=y, scoring="neg_root_mean_squared_error", cv=10
# )
# scores

In [None]:
# Custom transformer to enable target encoding within a pipeline


class TargetEncoderWrapper(BaseEstimator, TransformerMixin):
    """Pipeline-friendly wrapper around ``preprocessing.TargetEncoder``.

    All keyword arguments are stored in ``te_args`` and forwarded unchanged to
    ``preprocessing.TargetEncoder`` each time the wrapper is fitted.

    Attributes:
        te_args (dict): Keyword arguments forwarded to the wrapped encoder.
        encoder: The fitted ``TargetEncoder``; ``None`` until the wrapper is fitted.
    """

    def __init__(self, **te_args):
        self.te_args = te_args
        self.encoder = None

    def get_params(self, deep=True):
        """Return the forwarded keyword arguments as the estimator's parameters.

        ``BaseEstimator`` only discovers parameters that are named explicitly in
        ``__init__`` and skips ``**kwargs``. Without this override,
        ``sklearn.base.clone`` (used by ``cross_val_score``) would rebuild the
        wrapper with no arguments and silently drop ``te_args``.

        Args:
            deep (bool): Accepted for scikit-learn API compatibility; unused.

        Returns:
            dict: A shallow copy of ``te_args``.
        """
        return dict(self.te_args)

    def set_params(self, **params):
        """Update the keyword arguments forwarded to the wrapped encoder.

        Args:
            **params: Entries to add to or overwrite in ``te_args``.

        Returns:
            TargetEncoderWrapper: ``self``.
        """
        self.te_args.update(params)
        return self

    def fit(self, X, y):
        """Fit a fresh ``TargetEncoder`` on ``X`` and ``y``.

        Returns:
            TargetEncoderWrapper: ``self``.
        """
        self.encoder = preprocessing.TargetEncoder(**self.te_args)
        self.encoder.fit(X, y)
        return self

    def fit_transform(self, X, y):
        """Fit a fresh ``TargetEncoder`` and return the encoding of the training data.

        Delegates to ``TargetEncoder.fit_transform`` so that the encoder's internal
        cross-fitting is used for the training rows. The ``fit(...).transform(...)``
        fallback inherited from ``TransformerMixin`` would encode each training row
        with statistics that include its own target value.

        Returns:
            The encoded training data produced by the wrapped encoder.
        """
        self.encoder = preprocessing.TargetEncoder(**self.te_args)
        return self.encoder.fit_transform(X, y)

    def transform(self, X):
        """Encode ``X`` with the already fitted encoder."""
        return self.encoder.transform(X)


# Candidate preprocessors compared in the grid below
encoders = [
    preprocessing.OneHotEncoder(handle_unknown="ignore", sparse_output=False),
    preprocessing.OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=999),
    TargetEncoderWrapper(target_type="continuous"),
]
imputers = [impute.SimpleImputer(strategy="median"), impute.KNNImputer()]
scalers = [
    preprocessing.StandardScaler(),
    preprocessing.MinMaxScaler(),
    preprocessing.RobustScaler(),
]

# Column names per dtype family
NUMERICAL_FEATURES = list(X.select_dtypes("number").columns)
CATEGORICAL_FEATURES = list(X.select_dtypes("object").columns)

results = []

# Score every (imputer, encoder, scaler) combination with 10-fold cross-validation
for imputer, encoder, scaler in itertools.product(imputers, encoders, scalers):
    # Numerical branch: impute then scale; categorical branch: encode then impute
    numeric_transformer = pipeline.make_pipeline(imputer, scaler)
    categorical_transformer = pipeline.make_pipeline(encoder, imputer)

    # Route each dtype family through its own branch
    preprocessor = compose.make_column_transformer(
        (numeric_transformer, NUMERICAL_FEATURES),
        (categorical_transformer, CATEGORICAL_FEATURES),
    )
    model_pipeline = pipeline.make_pipeline(
        preprocessor, catboost.CatBoostRegressor(iterations=100, silent=True)
    )
    scores = model_selection.cross_val_score(
        estimator=model_pipeline,
        X=X,
        y=y,
        scoring="neg_root_mean_squared_error",
        cv=10,
    )

    result = dict(
        imputer=str(imputer),
        encoder=str(encoder),
        scaler=str(scaler),
        scores=scores.tolist(),
        mean_score=np.mean(scores),
        std_score=np.std(scores),
    )
    results.append(result)

pd.DataFrame(results).sort_values(by="mean_score", ascending=False)

**Best performances :**

| List of preprocessors  | AVG RMSE on Validation set | STD of RMSEs on Validation set |
| :---------------- | :------: | :----: |
| SimpleImputer, OneHotEncoder, MinMaxScaler | 0.099271 | 0.004467  |
| SimpleImputer, OneHotEncoder, StandardScaler |  0.099285	 | 0.004499  |
| SimpleImputer, TargetEncoderWrapper, MinMaxScaler | 0.099382 |  0.003905 |


Based on this experiment we can use the `SimpleImputer` with median strategy and `MinMaxScaler` for *numerical features* and `OneHotEncoder` (handle_unknown set to "ignore" and sparse_output to False) and `SimpleImputer` with median strategy for *non-numerical columns*.

# Feature engineering
## Utilize categorical columns for grouping and transform each numerical variable based on the mean

In [None]:
def FE_categorical_transform(
    X: pd.DataFrame, y: pd.Series, transform_type: str = "mean"
) -> pd.DataFrame:
    """
    Score group-aggregate features built from every (categorical, numerical) column pair of X.

    For each object-dtype column and each numeric column of X, a ``new_column`` holding the
    per-group ``transform_type`` of the numeric column is appended to a copy of X. A
    preprocessing + CatBoostRegressor pipeline is then evaluated on that augmented frame with
    10-fold cross-validation (scoring: negated RMSE).

    Note: the group aggregate is computed on the full frame before cross-validation, so the
    new feature of a training row also reflects rows of the corresponding validation split.

    Parameters:
    X (pd.DataFrame): The input DataFrame containing the features.
    y (pd.Series): The target variable series.
    transform_type (str, optional): The aggregation applied to each group within the categorical column.
                                    It can be one of the following: 'mean', 'sum', 'min', 'max'. Default is 'mean'.

    Raises:
    ValueError: If the transform_type is not one of the following: 'mean', 'sum', 'min', 'max'.

    Returns:
    pd.DataFrame: One row per (categorical_feature, numerical_feature) pair with the mean and the
                  standard deviation of the out-of-fold scores, sorted by mean score in descending
                  order (scores are negated RMSE, so the best pair comes first).
    """
    if transform_type not in ["mean", "sum", "min", "max"]:
        raise ValueError(f"Invalid transform_type: {transform_type}")

    # Derive the column lists from the X argument instead of notebook-level globals,
    # so the function works on whatever frame it is handed.
    categorical_features = X.select_dtypes("object").columns.tolist()
    numerical_features = X.select_dtypes("number").columns.tolist()

    results = []
    for categorical in tqdm(categorical_features, desc="Progress"):
        for numerical in numerical_features:
            # `assign` returns a new frame, so the caller's X is never modified
            temp = X.assign(
                new_column=X.groupby(categorical)[numerical].transform(transform_type)
            )

            new_numerical_features = temp.select_dtypes("number").columns.tolist()
            new_categorical_features = temp.select_dtypes("object").columns.tolist()

            # Define transformers for preprocessing
            numeric_transformer = pipeline.make_pipeline(
                impute.SimpleImputer(strategy="median"), preprocessing.MinMaxScaler()
            )

            categorical_transformer = pipeline.make_pipeline(
                preprocessing.OneHotEncoder(
                    handle_unknown="ignore", sparse_output=False
                ),
                impute.SimpleImputer(strategy="median"),
            )

            # Create a ColumnTransformer to handle both numerical and categorical features
            preprocessor = compose.make_column_transformer(
                (numeric_transformer, new_numerical_features),
                (categorical_transformer, new_categorical_features),
            ).set_output(transform="pandas")

            model_pipeline = pipeline.make_pipeline(
                preprocessor, catboost.CatBoostRegressor(iterations=100, silent=True)
            )

            scores = model_selection.cross_val_score(
                estimator=model_pipeline,
                X=temp,
                y=y,
                scoring="neg_root_mean_squared_error",
                cv=10,
            )

            results.append((categorical, numerical, np.mean(scores), np.std(scores)))

    # Create a DataFrame from the results and sort it by mean OOF scores
    result_df = pd.DataFrame(
        results,
        columns=["categorical_feature", "numerical_feature", "mean_OOFs", "std_OOFs"],
    )
    return result_df.sort_values(by="mean_OOFs", ascending=False)

In [None]:
%%script echo skipping
# The `%%script echo skipping` magic above hands this cell to `echo`, so the code below is
# NOT executed (presumably because the search is slow). Remove the first line to re-run it;
# the recorded results are in the markdown table that follows.

df_FE_categorical_transform = FE_categorical_transform(X, y)
df_FE_categorical_transform

**Best performances :**

| categorical_feature  | numerical_feature | mean_OOFs | std_OOFs|
| :---------------- | :------: | :----: | :----: |
| province | living_area | -0.099005  | 0.004469 |
| heating_type | living_area | -0.099054  | 0.003247 |
| building_condition | primary_energy_consumption | -0.099086  |0.003556 |


The best result was obtained by taking the `province` feature as categorical variable and calculating the mean of `living_area`. The resulting OOF RMSE is *0.099005* which is slightly better than our base model (*0.099271*).


## Generate bins from the continuous variables


In [None]:
def FE_continuous_transform(
    X: pd.DataFrame, y: pd.Series, transform_type: str = "mean"
) -> pd.DataFrame:
    """
    Score group-aggregate features where the grouping key is a binned continuous column.

    For every ordered pair of distinct columns from a fixed list of continuous columns, the first
    column is median-imputed and discretized into ordinal bins, and a ``new_column`` holding the
    per-bin ``transform_type`` of the second column is appended. A preprocessing + CatBoostRegressor
    pipeline is then evaluated on that frame with 10-fold cross-validation (scoring: negated RMSE).

    Note: in the frame passed to the model the discretized column REPLACES the original continuous
    column, so the model sees the bin index instead of the raw value of that column.

    Parameters:
    X (pd.DataFrame): The input DataFrame containing the features. It must contain the columns
                      'zip_code', 'primary_energy_consumption', 'living_area',
                      'surface_of_the_plot' and 'construction_year'.
    y (pd.Series): The target variable series.
    transform_type (str, optional): The aggregation applied to each group within the discretized continuous column.
                                    It can be one of the following: 'mean', 'sum', 'min', 'max'. Default is 'mean'.

    Raises:
    ValueError: If the transform_type is not one of the following: 'mean', 'sum', 'min', 'max'.

    Returns:
    pd.DataFrame: One row per (discretized_continuous, transformed_continuous) pair with the mean and
                  the standard deviation of the out-of-fold scores, sorted by mean score in descending
                  order (scores are negated RMSE, so the best pair comes first).
    """

    if transform_type not in ["mean", "sum", "min", "max"]:
        raise ValueError(f"Invalid transform_type: {transform_type}")

    results = []

    # Continuous columns that are combined pairwise
    continuous_columns = [
        "zip_code",
        "primary_energy_consumption",
        "living_area",
        "surface_of_the_plot",
        "construction_year",
    ]
    # Number of bins: floor(log2(n_rows)) + 1
    optimal_bins = int(np.floor(np.log2(X.shape[0])) + 1)

    # The progress bar tracks the outer loop (one tick per discretized column)
    for discretized_continuous in tqdm(continuous_columns, desc="Progress:"):
        # The binning depends only on the outer column, so it is fitted once here
        # instead of once per inner iteration.
        discretizer = pipeline.make_pipeline(
            impute.SimpleImputer(strategy="median"),
            preprocessing.KBinsDiscretizer(encode="ordinal", n_bins=optimal_bins),
        )
        binned = X.copy(deep=True)
        # fit_transform returns a single-column 2D array; flatten it for the assignment
        binned[discretized_continuous] = discretizer.fit_transform(
            X[[discretized_continuous]]
        ).ravel()

        for transformed_continuous in continuous_columns:
            if discretized_continuous == transformed_continuous:
                continue

            # Aggregate the second column within each bin of the first one
            temp = binned.assign(
                new_column=binned.groupby(discretized_continuous)[
                    transformed_continuous
                ].transform(transform_type)
            )

            new_numerical_features = temp.select_dtypes("number").columns.tolist()
            new_categorical_features = temp.select_dtypes("object").columns.tolist()

            # Define transformers for preprocessing
            numeric_transformer = pipeline.make_pipeline(
                impute.SimpleImputer(strategy="median"),
                preprocessing.MinMaxScaler(),
            )

            categorical_transformer = pipeline.make_pipeline(
                preprocessing.OneHotEncoder(
                    handle_unknown="ignore", sparse_output=False
                ),
                impute.SimpleImputer(strategy="median"),
            )

            # Create a ColumnTransformer to handle both numerical and categorical features
            preprocessor = compose.make_column_transformer(
                (numeric_transformer, new_numerical_features),
                (categorical_transformer, new_categorical_features),
            ).set_output(transform="pandas")

            model_pipeline = pipeline.make_pipeline(
                preprocessor,
                catboost.CatBoostRegressor(iterations=100, silent=True),
            )

            scores = model_selection.cross_val_score(
                estimator=model_pipeline,
                X=temp,
                y=y,
                scoring="neg_root_mean_squared_error",
                cv=10,
            )

            results.append(
                (
                    discretized_continuous,
                    transformed_continuous,
                    np.mean(scores),
                    np.std(scores),
                )
            )

    # Create a DataFrame from the results and sort it by mean OOF scores
    result_df = pd.DataFrame(
        results,
        columns=[
            "discretized_continuous",
            "transformed_continuous",
            "mean_OOFs",
            "std_OOFs",
        ],
    )
    return result_df.sort_values(by="mean_OOFs", ascending=False)

In [None]:
%%script echo skipping
# The `%%script echo skipping` magic above hands this cell to `echo`, so the code below is
# NOT executed (presumably because the search is slow). Remove the first line to re-run it;
# the recorded results are in the markdown table that follows.

df_FE_continuous_transform = FE_continuous_transform(X, y)
df_FE_continuous_transform

**Best performances :**

| discretized_continuous  | transformed_continuous | mean_OOFs | std_OOFs|
| :---------------- | :------: | :----: | :----: |
| primary_energy_consumption | zip_code | -0.099286	  |0.0034879 |
| primary_energy_consumption | living_area | -0.099351  | 0.003779 |
| primary_energy_consumption | surface_of_the_plot | -0.099562  |0.003809 |


The best result was obtained by taking the `primary_energy_consumption` feature as the discretized continuous variable and calculating the mean of `zip_code`. The resulting OOF RMSE is *0.099286*, which is slightly worse than our base model (*0.099271*).


## Introduce polynomial features


In [None]:
def FE_polynomial_features(
    X: pd.DataFrame, y: pd.Series, combinations: int = 1
) -> pd.DataFrame:
    """Score polynomial expansions of every size-``combinations`` group of numeric columns.

    For each group, the selected columns are median-imputed and expanded with
    ``PolynomialFeatures`` (no bias column, default degree) while all other columns are
    passed through. A preprocessing + CatBoostRegressor pipeline is then evaluated on the
    expanded frame with 10-fold cross-validation (scoring: negated RMSE).

    Args:
        X (pd.DataFrame): The input feature matrix.
        y (pd.Series): The target variable.
        combinations (int, optional): Number of numeric columns expanded together. Default is 1.

    Returns:
        pd.DataFrame: One row per column group (``numerical_col`` holds the tuple of column
        names) with the mean and standard deviation of the out-of-fold scores, sorted by
        mean score in descending order.
    """
    # Every group of numeric columns that gets its own experiment
    column_groups = list(
        itertools.combinations(X.select_dtypes("number").columns, r=combinations)
    )

    rows = []
    for numerical_col in tqdm(column_groups):
        # Expand only the selected columns; everything else is passed through untouched
        expansion_branch = pipeline.make_pipeline(
            impute.SimpleImputer(strategy="median"),
            preprocessing.PolynomialFeatures(
                interaction_only=False, include_bias=False
            ),
        )
        polyfeatures = compose.make_column_transformer(
            (expansion_branch, list(numerical_col)),
            remainder="passthrough",
        ).set_output(transform="pandas")
        expanded = polyfeatures.fit_transform(X)

        numeric_names = expanded.select_dtypes("number").columns.tolist()
        categorical_names = expanded.select_dtypes("object").columns.tolist()

        # Standard preprocessing: impute + scale numbers, one-hot encode + impute categories
        numeric_branch = pipeline.make_pipeline(
            impute.SimpleImputer(strategy="median"), preprocessing.MinMaxScaler()
        )
        categorical_branch = pipeline.make_pipeline(
            preprocessing.OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            impute.SimpleImputer(strategy="median"),
        )
        preprocessor = compose.make_column_transformer(
            (numeric_branch, numeric_names),
            (categorical_branch, categorical_names),
        ).set_output(transform="pandas")

        model_pipeline = pipeline.make_pipeline(
            preprocessor, catboost.CatBoostRegressor(iterations=100, silent=True)
        )

        fold_scores = model_selection.cross_val_score(
            estimator=model_pipeline,
            X=expanded,
            y=y,
            scoring="neg_root_mean_squared_error",
            cv=10,
        )

        rows.append((numerical_col, np.mean(fold_scores), np.std(fold_scores)))

    summary = pd.DataFrame(rows, columns=["numerical_col", "mean_OOFs", "std_OOFs"])
    return summary.sort_values(by="mean_OOFs", ascending=False)

In [None]:
%%script echo skipping
# The `%%script echo skipping` magic above hands this cell to `echo`, so the call below is
# NOT executed (presumably because the search is slow). Remove the first line to re-run it;
# the recorded results are in the markdown table that follows.
FE_polynomial_features(X,y)

**Best performances :**

| numerical_col  | mean_OOFs | std_OOFs |
| :---------------- | :------: | :----: |
| zip_code | -0.098392	 | 0.004372	  |
| bedrooms | -0.098816	 | 0.003782 |
| primary_energy_consumption | -0.098900| 0.004057 |


The best result was obtained by taking the polynomial feature of the `zip_code`. The resulting OOF RMSE is *0.098392* which is slightly better than our base model (*0.099271*).

## Implement other ideas derived from empirical observations or assumptions

In [None]:
def FE_ideas(X):
    """Builds the hand-crafted ratio and sum features from the input DataFrame.

    The ratios can divide by zero, which pandas turns into +/-inf. Those values
    are replaced by NaN so that downstream imputers (which reject infinities)
    can handle them like any other missing value.

    Args:
        X (pd.DataFrame): The input DataFrame. It must contain the columns
            ``primary_energy_consumption``, ``living_area``, ``bathrooms``,
            ``bedrooms`` and ``surface_of_the_plot``.

    Returns:
        pd.DataFrame: A DataFrame sharing the index of ``X`` that contains only
        the engineered columns ``energy_efficiency``, ``total_rooms``,
        ``bedroom_to_bathroom``, ``area_per_room`` and ``plot_to_livings_area``
        (in that order). ``X`` itself is not modified.

    Example:
        >>> engineered_data = FE_ideas(original_data)
    """
    # Build the output from explicitly named columns instead of slicing by position,
    # so the result does not depend on the column order of X.
    engineered = pd.DataFrame(
        {
            "energy_efficiency": X["primary_energy_consumption"] / X["living_area"],
            "total_rooms": X["bathrooms"] + X["bedrooms"],
            "bedroom_to_bathroom": X["bedrooms"] / X["bathrooms"],
            "area_per_room": X["living_area"] / X["bedrooms"],
            "plot_to_livings_area": X["surface_of_the_plot"] / X["living_area"],
        },
        index=X.index,
    )
    # x / 0 yields +/-inf; treat it as missing
    return engineered.replace([np.inf, -np.inf], np.nan)


# Preview the engineered features for the full feature matrix
FE_ideas(X)

In [None]:
def FE_try_ideas(
    X: pd.DataFrame,
    y: pd.Series,
) -> pd.DataFrame:
    """Evaluates each feature produced by ``FE_ideas`` when added on its own to ``X``.

    For every engineered column, the column is appended to ``X`` and a
    preprocessing + CatBoostRegressor pipeline is scored with 10-fold
    cross-validation (scoring: negated RMSE).

    Args:
        X (pd.DataFrame): The input feature matrix.
        y (pd.Series): The target variable.

    Returns:
        pd.DataFrame: One row per engineered feature with the mean and standard
        deviation of the out-of-fold scores, sorted by mean score in descending order.

    Example:
        >>> results_df = FE_try_ideas(X, y)
    """
    # Candidate features, one experiment per column
    candidates = FE_ideas(X)

    rows = []
    for feature in tqdm(candidates.columns):
        # Original features plus exactly one engineered column
        augmented = pd.concat([X, candidates[feature]], axis=1)

        numeric_names = augmented.select_dtypes("number").columns.tolist()
        categorical_names = augmented.select_dtypes("object").columns.tolist()

        # Standard preprocessing: impute + scale numbers, one-hot encode + impute categories
        numeric_branch = pipeline.make_pipeline(
            impute.SimpleImputer(strategy="median"), preprocessing.MinMaxScaler()
        )
        categorical_branch = pipeline.make_pipeline(
            preprocessing.OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            impute.SimpleImputer(strategy="median"),
        )
        preprocessor = compose.make_column_transformer(
            (numeric_branch, numeric_names),
            (categorical_branch, categorical_names),
        ).set_output(transform="pandas")

        model_pipeline = pipeline.make_pipeline(
            preprocessor, catboost.CatBoostRegressor(iterations=100, silent=True)
        )

        fold_scores = model_selection.cross_val_score(
            estimator=model_pipeline,
            X=augmented,
            y=y,
            scoring="neg_root_mean_squared_error",
            cv=10,
        )

        rows.append((feature, np.mean(fold_scores), np.std(fold_scores)))

    summary = pd.DataFrame(rows, columns=["feature", "mean_OOFs", "std_OOFs"])
    return summary.sort_values(by="mean_OOFs", ascending=False)

In [None]:
%%script echo skipping
# The `%%script echo skipping` magic above hands this cell to `echo`, so the code below is
# NOT executed (presumably because the search is slow). Remove the first line to re-run it;
# the recorded results are in the markdown table that follows.

df_FE_try_ideas = FE_try_ideas(X,y)
df_FE_try_ideas

**Best performances :**

| feature  | mean_OOFs | std_OOFs |
| :---------------- | :------: | :----: |
| energy_efficiency | -0.099122	 | 0.004271|
| area_per_room | -0.099712	 | 0.003463 |
| plot_to_livings_area | -0.099972| 0.004570 |


The best result was obtained by incorporating the `energy_efficiency` feature. The resulting OOF RMSE is *0.099122*, which is slightly better than our base model (*0.099271*).

## Summarize the feature engineering

In [None]:
# Best mean OOF RMSE recorded for each feature-engineering approach (lower is better)
pd.DataFrame(
    [
        ("Utilize categorical columns for grouping", 0.099005),
        ("Generate bins from the continuous variables", 0.099286),
        ("Introduce polynomial features", 0.098392),
        ("Empirical observations", 0.099122),
        ("Original", 0.099271),
    ],
    columns=["condition", "mean_OOFs"],
).sort_values(by="mean_OOFs")

As you can see, with the exception of `Generate bins from the continuous variables`, the generated features achieved better average validation RMSE values than the original setup, where no feature engineering was applied.

## Final feature selection 

In [None]:
class FeatureAdder(BaseEstimator, TransformerMixin):
    """Transformer that derives four engineered numeric features.

    Output columns (in this order):
        energy_efficiency: ``primary_energy_consumption / living_area`` of the row.
        tr_prov_liv: mean ``living_area`` of the row's ``province``.
        tr_heat_liv: mean ``living_area`` of the row's ``heating_type``.
        tr_buil_energy: mean ``primary_energy_consumption`` of the row's
            ``building_condition``.

    The group means are learned in ``fit`` and looked up in ``transform``, so
    validation/test rows are encoded with the statistics of the training data
    rather than with statistics of the frame being transformed. Categories that
    were not seen during ``fit`` (and missing categories) yield NaN.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Learn the per-category means used by ``transform``.

        Args:
            X (pd.DataFrame): Frame containing ``province``, ``heating_type``,
                ``building_condition``, ``living_area`` and
                ``primary_energy_consumption``.
            y: Ignored; present for scikit-learn API compatibility.

        Returns:
            FeatureAdder: ``self``.
        """
        # output column -> (grouping column, Series of means indexed by category)
        self.group_means_ = {
            "tr_prov_liv": (
                "province",
                X.groupby("province")["living_area"].mean(),
            ),
            "tr_heat_liv": (
                "heating_type",
                X.groupby("heating_type")["living_area"].mean(),
            ),
            "tr_buil_energy": (
                "building_condition",
                X.groupby("building_condition")["primary_energy_consumption"].mean(),
            ),
        }
        return self

    def transform(self, X, y=None):
        """Return the engineered features for ``X``.

        Args:
            X (pd.DataFrame): Frame with the same columns as required by ``fit``.
            y: Ignored; present for scikit-learn API compatibility.

        Returns:
            pd.DataFrame: The four engineered columns, indexed like ``X``.
        """
        features = {
            "energy_efficiency": X["primary_energy_consumption"] / X["living_area"]
        }
        for feature_name, (group_column, means) in self.group_means_.items():
            # Look up the mean learned during fit for each row's category
            features[feature_name] = X[group_column].map(means)
        return pd.DataFrame(features, index=X.index)

    def get_feature_names_out(self, input_features=None):
        """Return the names of the columns produced by ``transform``.

        Defining this method is also what makes scikit-learn's ``set_output``
        API available on the transformer.

        Args:
            input_features: Ignored; present for scikit-learn API compatibility.

        Returns:
            np.ndarray: The four output column names.
        """
        return np.array(
            ["energy_efficiency", "tr_prov_liv", "tr_heat_liv", "tr_buil_energy"],
            dtype=object,
        )

In [None]:
# Columns routed through the polynomial branch; they are excluded from the plain numeric branch
poly_columns = ["zip_code", "bedrooms", "primary_energy_consumption"]
# Columns the FeatureAdder branch receives
feature_adder_columns = [
    "primary_energy_consumption",
    "living_area",
    "province",
    "heating_type",
    "building_condition",
]

new_numerical_features = [
    column
    for column in X.select_dtypes("number").columns
    if column not in poly_columns
]
new_categorical_features = list(X.select_dtypes("object").columns)

# Branch 1: impute -> polynomial expansion -> scale
polyfeatures = pipeline.make_pipeline(
    impute.SimpleImputer(strategy="median"),
    preprocessing.PolynomialFeatures(interaction_only=False, include_bias=False),
    preprocessing.MinMaxScaler(),
)

# Branch 2: engineered features -> impute -> scale
feature_adder = pipeline.make_pipeline(
    FeatureAdder(),
    impute.SimpleImputer(strategy="median"),
    preprocessing.MinMaxScaler(),
)

# Branch 3: remaining numeric columns, impute -> scale
numeric_transformer = pipeline.make_pipeline(
    impute.SimpleImputer(strategy="median"), preprocessing.MinMaxScaler()
)

# Branch 4: categorical columns, one-hot encode -> impute
categorical_transformer = pipeline.make_pipeline(
    preprocessing.OneHotEncoder(handle_unknown="ignore", sparse_output=False),
    impute.SimpleImputer(strategy="median"),
)

# Combine the four branches; the result is returned as a pandas DataFrame
preprocessor = compose.make_column_transformer(
    (polyfeatures, poly_columns),
    (feature_adder, feature_adder_columns),
    (numeric_transformer, new_numerical_features),
    (categorical_transformer, new_categorical_features),
).set_output(transform="pandas")

# model_pipeline = pipeline.make_pipeline(
#     preprocessor, catboost.CatBoostRegressor(iterations=100, silent=True)
# )

# scores = model_selection.cross_val_score(
#     estimator=model_pipeline,
#     X=X,
#     y=y,
#     scoring="neg_root_mean_squared_error",
#     cv=10,
# )

In [None]:
# Fit the combined preprocessor on the full feature matrix and display the transformed frame
preprocessor.fit_transform(X)

In [None]:
results = []
# NOTE(review): the upper bound 55 is hard-coded; presumably it is the number of columns
# produced by `preprocessor` plus one — confirm if the feature set changes.
for n_features in tqdm(range(2, 55)):
    model_pipeline = pipeline.make_pipeline(
        preprocessor,
        feature_selection.SelectFromModel(
            catboost.CatBoostRegressor(iterations=100, silent=True),
            # Disable the importance threshold (it otherwise defaults to the mean
            # importance for this estimator), so that exactly the `n_features` most
            # important columns are kept instead of at most `n_features`.
            threshold=-np.inf,
            max_features=n_features,
        ),
        catboost.CatBoostRegressor(iterations=100, silent=True),
    )

    # Fit once on the full data only to read the names of the selected features
    model_pipeline.fit(X, y)
    selected_names = model_pipeline.named_steps[
        "selectfrommodel"
    ].get_feature_names_out()

    # cross_val_score clones and refits the pipeline on every fold
    scores = model_selection.cross_val_score(
        estimator=model_pipeline,
        X=X,
        y=y,
        scoring="neg_root_mean_squared_error",
        cv=10,
    )
    result = (n_features, selected_names, np.mean(scores), np.std(scores))
    results.append(result)

In [None]:
# One row per n_features: the selected column names plus mean/std of the cross-validation scores
pd.DataFrame(results, columns=["n_features", "selected_names", "mean_OOF", "STD_OOF"])