In [None]:
import itertools

import catboost
import numpy as np
import pandas as pd
from sklearn import (compose, feature_selection, impute, metrics,
                     model_selection, pipeline, preprocessing)
from sklearn.base import BaseEstimator, TransformerMixin
from tqdm import tqdm

import creds
import data

# Retrieve and prepare data

In [None]:
# Fetch the listings of a single retrieval day. The two leading string
# arguments are presumably the database and collection names -- confirm
# against data.retrieve_data_from_MongoDB.
retrieval_query = {"day_of_retrieval": "2024-02-09"}
df = data.retrieve_data_from_MongoDB("development", "BE_houses", retrieval_query)

In [None]:
# Split the raw frame into the feature matrix and the target, then report
# both shapes as a quick sanity check.
features_and_target = data.preprocess_and_split_data(df)
X, y = features_and_target

print(X.shape, y.shape)

In [None]:
# Hold out 20 % of the rows for testing. The split is seeded so that a
# "Restart Kernel -> Run All" reproduces the same train/test partition, and
# therefore comparable cross-validation scores, on every run.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

print(f"Shape of X_train : {X_train.shape}, X_test : {X_test.shape}")

In [None]:
# Number of missing values per training column, least affected columns first.
X_train.isna().sum().sort_values()

In [None]:
# Partition the training columns by dtype: numeric columns versus
# object-typed columns.
NUMERICAL_FEATURES = list(X_train.select_dtypes(include="number").columns)
CATEGORICAL_FEATURES = list(X_train.select_dtypes(include="object").columns)

for feature_group in (NUMERICAL_FEATURES, CATEGORICAL_FEATURES):
    print(feature_group)

In [None]:
# Report how many distinct values each categorical training column holds.
print("Unique values in categorical columns:")
unique_counts = X_train[CATEGORICAL_FEATURES].nunique()
for column, n_unique in unique_counts.items():
    print(f"{column} : {n_unique}")

# Define a basic pipeline to use for feature engineering

In [None]:
def create_pipeline(
    numerical_features, categorical_features, additional_transformers=None
):
    """Build the baseline preprocessing + CatBoost regression pipeline.

    Args:
        numerical_features: Columns that are median-imputed and then
            standardized.
        categorical_features: Columns that are ordinal-encoded (categories
            unseen during fit are encoded as 999) and then median-imputed.
        additional_transformers: Optional iterable of
            ``(transformer, columns)`` pairs appended to the column
            transformer after the two default entries.

    Returns:
        An unfitted scikit-learn pipeline: a column transformer configured
        for pandas output followed by a ``CatBoostRegressor``.
    """
    scale_numeric = pipeline.make_pipeline(
        impute.SimpleImputer(strategy="median"),
        preprocessing.StandardScaler(),
    )
    encode_categorical = pipeline.make_pipeline(
        preprocessing.OrdinalEncoder(
            handle_unknown="use_encoded_value", unknown_value=999
        ),
        impute.SimpleImputer(strategy="median"),
    )

    # The default entries come first; caller-supplied ones are appended.
    column_steps = [
        (scale_numeric, numerical_features),
        (encode_categorical, categorical_features),
    ]
    if additional_transformers is not None:
        column_steps += list(additional_transformers)

    preprocessor = compose.make_column_transformer(*column_steps)
    preprocessor = preprocessor.set_output(transform="pandas")

    regressor = catboost.CatBoostRegressor(
        iterations=100,
        eval_fraction=0.2,
        early_stopping_rounds=20,
        silent=True,
        use_best_model=True,
    )
    return pipeline.make_pipeline(preprocessor, regressor)


# Baseline pipeline without any additional transformer; being the last
# expression of the cell, it is shown as the cell output.
create_pipeline(NUMERICAL_FEATURES, CATEGORICAL_FEATURES)

# Feature engineering
## Utilize categorical columns for grouping and transform each numerical variable based on the mean

In [None]:
class CategoricalColumnTransformer(BaseEstimator, TransformerMixin):
    """Encode a categorical column with a per-category statistic of a numerical column.

    ``fit`` aggregates ``numerical_feature`` within every category of
    ``categorical_feature``; ``transform`` maps each row's category to the
    statistic learned during ``fit``.

    Args:
        categorical_feature: Name of the grouping column.
        numerical_feature: Name of the column that is aggregated per group.
        transform_type: Aggregation passed to pandas ``agg`` (e.g. ``"mean"``).

    Attributes:
        transform_values_: Series indexed by category holding the learned
            statistic (set by ``fit``).
    """

    def __init__(self, categorical_feature, numerical_feature, transform_type):
        self.categorical_feature = categorical_feature
        self.numerical_feature = numerical_feature
        self.transform_type = transform_type

    def fit(self, X, y=None):
        """Learn the per-category statistic from ``X``; ``y`` is ignored."""
        self.transform_values_ = X.groupby(self.categorical_feature)[
            self.numerical_feature
        ].agg(self.transform_type)
        return self

    def transform(self, X, y=None):
        """Return a one-column frame ``new_feature`` aligned with ``X``'s index.

        Categories that were not present during ``fit`` have no learned
        statistic and therefore map to NaN.
        """
        new_feature = X[self.categorical_feature].map(self.transform_values_)
        return new_feature.to_frame(name="new_feature")

    def get_feature_names_out(self, input_features=None):
        """Return the name of the single output column.

        ``input_features`` is accepted for scikit-learn API compatibility
        (callers such as ``ColumnTransformer`` pass it) and is ignored because
        the output name does not depend on the input names.
        """
        return np.asarray(["new_feature"], dtype=object)

In [None]:
# Score every (categorical, numerical) pairing: the pairing's group-mean
# feature is added to the baseline pipeline, which is then evaluated with
# 10-fold cross-validation on the training split. The progress bar advances
# once per categorical feature.
results = []
for categorical_feature in tqdm(CATEGORICAL_FEATURES, desc="Progress"):
    for numerical_feature in NUMERICAL_FEATURES:
        group_mean_encoder = CategoricalColumnTransformer(
            categorical_feature=categorical_feature,
            numerical_feature=numerical_feature,
            transform_type="mean",
        )
        model_pipeline = create_pipeline(
            numerical_features=NUMERICAL_FEATURES,
            categorical_features=CATEGORICAL_FEATURES,
            additional_transformers=[
                (group_mean_encoder, [categorical_feature, numerical_feature])
            ],
        )

        scores = model_selection.cross_validate(
            estimator=model_pipeline,
            X=X_train,
            y=y_train,
            scoring=("r2", "neg_root_mean_squared_error"),
            cv=10,
        )

        # One record per pairing: names first, then the fold-averaged metrics.
        results.append(
            (
                categorical_feature,
                numerical_feature,
                scores["test_neg_root_mean_squared_error"].mean(),
                scores["test_r2"].mean(),
            )
        )

In [None]:
# Name the columns instead of sorting by the positional label `2`, then show
# the five pairings with the highest mean negated RMSE (closest to zero).
pd.DataFrame(
    results,
    columns=["categorical_feature", "numerical_feature", "mean_neg_rmse", "mean_r2"],
).sort_values(by="mean_neg_rmse", ascending=False).head()

**Best performances :**

| categorical_feature  | numerical_feature | mean_OOFs |
| :---------------- | :------: | :----: |
| energy_class | zip_code | 0.09337	  | 
| building_condition | bedrooms | 0.093277  | 
| building_condition | living_area | 0.093333	|


The best result was obtained by taking the `heating_type` feature as categorical variable and calculating the mean of `bedrooms`. The resulting OOF RMSE is *0.09305* which is slightly better than our base model (*0.09326*).

## Generate bins from the continuous variables


In [None]:
class ContinuousColumnTransformer(BaseEstimator, TransformerMixin):
    """Encode quantile bins of one continuous column with a statistic of another.

    ``fit`` derives quantile bin edges for ``continuous_feature_to_bin`` and
    aggregates ``continuous_feature_to_transfer`` within each bin;
    ``transform`` assigns every row to a fitted bin and returns that bin's
    learned statistic.

    Args:
        continuous_feature_to_bin: Name of the column that is discretized.
        continuous_feature_to_transfer: Name of the column aggregated per bin.
        transform_type: Aggregation passed to pandas ``agg`` (e.g. ``"mean"``).
        n_bins: Requested number of quantile bins; fewer bins result when
            duplicate edges are dropped.

    Attributes:
        bin_edges_: Bin edges learned by ``fit``.
        transform_values_: Series indexed by bin number holding the learned
            statistic.
    """

    def __init__(
        self,
        continuous_feature_to_bin,
        continuous_feature_to_transfer,
        transform_type,
        n_bins,
    ):
        self.continuous_feature_to_bin = continuous_feature_to_bin
        self.continuous_feature_to_transfer = continuous_feature_to_transfer
        self.transform_type = transform_type
        self.n_bins = n_bins

    def _assign_bins(self, X):
        """Return the fitted bin number of every row of the binned column.

        ``include_lowest=True`` closes the first interval on the left so that
        rows equal to the lowest edge (the training minimum) land in the first
        bin instead of becoming NaN. Values outside the fitted range still
        yield NaN.
        """
        return pd.cut(
            X[self.continuous_feature_to_bin],
            bins=self.bin_edges_,
            labels=False,
            include_lowest=True,
        )

    def fit(self, X, y=None):
        """Learn the bin edges and the per-bin statistic from ``X``; ``y`` is ignored."""
        # Quantile-based edges; identical edges are collapsed by duplicates="drop".
        _, self.bin_edges_ = pd.qcut(
            x=X[self.continuous_feature_to_bin],
            q=self.n_bins,
            retbins=True,
            duplicates="drop",
        )

        self.transform_values_ = (
            X.assign(binned_continuous_feature=self._assign_bins(X))
            .groupby("binned_continuous_feature")[self.continuous_feature_to_transfer]
            .agg(self.transform_type)
        )
        return self

    def transform(self, X, y=None):
        """Return a one-column frame ``new_feature`` aligned with ``X``'s index.

        Rows that fall in no fitted bin (missing or out-of-range values) get NaN.
        """
        new_feature = self._assign_bins(X).map(self.transform_values_)
        return new_feature.to_frame(name="new_feature")

    def get_feature_names_out(self, input_features=None):
        """Return the name of the single output column.

        ``input_features`` is accepted for scikit-learn API compatibility and
        is ignored because the output name does not depend on the input names.
        """
        return np.asarray(["new_feature"], dtype=object)

In [None]:
# Bin count derived from the number of training rows: floor(log2(n)) + 1.
optimal_bins = int(np.floor(np.log2(X_train.shape[0])) + 1)
results = []
# Every ordered pair of distinct numerical features is scored; the progress
# bar advances once per discretized (outer-loop) feature.
for discretized_continuous in tqdm(NUMERICAL_FEATURES, desc="Progress:"):
    for transformed_continuous in NUMERICAL_FEATURES:
        # A feature is never combined with itself.
        if discretized_continuous == transformed_continuous:
            continue

        bin_encoder = ContinuousColumnTransformer(
            continuous_feature_to_bin=discretized_continuous,
            continuous_feature_to_transfer=transformed_continuous,
            transform_type="mean",
            n_bins=optimal_bins,
        )
        model_pipeline = create_pipeline(
            numerical_features=NUMERICAL_FEATURES,
            categorical_features=CATEGORICAL_FEATURES,
            additional_transformers=[
                (bin_encoder, [discretized_continuous, transformed_continuous])
            ],
        )
        scores = model_selection.cross_validate(
            estimator=model_pipeline,
            X=X_train,
            y=y_train,
            scoring=("r2", "neg_root_mean_squared_error"),
            cv=10,
        )
        results.append(
            (
                discretized_continuous,
                transformed_continuous,
                scores["test_neg_root_mean_squared_error"].mean(),
                scores["test_r2"].mean(),
            )
        )

In [None]:
# Name the columns instead of sorting by the positional label `2`, then show
# the five pairings with the highest mean negated RMSE (closest to zero).
pd.DataFrame(
    results,
    columns=[
        "discretized_continuous",
        "transformed_continuous",
        "mean_neg_rmse",
        "mean_r2",
    ],
).sort_values(by="mean_neg_rmse", ascending=False).head()

**Best performances :**

| discretized_continuous  | transformed_continuous | mean_OOFs |
| :---------------- | :------: | :----: |
| primary_energy_consumption | zip_code | -0.099286	  |
| primary_energy_consumption | living_area | -0.099351  | 
| primary_energy_consumption | surface_of_the_plot | -0.099562  |


The best result was obtained by taking the `primary_energy_consumption` feature as the discretized continuous variable and calculating the mean of `zip_code`. The resulting OOF RMSE is *0.099286*, which is slightly worse than our base model (*0.099271*).


## Introduce polynomial features


In [None]:
def FE_polynomial_features(
    X: pd.DataFrame, y: pd.Series, combinations: int = 1
) -> pd.DataFrame:
    """Cross-validate the effect of polynomial features on numerical columns.

    For every combination of ``combinations`` numerical columns, the selected
    columns are median-imputed and expanded with scikit-learn's
    ``PolynomialFeatures`` (default degree, no bias term) while all other
    columns pass through unchanged. The expanded data is then preprocessed
    and scored with a CatBoost regressor using 10-fold cross-validation.

    The polynomial step is part of the cross-validated pipeline, so its
    imputation medians are learned on the training folds only and nothing
    leaks from the validation folds.

    Args:
        X: Feature matrix with numerical and object-typed columns.
        y: Target values aligned with ``X``.
        combinations: Number of numerical columns expanded together.

    Returns:
        One row per column combination with the mean and standard deviation
        of the negated RMSE over the folds, best (highest) mean first.
    """
    results = []

    numerical_columns = X.select_dtypes("number").columns

    # The progress bar advances once per column combination.
    for numerical_col in tqdm(
        list(itertools.combinations(numerical_columns, r=combinations))
    ):
        polyfeatures = compose.make_column_transformer(
            (
                pipeline.make_pipeline(
                    impute.SimpleImputer(strategy="median"),
                    preprocessing.PolynomialFeatures(
                        interaction_only=False, include_bias=False
                    ),
                ),
                list(numerical_col),
            ),
            remainder="passthrough",
        ).set_output(transform="pandas")

        numeric_transformer = pipeline.make_pipeline(
            impute.SimpleImputer(strategy="median"), preprocessing.MinMaxScaler()
        )

        categorical_transformer = pipeline.make_pipeline(
            preprocessing.OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            impute.SimpleImputer(strategy="median"),
        )

        # The expanded column names only exist once `polyfeatures` has run, so
        # the second stage picks its columns by dtype at fit time.
        preprocessor = compose.make_column_transformer(
            (numeric_transformer, compose.make_column_selector(dtype_include="number")),
            (
                categorical_transformer,
                compose.make_column_selector(dtype_include="object"),
            ),
        ).set_output(transform="pandas")

        model_pipeline = pipeline.make_pipeline(
            polyfeatures,
            preprocessor,
            catboost.CatBoostRegressor(iterations=100, silent=True),
        )

        scores = model_selection.cross_val_score(
            estimator=model_pipeline,
            X=X,
            y=y,
            scoring="neg_root_mean_squared_error",
            cv=10,
        )

        results.append((numerical_col, np.mean(scores), np.std(scores)))

    result_df = pd.DataFrame(
        results, columns=["numerical_col", "mean_OOFs", "std_OOFs"]
    )
    return result_df.sort_values(by="mean_OOFs", ascending=False)

In [None]:
%%script echo skipping
# The cell magic above hands this cell's text to `echo` instead of the Python
# kernel, so the search below is not executed.
FE_polynomial_features(X,y)

**Best performances :**

| numerical_col  | mean_OOFs | std_OOFs |
| :---------------- | :------: | :----: |
| zip_code | -0.098392	 | 0.004372	  |
| bedrooms | -0.098816	 | 0.003782 |
| primary_energy_consumption | -0.098900| 0.004057 |


The best result was obtained by adding the polynomial features of `zip_code`. The resulting OOF RMSE is *0.098392*, which is slightly better than our base model (*0.099271*).

## Implement other ideas derived from empirical observations or assumptions

In [None]:
def FE_ideas(X):
    """Performs additional feature engineering on the input DataFrame.

    The engineered columns are ratios and sums of existing columns. A zero
    denominator would produce +/-inf, which the imputers and scalers used
    downstream reject, so infinite values are replaced with NaN.

    Args:
        X (pd.DataFrame): The input DataFrame containing the original features.
            Must provide ``primary_energy_consumption``, ``living_area``,
            ``bathrooms``, ``bedrooms`` and ``surface_of_the_plot``.

    Returns:
        pd.DataFrame: A DataFrame holding only the engineered features
        (``energy_efficiency``, ``total_rooms``, ``bedroom_to_bathroom``,
        ``area_per_room``, ``plot_to_livings_area``), indexed like ``X``.

    Example:
        >>> engineered_data = FE_ideas(original_data)
    """
    engineered_columns = [
        "energy_efficiency",
        "total_rooms",
        "bedroom_to_bathroom",
        "area_per_room",
        "plot_to_livings_area",
    ]
    temp = X.assign(
        energy_efficiency=lambda df: df.primary_energy_consumption / df.living_area,
        total_rooms=lambda df: df.bathrooms + df.bedrooms,
        bedroom_to_bathroom=lambda df: df.bedrooms / df.bathrooms,
        area_per_room=lambda df: df.living_area / df.bedrooms,
        plot_to_livings_area=lambda df: df.surface_of_the_plot / df.living_area,
    )
    # Select by name rather than by position: a positional slice would return
    # the wrong columns if X already contained one of the engineered names.
    return temp[engineered_columns].replace([np.inf, -np.inf], np.nan)


# Preview the engineered features computed on the full feature matrix.
FE_ideas(X)

In [None]:
def FE_try_ideas(
    X: pd.DataFrame,
    y: pd.Series,
) -> pd.DataFrame:
    """Evaluates each engineered feature from ``FE_ideas`` on its own.

    Every engineered column is appended, one at a time, to the original
    features; the augmented data is preprocessed (median imputation plus
    min-max scaling for numerical columns, one-hot encoding for object
    columns) and scored with a CatBoost regressor in 10-fold cross-validation.

    Args:
        X (pd.DataFrame): The input feature matrix.
        y (pd.Series): The target variable.

    Returns:
        pd.DataFrame: One row per engineered feature with the mean and
        standard deviation of the negated RMSE, highest mean first.

    Example:
        >>> results_df = FE_try_ideas(X, y)
    """
    engineered = FE_ideas(X)
    records = []

    for feature in tqdm(engineered.columns):
        # Original columns plus exactly one engineered column.
        augmented = pd.concat([X, engineered[feature]], axis=1)

        numeric_columns = list(augmented.select_dtypes("number").columns)
        object_columns = list(augmented.select_dtypes("object").columns)

        numeric_branch = pipeline.make_pipeline(
            impute.SimpleImputer(strategy="median"),
            preprocessing.MinMaxScaler(),
        )
        categorical_branch = pipeline.make_pipeline(
            preprocessing.OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            impute.SimpleImputer(strategy="median"),
        )

        preprocessor = compose.make_column_transformer(
            (numeric_branch, numeric_columns),
            (categorical_branch, object_columns),
        )
        preprocessor = preprocessor.set_output(transform="pandas")

        regressor = catboost.CatBoostRegressor(iterations=100, silent=True)
        fold_scores = model_selection.cross_val_score(
            estimator=pipeline.make_pipeline(preprocessor, regressor),
            X=augmented,
            y=y,
            scoring="neg_root_mean_squared_error",
            cv=10,
        )

        records.append((feature, np.mean(fold_scores), np.std(fold_scores)))

    return pd.DataFrame(
        records, columns=["feature", "mean_OOFs", "std_OOFs"]
    ).sort_values(by="mean_OOFs", ascending=False)

In [None]:
%%script echo skipping
# The cell magic above hands this cell's text to `echo` instead of the Python
# kernel, so the experiment below is not executed.

df_FE_try_ideas = FE_try_ideas(X,y)
df_FE_try_ideas

**Best performances :**

| feature  | mean_OOFs | std_OOFs |
| :---------------- | :------: | :----: |
| energy_efficiency | -0.099122	 | 0.004271|
| area_per_room | -0.099712	 | 0.003463 |
| plot_to_livings_area | -0.099972| 0.004570 |


The best result was obtained by incorporating the `energy_efficiency` feature. The resulting OOF RMSE is *0.099122*, which is slightly better than our base model (*0.099271*).

## Summarize the feature engineering

In [None]:
# Hand-recorded mean out-of-fold score of each experiment, lowest value first.
summary_scores = {
    "Utilize categorical columns for grouping": 0.099005,
    "Generate bins from the continuous variables": 0.099286,
    "Introduce polynomial features": 0.098392,
    "Empirical observations": 0.099122,
    "Original": 0.099271,
}
pd.DataFrame(
    list(summary_scores.items()), columns=["condition", "mean_OOFs"]
).sort_values(by="mean_OOFs")

As you can see, with the exception of `Generate bins from the continuous variables`, the generated features achieved better average validation RMSE values than the original setup, where no feature engineering was applied.

## Final feature selection 

In [None]:
class FeatureAdder(BaseEstimator, TransformerMixin):
    """Add the engineered features retained for the final model.

    Produces four columns:

    * ``energy_efficiency``: ``primary_energy_consumption / living_area``
      (infinite results of a zero denominator are replaced with NaN).
    * ``tr_prov_liv``: mean ``living_area`` of the row's ``province``.
    * ``tr_heat_liv``: mean ``living_area`` of the row's ``heating_type``.
    * ``tr_buil_energy``: mean ``primary_energy_consumption`` of the row's
      ``building_condition``.

    The group means are learned in ``fit`` and only looked up in
    ``transform``, so validation/test rows are encoded with training
    statistics instead of statistics computed on the data being transformed.

    Attributes:
        group_means_: Mapping from output column name to the Series of group
            means learned by ``fit``.
    """

    # (grouping column, averaged column, output column) of each group-mean feature.
    _GROUP_MEAN_FEATURES = (
        ("province", "living_area", "tr_prov_liv"),
        ("heating_type", "living_area", "tr_heat_liv"),
        ("building_condition", "primary_energy_consumption", "tr_buil_energy"),
    )

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Learn the per-group means from ``X``; ``y`` is ignored."""
        self.group_means_ = {
            output_column: X.groupby(group_column)[value_column].mean()
            for group_column, value_column, output_column in self._GROUP_MEAN_FEATURES
        }
        return self

    def transform(self, X, y=None):
        """Return the four engineered columns, indexed like ``X``.

        Groups that were not present during ``fit`` map to NaN.
        """
        energy_efficiency = (X["primary_energy_consumption"] / X["living_area"]).replace(
            [np.inf, -np.inf], np.nan
        )
        features = {"energy_efficiency": energy_efficiency}
        for group_column, _, output_column in self._GROUP_MEAN_FEATURES:
            features[output_column] = X[group_column].map(
                self.group_means_[output_column]
            )
        return X.assign(**features)[list(features)]

    def get_feature_names_out(self, input_features=None):
        """Return the names of the output columns.

        ``input_features`` is accepted for scikit-learn API compatibility and
        is ignored because the output names are fixed.
        """
        return np.asarray(
            ["energy_efficiency", "tr_prov_liv", "tr_heat_liv", "tr_buil_energy"],
            dtype=object,
        )

In [None]:
# Columns that receive the polynomial expansion, and the columns FeatureAdder
# reads from.
polynomial_columns = ["zip_code", "bedrooms", "primary_energy_consumption"]
feature_adder_columns = [
    "primary_energy_consumption",
    "living_area",
    "province",
    "heating_type",
    "building_condition",
]

# The remaining numerical columns go through the plain numeric branch.
new_numerical_features = [
    column
    for column in X.select_dtypes("number").columns
    if column not in polynomial_columns
]
new_categorical_features = list(X.select_dtypes("object").columns)

# Branch 1: impute -> polynomial expansion -> scale.
polyfeatures = pipeline.make_pipeline(
    impute.SimpleImputer(strategy="median"),
    preprocessing.PolynomialFeatures(interaction_only=False, include_bias=False),
    preprocessing.MinMaxScaler(),
)

# Branch 2: engineered features -> impute -> scale.
feature_adder = pipeline.make_pipeline(
    FeatureAdder(),
    impute.SimpleImputer(strategy="median"),
    preprocessing.MinMaxScaler(),
)

# Branch 3: impute -> scale for the untouched numerical columns.
numeric_transformer = pipeline.make_pipeline(
    impute.SimpleImputer(strategy="median"),
    preprocessing.MinMaxScaler(),
)

# Branch 4: one-hot encode -> impute for the object columns.
categorical_transformer = pipeline.make_pipeline(
    preprocessing.OneHotEncoder(handle_unknown="ignore", sparse_output=False),
    impute.SimpleImputer(strategy="median"),
)

# Assemble the four branches; the combined output is a pandas DataFrame.
preprocessor = compose.make_column_transformer(
    (polyfeatures, polynomial_columns),
    (feature_adder, feature_adder_columns),
    (numeric_transformer, new_numerical_features),
    (categorical_transformer, new_categorical_features),
).set_output(transform="pandas")

# model_pipeline = pipeline.make_pipeline(
#     preprocessor, catboost.CatBoostRegressor(iterations=100, silent=True)
# )

# scores = model_selection.cross_val_score(
#     estimator=model_pipeline,
#     X=X,
#     y=y,
#     scoring="neg_root_mean_squared_error",
#     cv=10,
# )

In [None]:
# Fit the combined preprocessor on the full feature matrix and show the
# transformed frame.
preprocessor.fit_transform(X)

In [None]:
# Search over the number of features kept by SelectFromModel.
# The original upper bound (54) is kept as a ceiling, but the search never
# asks for more features than the preprocessor produces, because
# SelectFromModel rejects a max_features above its input width.
n_available_features = preprocessor.fit_transform(X).shape[1]

results = []
for n_features in tqdm(range(2, min(55, n_available_features + 1))):
    model_pipeline = pipeline.make_pipeline(
        preprocessor,
        feature_selection.SelectFromModel(
            catboost.CatBoostRegressor(iterations=100, silent=True),
            # Without an explicit threshold, SelectFromModel first applies its
            # default importance cut-off and max_features is only an upper
            # cap. -inf disables the cut-off so that exactly the n_features
            # most important features are kept.
            threshold=-np.inf,
            max_features=n_features,
        ),
        catboost.CatBoostRegressor(iterations=100, silent=True),
    )

    # Fit once on all data only to record which features are selected.
    model_pipeline.fit(X, y)
    selected_names = model_pipeline.named_steps[
        "selectfrommodel"
    ].get_feature_names_out()

    scores = model_selection.cross_val_score(
        estimator=model_pipeline,
        X=X,
        y=y,
        scoring="neg_root_mean_squared_error",
        cv=10,
    )
    results.append((n_features, selected_names, np.mean(scores), np.std(scores)))

In [None]:
# Tabulate the feature-selection search: one row per tested feature count.
pd.DataFrame(results, columns=["n_features", "selected_names", "mean_OOF", "STD_OOF"])

In [None]:
# Candidate building blocks of the preprocessing grid.
encoders = [
    preprocessing.OneHotEncoder(handle_unknown="ignore", sparse_output=False),
    preprocessing.OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=999),
    preprocessing.TargetEncoder(target_type="continuous"),
]
imputers = [impute.SimpleImputer(strategy="median"), impute.KNNImputer()]
scalers = [
    preprocessing.StandardScaler(),
    preprocessing.MinMaxScaler(),
    preprocessing.RobustScaler(),
]

# Column groups, this time taken from the full feature matrix.
NUMERICAL_FEATURES = list(X.select_dtypes("number").columns)
CATEGORICAL_FEATURES = list(X.select_dtypes("object").columns)

results = []

# Evaluate every imputer x encoder x scaler combination with 10-fold
# cross-validation; one progress bar is drawn per (imputer, encoder) pair.
for imputer, encoder in itertools.product(imputers, encoders):
    for scaler in tqdm(scalers):
        numeric_transformer = pipeline.make_pipeline(imputer, scaler)
        categorical_transformer = pipeline.make_pipeline(encoder, imputer)

        preprocessor = compose.make_column_transformer(
            (numeric_transformer, NUMERICAL_FEATURES),
            (categorical_transformer, CATEGORICAL_FEATURES),
        )
        model_pipeline = pipeline.make_pipeline(
            preprocessor, catboost.CatBoostRegressor(iterations=10, silent=True)
        )
        scores = model_selection.cross_val_score(
            estimator=model_pipeline,
            X=X,
            y=y,
            scoring="neg_root_mean_squared_error",
            cv=10,
        )

        results.append(
            {
                "imputer": str(imputer),
                "encoder": str(encoder),
                "scaler": str(scaler),
                "scores": scores.tolist(),
                "mean_score": np.mean(scores),
                "std_score": np.std(scores),
            }
        )

pd.DataFrame(results).sort_values(by="mean_score", ascending=False)