In [None]:
import itertools

import catboost
import numpy as np
import pandas as pd
from scipy import stats
from sklearn import (compose, feature_selection, impute, metrics,
                     model_selection, pipeline, preprocessing)
from sklearn.base import BaseEstimator, TransformerMixin
from tqdm import tqdm

import creds
import data

# Retrieve and prepare data

In [None]:
# Fetch the raw documents that match the retrieval-day filter.
# NOTE(review): the positional arguments look like (database, collection, query)
# — confirm against `data.retrieve_data_from_MongoDB`.
retrieval_filter = {"day_of_retrieval": "2024-02-09"}
df = data.retrieve_data_from_MongoDB("development", "BE_houses", retrieval_filter)

In [None]:
# Derive the feature matrix and the target from the raw frame, then report
# both shapes as a quick sanity check.
X, y = data.preprocess_and_split_data(df)

print(f"{X.shape} {y.shape}")

In [None]:
# Hold out 20% of the rows as a final test set. The split is seeded so that
# "Restart & Run All" reproduces the same partition, which keeps the
# cross-validation scores reported below comparable between runs.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Shape of X_train : {X_train.shape}, X_test : {X_test.shape}")

In [None]:
# Number of missing values per training column, fewest first.
missing_counts = X_train.isna().sum()
missing_counts.sort_values()

In [None]:
# Partition the training columns by dtype; these two lists drive every
# preprocessing pipeline built later in the notebook.
NUMERICAL_FEATURES = list(X_train.select_dtypes(include="number").columns)
CATEGORICAL_FEATURES = list(X_train.select_dtypes(include="object").columns)

for feature_group in (NUMERICAL_FEATURES, CATEGORICAL_FEATURES):
    print(feature_group)

In [None]:
# Cardinality of each categorical column (pandas does not count missing
# values as a distinct level here).
print("Unique values in categorical columns:")
for column, n_unique in X_train[CATEGORICAL_FEATURES].nunique().items():
    print(f"{column} : {n_unique}")

# Define a basic pipeline to use for feature engineering

In [None]:
def create_pipeline(
    numerical_features, categorical_features, additional_transformers=None
):
    """Build the baseline preprocessing + CatBoost pipeline.

    Parameters
    ----------
    numerical_features : list of str
        Columns that are median-imputed and then standardized.
    categorical_features : list of str
        Columns that are ordinal-encoded (unknown categories become 999) and
        then median-imputed.
    additional_transformers : iterable of (transformer, columns), optional
        Extra ``(transformer, columns)`` pairs appended after the two default
        branches of the column transformer.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline: column transformer (pandas output) followed by a
        ``CatBoostRegressor`` with early stopping on an internal 20% split.
    """
    numeric_branch = pipeline.make_pipeline(
        impute.SimpleImputer(strategy="median"),
        preprocessing.StandardScaler(),
    )
    categorical_branch = pipeline.make_pipeline(
        preprocessing.OrdinalEncoder(
            handle_unknown="use_encoded_value", unknown_value=999
        ),
        impute.SimpleImputer(strategy="median"),
    )

    # Default branches first, then whatever the caller wants to experiment with.
    extra_branches = () if additional_transformers is None else additional_transformers
    column_specs = [
        (numeric_branch, numerical_features),
        (categorical_branch, categorical_features),
        *extra_branches,
    ]

    column_transformer = compose.make_column_transformer(*column_specs)
    column_transformer.set_output(transform="pandas")

    regressor = catboost.CatBoostRegressor(
        iterations=100,
        eval_fraction=0.2,
        early_stopping_rounds=20,
        silent=True,
        use_best_model=True,
    )
    return pipeline.make_pipeline(column_transformer, regressor)


create_pipeline(NUMERICAL_FEATURES, CATEGORICAL_FEATURES)

# Feature engineering
## Utilize categorical columns for grouping and transform each numerical variable based on the mean

In [None]:
class CategoricalColumnTransformer(BaseEstimator, TransformerMixin):
    """Encode a categorical column by a per-category aggregate of a numerical column.

    ``fit`` groups the training frame by ``categorical_feature`` and aggregates
    ``numerical_feature`` with ``transform_type`` (anything accepted by pandas
    ``GroupBy.agg``, e.g. ``"mean"``). ``transform`` maps each row's category to
    the learned aggregate and returns it as a one-column DataFrame named
    ``CategoricalColumnTransformer``.

    Categories without a learned aggregate (unseen at fit time, or missing)
    come out as NaN.

    Parameters
    ----------
    categorical_feature : str
        Column used as the grouping key.
    numerical_feature : str
        Column that is aggregated within each group.
    transform_type : str or callable
        Aggregation passed to ``GroupBy.agg``.
    """

    def __init__(self, categorical_feature, numerical_feature, transform_type):
        self.categorical_feature = categorical_feature
        self.numerical_feature = numerical_feature
        self.transform_type = transform_type

    def fit(self, X, y=None):
        """Learn the per-category aggregate from the training frame ``X``."""
        grouped = X.groupby(self.categorical_feature)[self.numerical_feature]
        self.transform_values_ = grouped.agg(self.transform_type)
        return self

    def transform(self, X, y=None):
        """Return a one-column DataFrame with the learned aggregate per row."""
        return (
            X[self.categorical_feature]
            .map(self.transform_values_)
            .to_frame(name="CategoricalColumnTransformer")
        )

    def get_feature_names_out(self, input_features=None):
        """Return the name of the single generated column.

        ``input_features`` is accepted (and ignored) because scikit-learn
        passes it positionally when this transformer is a step of a
        ``Pipeline``; the previous zero-argument stub returned ``None``.
        """
        return np.array(["CategoricalColumnTransformer"], dtype=object)

In [None]:
%%script echo skipping

results = []
for categorical_feature in tqdm(CATEGORICAL_FEATURES, desc="Progress"):
    for numerical_feature in NUMERICAL_FEATURES:
        feature_adder = CategoricalColumnTransformer(
            categorical_feature=categorical_feature,
            numerical_feature=numerical_feature,
            transform_type="mean",
        )
        additional_transformers = [
            (feature_adder, [categorical_feature, numerical_feature])
        ]
        model_pipeline = create_pipeline(
            numerical_features=NUMERICAL_FEATURES,
            categorical_features=CATEGORICAL_FEATURES,
            additional_transformers=additional_transformers,
        )

        scores = model_selection.cross_validate(
            estimator=model_pipeline,
            X=X_train,
            y=y_train,
            scoring=("r2", "neg_root_mean_squared_error"),
            cv=10,
        )

        result = (
            categorical_feature,
            numerical_feature,
            np.mean(scores["test_neg_root_mean_squared_error"]),
            np.mean(scores["test_r2"]),
        )
        results.append(result)

**Best performances :**

| categorical_feature  | numerical_feature | mean_OOFs |
| :---------------- | :------: | :----: |
| energy_class | zip_code | 0.092231		  | 
| building_condition | construction_year | 0.092441  | 
| building_condition | number_of_frontages | 0.092443	|


The best result was obtained by taking the `energy_class` feature as categorical variable and calculating the mean of `zip_code`. The resulting OOF RMSE is *0.092231* which is slightly better than our base model (*0.09326*).

## Generate bins from the continuous variables


In [None]:
class ContinuousColumnTransformer(BaseEstimator, TransformerMixin):
    """Encode a binned continuous feature by a per-bin aggregate of another feature.

    ``fit`` learns quantile bin edges for ``continuous_feature_to_bin``
    (``pd.qcut`` with ``n_bins`` quantiles, duplicate edges dropped) and
    aggregates ``continuous_feature_to_transfer`` within each bin using
    ``transform_type``. ``transform`` assigns every row to a learned bin and
    returns the bin's aggregate as a one-column DataFrame named
    ``ContinuousColumnTransformer``.

    The two outermost edges are widened to -inf / +inf. ``pd.cut`` uses
    right-closed intervals, so with the raw quantile edges the training
    minimum (which sits exactly on the first edge) and any value outside the
    training range would receive no bin and turn into NaN. Missing inputs
    still yield NaN.

    Parameters
    ----------
    continuous_feature_to_bin : str
        Column that is discretized into quantile bins.
    continuous_feature_to_transfer : str
        Column that is aggregated within each bin.
    transform_type : str or callable
        Aggregation passed to ``GroupBy.agg``.
    n_bins : int
        Requested number of quantile bins (fewer if edges are duplicated).
    """

    def __init__(
        self,
        continuous_feature_to_bin,
        continuous_feature_to_transfer,
        transform_type,
        n_bins,
    ):
        self.continuous_feature_to_bin = continuous_feature_to_bin
        self.continuous_feature_to_transfer = continuous_feature_to_transfer
        self.transform_type = transform_type
        self.n_bins = n_bins

    def _bin_codes(self, X):
        """Return the bin index of every row of ``X`` (NaN for missing values)."""
        return pd.cut(
            X[self.continuous_feature_to_bin], bins=self.bin_edges_, labels=False
        )

    def fit(self, X, y=None):
        """Learn the bin edges and the per-bin aggregate from the training frame."""
        quantile_edges = pd.qcut(
            x=X[self.continuous_feature_to_bin],
            q=self.n_bins,
            retbins=True,
            duplicates="drop",
        )[1]

        # Keep the interior quantile edges but open both ends, so the minimum
        # and out-of-range values fall into the first / last bin.
        self.bin_edges_ = np.concatenate(
            ([-np.inf], quantile_edges[1:-1], [np.inf])
        )

        self.transform_values_ = (
            X[[self.continuous_feature_to_transfer]]
            .assign(binned_continuous_feature=self._bin_codes(X))
            .groupby("binned_continuous_feature")[self.continuous_feature_to_transfer]
            .agg(self.transform_type)
        )
        return self

    def transform(self, X, y=None):
        """Return a one-column DataFrame with the learned per-bin aggregate."""
        return (
            self._bin_codes(X)
            .map(self.transform_values_)
            .to_frame(name="ContinuousColumnTransformer")
        )

    def get_feature_names_out(self, input_features=None):
        """Return the name of the single generated column.

        ``input_features`` is accepted (and ignored) because scikit-learn
        passes it positionally when this transformer is a step of a
        ``Pipeline``; the previous zero-argument stub returned ``None``.
        """
        return np.array(["ContinuousColumnTransformer"], dtype=object)

In [None]:
%%script echo skipping

optimal_bins = int(np.floor(np.log2(X_train.shape[0])) + 1)
results = []
# Combine the loops to have a single progress bar
for discretized_continuous in tqdm(NUMERICAL_FEATURES, desc="Progress"):
    for transformed_continuous in NUMERICAL_FEATURES:
        if discretized_continuous != transformed_continuous:
            continuous_discretizer = ContinuousColumnTransformer(
                continuous_feature_to_bin=discretized_continuous,
                continuous_feature_to_transfer=transformed_continuous,
                transform_type="mean",
                n_bins=optimal_bins,
            )

            additional_transformers = [
                (
                    continuous_discretizer,
                    [discretized_continuous, transformed_continuous],
                )
            ]

            model_pipeline = create_pipeline(
                numerical_features=NUMERICAL_FEATURES,
                categorical_features=CATEGORICAL_FEATURES,
                additional_transformers=additional_transformers,
            )
            scores = model_selection.cross_validate(
                estimator=model_pipeline,
                X=X_train,
                y=y_train,
                scoring=("r2", "neg_root_mean_squared_error"),
                cv=10,
            )
            result = (
                discretized_continuous,
                transformed_continuous,
                np.mean(scores["test_neg_root_mean_squared_error"]),
                np.mean(scores["test_r2"]),
            )
            results.append(result)

**Best performances :**

| discretized_continuous  | transformed_continuous | mean_OOFs |
| :---------------- | :------: | :----: |
| living_area | bathrooms | 0.092563	  |
| primary_energy_consumption | toilets | 0.092683  | 
| living_area | bedrooms | 0.092728  |


The best result was obtained by taking the `living_area` feature as discretized continuous variable and calculating the mean of `bathrooms`. The resulting OOF RMSE is *0.092563* which is slightly better than our base model (*0.09326*).


## Introduce polynomial features


In [None]:
%%script echo skipping
results = []
for numerical_feature in tqdm(NUMERICAL_FEATURES, desc="Progress"):
    
    NEW_NUMERICAL_FEATURES = NUMERICAL_FEATURES.copy() 
    NEW_NUMERICAL_FEATURES.remove(numerical_feature)
    
    polyfeatures = pipeline.make_pipeline(
                    impute.SimpleImputer(strategy="median"),
                    preprocessing.PolynomialFeatures(
                        interaction_only=False, include_bias=False
                    )
                )
    additional_transformers = [
            (polyfeatures, [numerical_feature])
        ]
    
    model_pipeline = create_pipeline(
            numerical_features=NEW_NUMERICAL_FEATURES,
            categorical_features=CATEGORICAL_FEATURES,
            additional_transformers=additional_transformers,
        )
    
    scores = model_selection.cross_validate(
            estimator=model_pipeline,
            X=X_train,
            y=y_train,
            scoring=("r2", "neg_root_mean_squared_error"),
            cv=10,
        )
    
    result = (
            numerical_feature,
            np.mean(scores["test_neg_root_mean_squared_error"]),
            np.mean(scores["test_r2"]),
        )
    results.append(result)

In [None]:
# `results` only exists when the polynomial-features search cell above has
# actually been executed (it is skipped by default). Fall back to an empty
# frame so a fresh "Restart & Run All" does not stop here with a NameError.
# Columns are named so the ranking by mean R2 (best first) is explicit.
POLYNOMIAL_RESULT_COLUMNS = ["numerical_feature", "mean_neg_rmse", "mean_r2"]

if "results" in globals():
    polynomial_scores = pd.DataFrame(
        results, columns=POLYNOMIAL_RESULT_COLUMNS
    ).sort_values(by="mean_r2", ascending=False)
else:
    polynomial_scores = pd.DataFrame(columns=POLYNOMIAL_RESULT_COLUMNS)

polynomial_scores.head()

**Best performances :**

| numerical_col  | mean_OOFs |
| :---------------- | :------: |
| surface_of_the_plot | 0.093436	 | 
| zip_code | 0.093765	 | 
| construction_year | 0.093788| 


The best result was obtained by taking the polynomial feature of the `surface_of_the_plot`. The resulting OOF RMSE is *0.093436* which is slightly worse than our base model (*0.09326*).

## Implement other ideas derived from empirical observations or assumptions

In [None]:
class EmpiricalTransformer(BaseEstimator, TransformerMixin):
    """Derive hand-crafted ratio / sum features from the raw house attributes.

    Stateless transformer: ``fit`` learns nothing. ``transform`` expects the
    columns ``primary_energy_consumption``, ``living_area``, ``bathrooms``,
    ``bedrooms`` and ``surface_of_the_plot`` and returns only the five derived
    columns listed in ``_FEATURE_NAMES``, in that order.

    Ratios whose denominator is 0 evaluate to +/-inf in pandas; they are
    converted to NaN so that they are treated as missing values (and can be
    imputed downstream) instead of propagating infinities.
    """

    # Names and order of the generated columns.
    _FEATURE_NAMES = (
        "energy_efficiency",
        "total_rooms",
        "bedroom_to_bathroom",
        "area_per_room",
        "plot_to_livings_area",
    )

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing to learn; present for scikit-learn API compatibility."""
        return self

    def transform(self, X, y=None):
        """Return a DataFrame holding only the derived features, indexed like ``X``."""
        engineered = pd.DataFrame(
            {
                "energy_efficiency": X["primary_energy_consumption"] / X["living_area"],
                "total_rooms": X["bathrooms"] + X["bedrooms"],
                "bedroom_to_bathroom": X["bedrooms"] / X["bathrooms"],
                "area_per_room": X["living_area"] / X["bedrooms"],
                "plot_to_livings_area": X["surface_of_the_plot"] / X["living_area"],
            },
            index=X.index,
        )
        # x / 0 -> +/-inf; mark those as missing rather than infinite.
        return engineered.replace([np.inf, -np.inf], np.nan)

    def get_feature_names_out(self, input_features=None):
        """Return the names of the generated columns.

        ``input_features`` is accepted (and ignored) because scikit-learn
        passes it positionally when this transformer is a step of a
        ``Pipeline``; the previous zero-argument stub returned ``None``.
        """
        return np.array(self._FEATURE_NAMES, dtype=object)

In [None]:
%%script echo skipping
numeric_transformer = pipeline.make_pipeline(
        impute.SimpleImputer(strategy="median"), preprocessing.StandardScaler()
    )

categorical_transformer = pipeline.make_pipeline(
    preprocessing.OrdinalEncoder(
        handle_unknown="use_encoded_value", unknown_value=999
    ),
    impute.SimpleImputer(strategy="median"),
)

empirical_transformer = pipeline.make_pipeline(
    EmpiricalTransformer(),
    impute.SimpleImputer(strategy="median"),
)

# Create a ColumnTransformer to handle both numerical and categorical features
transformers = [
    (numeric_transformer, NUMERICAL_FEATURES),
    (categorical_transformer, CATEGORICAL_FEATURES),
    (empirical_transformer, ["primary_energy_consumption", 
                             "living_area", "bathrooms", 
                             "bedrooms", "surface_of_the_plot",
                            ])
    
]

preprocessor = compose.make_column_transformer(*transformers).set_output(
    transform="pandas"
)

results = []
for column in temp_dataframe.columns[-5:]:

    temp_dataframe = preprocessor.fit_transform(X_train)
    
    added_features = temp_dataframe.columns[-5:].tolist()
    features_to_remove = added_features.copy()  
    features_to_remove.remove(column)

    new_X_train = temp_dataframe.drop(columns = features_to_remove)

    regressor = catboost.CatBoostRegressor(
                iterations=100,
                eval_fraction=0.2,
                early_stopping_rounds=20,
                silent=True,
                use_best_model=True)
    
    scores = model_selection.cross_validate(
            estimator=regressor,
            X=new_X_train,
            y=y_train,
            scoring=("r2", "neg_root_mean_squared_error"),
            cv=10,
        )
    
    result = (
            column,
            np.mean(scores["test_neg_root_mean_squared_error"]),
            np.mean(scores["test_r2"]),
        )
    results.append(result)

**Best performances :**

| feature  | mean_OOFs | 
| :---------------- | :------: | 
| 3__plot_to_livings_area | 0.093675	 | 
| 3__bedroom_to_bathroom | 0.093770	 | 
| 3__area_per_room | 0.094004|


The best result was obtained by incorporating the `3__plot_to_livings_area` feature. The resulting OOF RMSE is *0.093675*, which is slightly worse than our base model (*0.09326*).

## Summarize the feature engineering

In [None]:
# Best mean out-of-fold RMSE reached by each feature-engineering approach,
# next to the untouched baseline ("Original"); lowest (best) first.
summary_rows = [
    ("Utilize categorical columns for grouping", 0.092231),
    ("Generate bins from the continuous variables", 0.092563),
    ("Introduce polynomial features", 0.093436),
    ("Empirical observations", 0.093675),
    ("Original", 0.09326),
]
pd.DataFrame(summary_rows, columns=["condition", "mean_OOFs"]).sort_values(
    by="mean_OOFs"
)

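As you can see, only the first two approaches — `Utilize categorical columns for grouping` (*0.092231*) and `Generate bins from the continuous variables` (*0.092563*) — achieved a lower average validation RMSE than the original setup (*0.09326*), where no feature engineering was applied. The polynomial features (*0.093436*) and the empirical features (*0.093675*) scored slightly worse than the original setup.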

## Final feature selection 

In [None]:
# Number of quantile bins for the discretizer: floor(log2(n_rows)) + 1.
# It is computed here because the only other definition of `optimal_bins`
# lives in a search cell that is skipped by default, which left the name
# undefined on a fresh kernel ("Restart & Run All").
optimal_bins = int(np.floor(np.log2(X_train.shape[0])) + 1)

# Best candidate of the "group by categorical column" experiment.
categorical_column_transformer = CategoricalColumnTransformer(
    categorical_feature="energy_class",
    numerical_feature="zip_code",
    transform_type="mean",
)

# Best candidate of the "bin a continuous variable" experiment.
continuous_discretizer = ContinuousColumnTransformer(
    continuous_feature_to_bin="living_area",
    continuous_feature_to_transfer="bathrooms",
    transform_type="mean",
    n_bins=optimal_bins,
)

In [None]:
%%script echo skipping
results = []
for n_features in tqdm(range(2, 17)):
    numeric_transformer = pipeline.make_pipeline(
        impute.SimpleImputer(strategy="median"), preprocessing.StandardScaler()
    )
    
    categorical_transformer = pipeline.make_pipeline(
        preprocessing.OrdinalEncoder(
            handle_unknown="use_encoded_value", unknown_value=999
        ),
        impute.SimpleImputer(strategy="median"),
    )
    
    # Create a ColumnTransformer to handle both numerical and categorical features
    transformers = [
        (numeric_transformer, NUMERICAL_FEATURES),
        (categorical_transformer, CATEGORICAL_FEATURES),
        # (categorical_column_transformer, ["energy_class", "zip_code"]),
        (continuous_discretizer, ["living_area", "bathrooms"])
    ]
    
    preprocessor = compose.make_column_transformer(*transformers).set_output(
        transform="pandas"
    )
    
    model_pipeline = pipeline.make_pipeline(
        preprocessor,
        feature_selection.RFE(
            catboost.CatBoostRegressor(
        iterations=100,
        eval_fraction=0.2,
        early_stopping_rounds=20,
        silent=True,
        use_best_model=True,
    ),
            n_features_to_select=n_features,
        ),
        catboost.CatBoostRegressor(
        iterations=100,
        eval_fraction=0.2,
        early_stopping_rounds=20,
        silent=True,
        use_best_model=True,
    )
    )

    # Get the names of the features selected
    model_pipeline.fit(X_train, y_train)
    selected_names = model_pipeline.named_steps[
        "rfe"
    ].get_feature_names_out()

    scores = model_selection.cross_val_score(
        estimator=model_pipeline,
        X=X_train,
        y=y_train,
        scoring="neg_root_mean_squared_error",
        cv=10,
    )
    result = (n_features, selected_names, np.mean(scores), np.std(scores))
    results.append(result)

**Best performances :**

| Condition  | n features to keep | n features to remove | AVG_test_RMSE |
| :---------------- | :------: |  :------: |  :------: |
| categorical_column_transformer AND continuous_discretizer|	14 | 3 | 0.09281 | 
| categorical_column_transformer | 16	 | 0	 | 0.092231	 | 
| continuous_discretizer | 16	 | 0	 | 0.092563	 | 

Based on this experiment, we will only keep the categorical_column_transformer step with all other available features.

In [None]:
# Final preprocessing: median-impute the numerical columns, mode-impute and
# ordinal-encode the categorical ones, and add the per-`energy_class` mean of
# `zip_code` (computed after mode imputation of both input columns).
numeric_transformer = pipeline.make_pipeline(impute.SimpleImputer(strategy="median"))

categorical_transformer = pipeline.make_pipeline(
    impute.SimpleImputer(strategy="most_frequent"),
    preprocessing.OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=999),
)

energy_class_zip_mean = CategoricalColumnTransformer(
    categorical_feature="energy_class",
    numerical_feature="zip_code",
    transform_type="mean",
)
categorical_column_transformer = pipeline.make_pipeline(
    impute.SimpleImputer(strategy="most_frequent"),
    energy_class_zip_mean,
)

# Branch order determines the column order of the transformed frame.
transformers = [
    (numeric_transformer, NUMERICAL_FEATURES),
    (categorical_transformer, CATEGORICAL_FEATURES),
    (categorical_column_transformer, ["energy_class", "zip_code"]),
]
preprocessor = compose.make_column_transformer(*transformers)
preprocessor.set_output(transform="pandas")

final_regressor = catboost.CatBoostRegressor(
    iterations=100,
    eval_fraction=0.2,
    early_stopping_rounds=20,
    silent=True,
    use_best_model=True,
)
model_pipeline = pipeline.make_pipeline(preprocessor, final_regressor)

# 10-fold CV; scores are negative RMSE, so values closer to 0 are better.
scores = model_selection.cross_val_score(
    estimator=model_pipeline,
    X=X_train,
    y=y_train,
    scoring="neg_root_mean_squared_error",
    cv=10,
)

scores.mean()

In [None]:
# Search space for the CatBoost step of `model_pipeline`; the
# "catboostregressor__" prefix routes each parameter to that pipeline step.
# NOTE(review): loguniform(0.005, 0.01) spans only a factor of two for the
# learning rate — confirm this narrow range is intended.
param_distributions = {
    "catboostregressor__iterations": stats.randint(100, 1000),
    "catboostregressor__learning_rate": stats.loguniform(0.005, 0.01),
    "catboostregressor__depth": stats.randint(2, 12),
    "catboostregressor__l2_leaf_reg": stats.loguniform(1e-3, 1e3),
    "catboostregressor__border_count": stats.randint(1, 255),
    "catboostregressor__bagging_temperature": stats.uniform(0, 1),
    "catboostregressor__random_strength": stats.uniform(0, 1),
}

# Randomized search over 50 sampled candidates. `random_state` fixes which
# candidates are drawn from the distributions above, so the reported best
# score and parameters are reproducible across kernel restarts.
grid = model_selection.RandomizedSearchCV(
    estimator=model_pipeline,
    param_distributions=param_distributions,
    scoring="neg_root_mean_squared_error",
    n_iter=50,
    n_jobs=-1,
    random_state=42,
)
grid.fit(X_train, y_train)

In [None]:
# Best mean cross-validated score found by the search. The scorer is
# "neg_root_mean_squared_error", so this is a negative RMSE (closer to 0 is better).
grid.best_score_

In [None]:
# Sampled hyperparameter combination that achieved `grid.best_score_`.
grid.best_params_