In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from boruta import BorutaPy

In [3]:
df = pd.read_csv('data/houses_Madrid_cleaned.csv')

## Traitement

In [4]:
# Séparation des caractéristiques et de la cible
caracteristiques = [
    'sq_mt_built',
    'n_rooms',
    'n_bathrooms',
    'floor',
    'has_lift',
    'is_exterior',
    'has_parking'
]
X = df[caracteristiques]
y = df['buy_price']

# Remplir les valeurs manquantes
X.fillna(0, inplace=True)
y.fillna(y.mean(), inplace=True)

# Encodage des variables catégorielles
X_encode = pd.get_dummies(X, drop_first=True)

# Division en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X_encode, y, test_size=0.3)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(0, inplace=True)


## Regression Multiple

In [5]:
def ridge_regression():
    print("Entraînement du modèle Ridge...")
    model = Ridge()
    param_grid = {'alpha': [0.1, 1.0, 10.0, 100.0]}
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    
    # Prédiction et évaluation
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f"Meilleurs hyperparamètres : {grid_search.best_params_}")
    print(f'MSE : {mse}')
    print(f'R² : {r2}\n')

ridge_regression()

Entraînement du modèle Ridge...
Meilleurs hyperparamètres : {'alpha': 1.0}
MSE : 153471045033.51425
R² : 0.7054048446048417



## Boruta Forest

In [7]:
def random_forest_boruta():
    print("Entraînement du modèle RandomForest avec Boruta...")
    rf = RandomForestRegressor(n_estimators=200, n_jobs=-1)
    boruta_selector = BorutaPy(estimator=rf, n_estimators='auto', max_iter=30)
    
    # Appliquer Boruta
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    boruta_selector.fit(X_train_scaled, y_train)
    selected_features = X.columns[boruta_selector.support_]
    
    # Utiliser les caractéristiques sélectionnées
    X_train_selected = X_train[selected_features]
    X_test_selected = X_test[selected_features]
    
    # GridSearchCV pour RandomForest
    rf_model = RandomForestRegressor(random_state=42)
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
        'bootstrap': [True, False]
    }
    
    grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train_selected, y_train)
    
    # Prédiction et évaluation
    y_pred = grid_search.best_estimator_.predict(X_test_selected)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f"Meilleurs hyperparamètres : {grid_search.best_params_}")
    print(f'MSE : {mse}')
    print(f'R² : {r2}\n')

random_forest_boruta()

Entraînement du modèle RandomForest avec Boruta...


IndexError: boolean index did not match indexed array along axis 0; size of axis is 7 but size of corresponding boolean axis is 25

## XGBoost

In [8]:
def xgboost_model():
    print("Entraînement du modèle XGBoost...")
    
    # Création du pipeline
    preprocessor = ColumnTransformer([
        ('num', StandardScaler(), X.select_dtypes(include=['float64', 'int64']).columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), X.select_dtypes(include=['object']).columns)
    ])
    
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', XGBRegressor(objective='reg:squarederror'))
    ])
    
    # Grille d'hyperparamètres
    param_grid = {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__learning_rate': [0.01, 0.1, 0.3],
        'classifier__max_depth': [3, 5, 7]
    }
    
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    # Prédiction et évaluation
    y_pred = grid_search.best_estimator_.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f"Meilleurs hyperparamètres : {grid_search.best_params_}")
    print(f'MSE : {mse}')
    print(f'R² : {r2}\n')

xgboost_model()

Entraînement du modèle XGBoost...


ValueError: 
All the 135 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
135 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\QWERTY\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\indexes\base.py", line 3805, in get_loc
    return self._engine.get_loc(casted_key)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "index.pyx", line 167, in pandas._libs.index.IndexEngine.get_loc
  File "index.pyx", line 196, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\\_libs\\hashtable_class_helper.pxi", line 7081, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas\\_libs\\hashtable_class_helper.pxi", line 7089, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'floor'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "c:\Users\QWERTY\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\_indexing.py", line 361, in _get_column_indices
    col_idx = all_columns.get_loc(col)
              ^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\QWERTY\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\indexes\base.py", line 3812, in get_loc
    raise KeyError(key) from err
KeyError: 'floor'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "c:\Users\QWERTY\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\QWERTY\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\QWERTY\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\pipeline.py", line 469, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\QWERTY\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\pipeline.py", line 406, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\QWERTY\AppData\Local\Programs\Python\Python312\Lib\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\QWERTY\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\pipeline.py", line 1310, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\QWERTY\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\_set_output.py", line 316, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\QWERTY\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\QWERTY\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\compose\_column_transformer.py", line 968, in fit_transform
    self._validate_column_callables(X)
  File "c:\Users\QWERTY\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\compose\_column_transformer.py", line 536, in _validate_column_callables
    transformer_to_input_indices[name] = _get_column_indices(X, columns)
                                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\QWERTY\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\_indexing.py", line 369, in _get_column_indices
    raise ValueError("A given column is not a column of the dataframe") from e
ValueError: A given column is not a column of the dataframe
