# Modèle Ridge Regression - Appartements et Maisons

Entraînement simplifié avec Ridge Regression pour les deux types de biens immobiliers.
Utilise les DataFrames existants `df_appartements_model` et `df_maisons_model`.

In [None]:
# Imports des librairies n√©cessaires
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, LinearRegression, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import pickle
import os

# Confirm that the import cell ran. Only one message is printed: the earlier
# variant also advertised an optional Boruta import, which the imports above
# never perform, and duplicated the confirmation.
print("Imports termin√©s (avec GradientBoosting, GridSearch et Feature Selection)")

 Imports termin√©s (avec GradientBoosting, GridSearch, Feature Selection et Boruta optionnel)
 Imports termin√©s (avec GradientBoosting, GridSearch et Feature Selection)


## 1. Vérification des données existantes

In [26]:
# Load both modelling datasets from their CSV exports
df_appartements_model = pd.read_csv('df_appartements_model.csv')
df_maisons_model = pd.read_csv('df_maisons_model.csv')

# Report the (rows, columns) shape of each dataset
print(f"Appartements: {df_appartements_model.shape}")
print(f"Maisons: {df_maisons_model.shape}")

# Then list the column names of each dataset under its own header
for _header, _frame in (
    ("\nColonnes appartements:", df_appartements_model),
    ("\nColonnes maisons:", df_maisons_model),
):
    print(_header)
    print(list(_frame.columns))

Appartements: (19125, 7)
Maisons: (2617, 9)

Colonnes appartements:
['sq_mt_built', 'n_bathrooms', 'n_rooms', 'has_lift', 'has_parking', 'has_central_heating', 'buy_price']

Colonnes maisons:
['sq_mt_built', 'n_bathrooms', 'n_rooms', 'n_bathrooms.1', 'has_garden', 'has_pool', 'neighborhood', 'has_pool.1', 'buy_price']


## 2. Préparation des données

In [27]:
# Work on copies so the loaded DataFrames stay untouched, and replace every
# missing value with 0.
# NOTE(review): fillna(0) is applied to all columns, buy_price included —
# confirm that zero-filling the target is intended.
df_apt = df_appartements_model.copy().fillna(0)
df_mai = df_maisons_model.copy().fillna(0)

# Column inventory of each dataset
apt_columns = list(df_apt.columns)
mai_columns = list(df_mai.columns)

print(f"Colonnes appartements: {apt_columns}")
print(f"Colonnes maisons: {mai_columns}")

# Candidate predictors for each property type. Only candidates actually present
# in the data are kept, in the data's own column order. The target columns
# (buy_price / log_buy_price) are not candidates, so they are never selected.
_apt_candidates = ['sq_mt_built', 'n_bathrooms', 'n_rooms', 'has_lift', 'has_parking', 'has_central_heating']
_mai_candidates = ['sq_mt_built', 'n_bathrooms', 'n_rooms', 'has_garden', 'has_pool', 'neighborhood']

apt_features = [name for name in apt_columns if name in _apt_candidates]
mai_features = [name for name in mai_columns if name in _mai_candidates]

print(f"\nFeatures appartements disponibles: {apt_features}")
print(f"Features maisons disponibles: {mai_features}")

Colonnes appartements: ['sq_mt_built', 'n_bathrooms', 'n_rooms', 'has_lift', 'has_parking', 'has_central_heating', 'buy_price']
Colonnes maisons: ['sq_mt_built', 'n_bathrooms', 'n_rooms', 'n_bathrooms.1', 'has_garden', 'has_pool', 'neighborhood', 'has_pool.1', 'buy_price']

Features appartements disponibles: ['sq_mt_built', 'n_bathrooms', 'n_rooms', 'has_lift', 'has_parking', 'has_central_heating']
Features maisons disponibles: ['sq_mt_built', 'n_bathrooms', 'n_rooms', 'has_garden', 'has_pool', 'neighborhood']


## 3. Train/Test Split

## 4. Feature Selection (Appartements uniquement)

In [45]:
# Feature selection is applied to apartments only; houses keep every feature.
print(" FEATURE SELECTION POUR APPARTEMENTS")
print("=" * 50)

# Apartments: keep the 5 features with the highest univariate F-score
if apt_features:
    selector_apt = SelectKBest(score_func=f_regression, k=5)
    X_apt_selected = selector_apt.fit_transform(X_apt_train, y_apt_train)
    X_apt_test_selected = selector_apt.transform(X_apt_test)

    # Map the retained column positions back to feature names and scores.
    # NOTE(review): this assumes the columns of X_apt_train are ordered like
    # apt_features — confirm against the train/test split cell.
    _kept_idx = selector_apt.get_support(indices=True)
    selected_features_apt = [apt_features[pos] for pos in _kept_idx]
    scores_apt = selector_apt.scores_[_kept_idx]

    print(f" Appartements: {len(apt_features)} ‚Üí {len(selected_features_apt)} features")
    for kept_name, kept_score in zip(selected_features_apt, scores_apt):
        print(f"   ‚Ä¢ {kept_name}: {kept_score:.2f}")

# Houses: no selection, the train/test matrices are passed through unchanged
if mai_features:
    X_mai_selected, X_mai_test_selected = X_mai_train, X_mai_test
    selected_features_mai = mai_features

    print(f" Maisons: {len(mai_features)} features conserv√©es (√©viter overfitting)")
    print(f"   ‚Ä¢ Features: {mai_features}")

print("\n Feature selection termin√©e!")

 FEATURE SELECTION POUR APPARTEMENTS
 Appartements: 6 ‚Üí 5 features
   ‚Ä¢ sq_mt_built: 45321.89
   ‚Ä¢ n_bathrooms: 18929.79
   ‚Ä¢ n_rooms: 5384.45
   ‚Ä¢ has_lift: 1702.67
   ‚Ä¢ has_parking: 1136.13
 Maisons: 6 features conserv√©es (√©viter overfitting)
   ‚Ä¢ Features: ['sq_mt_built', 'n_bathrooms', 'n_rooms', 'has_garden', 'has_pool', 'neighborhood']

 Feature selection termin√©e!


## 5. Comparaison 3 Algorithmes

In [40]:
print("üè¢ APPARTEMENTS - 3 ALGORITHMES")
print("=" * 50)


def _fit_eval_apt(estimator, X_train, X_test):
    """Fit ``estimator`` on the apartment training target and score it on the test split.

    Returns a ``(predictions, r2, mae)`` tuple, both metrics being computed
    against ``y_apt_test``.
    """
    estimator.fit(X_train, y_apt_train)
    predictions = estimator.predict(X_test)
    return (
        predictions,
        r2_score(y_apt_test, predictions),
        mean_absolute_error(y_apt_test, predictions),
    )


# Random forest, trained directly on the selected features
rf_apt = RandomForestRegressor(n_estimators=100, random_state=42)
y_apt_pred_rf, r2_apt_rf, mae_apt_rf = _fit_eval_apt(
    rf_apt, X_apt_selected, X_apt_test_selected
)

# Ridge, trained on standardized features (scaler fitted on the training split only)
scaler_apt = StandardScaler()
X_apt_train_scaled = scaler_apt.fit_transform(X_apt_selected)
X_apt_test_scaled = scaler_apt.transform(X_apt_test_selected)

ridge_apt = Ridge(alpha=1.0, random_state=42)
y_apt_pred_ridge, r2_apt_ridge, mae_apt_ridge = _fit_eval_apt(
    ridge_apt, X_apt_train_scaled, X_apt_test_scaled
)

# Gradient boosting, trained directly on the selected features
gb_apt = GradientBoostingRegressor(n_estimators=100, random_state=42)
y_apt_pred_gb, r2_apt_gb, mae_apt_gb = _fit_eval_apt(
    gb_apt, X_apt_selected, X_apt_test_selected
)

# Side-by-side results
print(f"RandomForest:     R¬≤ = {r2_apt_rf:.4f}, MAE = {mae_apt_rf:,.0f}‚Ç¨")
print(f"Ridge:           R¬≤ = {r2_apt_ridge:.4f}, MAE = {mae_apt_ridge:,.0f}‚Ç¨")
print(f"GradientBoosting: R¬≤ = {r2_apt_gb:.4f}, MAE = {mae_apt_gb:,.0f}‚Ç¨")

# Registry of candidates: name -> (model, scaler or None, R², MAE)
models_apt = {
    'RandomForest': (rf_apt, None, r2_apt_rf, mae_apt_rf),
    'Ridge': (ridge_apt, scaler_apt, r2_apt_ridge, mae_apt_ridge),
    'GradientBoosting': (gb_apt, None, r2_apt_gb, mae_apt_gb),
}

# The winner is the candidate with the highest test-set R²
best_name_apt = max(models_apt, key=lambda name: models_apt[name][2])
best_model_apt, best_scaler_apt, r2_apt_best, mae_apt_best = models_apt[best_name_apt]

print(f"üèÜ Meilleur: {best_name_apt}")
print("=" * 50)

üè¢ APPARTEMENTS - 3 ALGORITHMES
RandomForest:     R¬≤ = 0.7532, MAE = 146,020‚Ç¨
Ridge:           R¬≤ = 0.7233, MAE = 167,215‚Ç¨
GradientBoosting: R¬≤ = 0.7717, MAE = 150,781‚Ç¨
üèÜ Meilleur: GradientBoosting


In [None]:
# Feature selection is applied to apartments only; houses keep every feature.
print("FEATURE SELECTION POUR APPARTEMENTS")

# Input features for each property type
apt_features = ['sq_mt_built', 'n_rooms', 'n_bathrooms', 'has_lift', 'has_parking', 'has_central_heating']
mai_features = ['sq_mt_built', 'n_rooms', 'n_bathrooms', 'has_garden', 'has_pool', 'neighborhood']

# Apartments: keep the k_best features with the highest univariate F-score.
# The training frame is indexed with apt_features so column positions line up
# with that list.
k_best = 5
selector_apt = SelectKBest(score_func=f_regression, k=k_best)
X_apt_selected = selector_apt.fit_transform(X_apt_train[apt_features], y_apt_train)

# Names of the retained features; scores_apt keeps the score of every input feature
_kept_idx = selector_apt.get_support(indices=True)
selected_features_apt = [apt_features[pos] for pos in _kept_idx]
scores_apt = selector_apt.scores_

print(f"Appartements: {len(apt_features)} ‚Üí {len(selected_features_apt)} features")
print("Features s√©lectionn√©es pour appartements:")
for rank, (kept_name, kept_score) in enumerate(
    zip(selected_features_apt, scores_apt[_kept_idx]), start=1
):
    print(f"  {rank}. {kept_name}: {kept_score:.2f}")

# Houses: no selection at all, every feature is kept (independent copy of the list)
selected_features_mai = list(mai_features)
print(f"Maisons: {len(mai_features)} features conserv√©es (√©viter overfitting)")
print("Features conserv√©es pour maisons:", selected_features_mai)

print("\nFeature selection termin√©e!")

 MAISONS - 3 ALGORITHMES (sans feature selection)
RandomForest:     R¬≤ = 0.7951, MAE = 285,420‚Ç¨
Ridge:           R¬≤ = 0.5264, MAE = 577,626‚Ç¨
GradientBoosting: R¬≤ = 0.7837, MAE = 338,849‚Ç¨
üèÜ Meilleur: RandomForest


## 5. Sauvegarde des modèles

## 6. Grid Search (Appartements uniquement)

In [None]:
print("MAISONS - 3 ALGORITHMES (sans feature selection)")


def _fit_eval_mai(estimator, X_train, X_test):
    """Fit ``estimator`` on the house training target and score it on the test split.

    Returns a ``(predictions, r2, mae)`` tuple, both metrics being computed
    against ``y_mai_test``.
    """
    estimator.fit(X_train, y_mai_train)
    predictions = estimator.predict(X_test)
    return (
        predictions,
        r2_score(y_mai_test, predictions),
        mean_absolute_error(y_mai_test, predictions),
    )


# House matrices restricted to the retained features (all of them for houses)
X_mai_train_selected = X_mai_train[selected_features_mai]
X_mai_test_selected = X_mai_test[selected_features_mai]

# Registry of candidates: name -> {'model', 'r2', 'mae'[, 'scaler']}
models_mai = {}

# 1. Random forest with default settings
rf_mai = RandomForestRegressor(n_estimators=100, random_state=42)
y_mai_pred_rf, r2_mai_rf, mae_mai_rf = _fit_eval_mai(
    rf_mai, X_mai_train_selected, X_mai_test_selected
)
models_mai['RandomForest'] = {'model': rf_mai, 'r2': r2_mai_rf, 'mae': mae_mai_rf}

print(f"RandomForest Maisons - R¬≤: {r2_mai_rf:.4f}, MAE: {mae_mai_rf:,.0f}‚Ç¨")

# 2. Ridge on standardized features (scaler fitted on the training split only)
scaler_mai = StandardScaler()
X_mai_train_scaled = scaler_mai.fit_transform(X_mai_train_selected)
X_mai_test_scaled = scaler_mai.transform(X_mai_test_selected)

ridge_mai = Ridge(alpha=1.0, random_state=42)
y_mai_pred_ridge, r2_mai_ridge, mae_mai_ridge = _fit_eval_mai(
    ridge_mai, X_mai_train_scaled, X_mai_test_scaled
)
models_mai['Ridge'] = {'model': ridge_mai, 'r2': r2_mai_ridge, 'mae': mae_mai_ridge, 'scaler': scaler_mai}

print(f"Ridge Maisons - R¬≤: {r2_mai_ridge:.4f}, MAE: {mae_mai_ridge:,.0f}‚Ç¨")

# 3. Gradient boosting with default settings
gb_mai = GradientBoostingRegressor(n_estimators=100, random_state=42)
y_mai_pred_gb, r2_mai_gb, mae_mai_gb = _fit_eval_mai(
    gb_mai, X_mai_train_selected, X_mai_test_selected
)
models_mai['GradientBoosting'] = {'model': gb_mai, 'r2': r2_mai_gb, 'mae': mae_mai_gb}

print(f"GradientBoosting Maisons - R¬≤: {r2_mai_gb:.4f}, MAE: {mae_mai_gb:,.0f}‚Ç¨")

# The winner is the candidate with the highest test-set R²; only the Ridge
# entry carries a scaler, so best_scaler_mai is None for the tree-based models.
best_name_mai = max(models_mai, key=lambda name: models_mai[name]['r2'])
_best_entry_mai = models_mai[best_name_mai]
best_model_mai = _best_entry_mai['model']
r2_mai_best = _best_entry_mai['r2']
mae_mai_best = _best_entry_mai['mae']
best_scaler_mai = _best_entry_mai.get('scaler')

print(f"\nMeilleur mod√®le maisons: {best_name_mai}")
print(f"Performance: R¬≤ = {r2_mai_best:.4f}, MAE = {mae_mai_best:,.0f}‚Ç¨")

 GRID SEARCH OPTIMIZATION - APPARTEMENTS SEULEMENT
üè¢ Optimisation GradientBoosting pour appartements...
Appartements AVANT optimisation: R¬≤ = 0.7717
Appartements APR√àS optimisation: R¬≤ = 0.7781
Meilleurs param√®tres: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50}

 Maisons: AUCUNE optimisation (dataset trop petit: 2093 √©chantillons)
   ‚Üí Garde le mod√®le RandomForest par d√©faut pour √©viter l'overfitting
   ‚Üí Performance finale: R¬≤ = 0.7951, MAE = 285,420‚Ç¨

 Optimisation termin√©e !
   üè¢ Appartements: optimis√©s avec Grid Search
    Maisons: mod√®le par d√©faut (anti-overfitting)


In [None]:
_separator = "=" * 50
print(" R√âENTRA√éNEMENT SUR TOUTES LES DONN√âES")
print(_separator)

# Fresh random forests fitted on the complete datasets (no train/test split).
# fit() returns the estimator itself, so construction and fitting are chained.
rf_apt_final = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_apt, y_apt)
rf_mai_final = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_mai, y_mai)

# Summary: number of samples each final model was trained on
print(f" Mod√®les finaux entra√Æn√©s sur 100% des donn√©es")
print(f"   Appartements: {len(X_apt)} √©chantillons")
print(f"   Maisons: {len(X_mai)} √©chantillons")
print(_separator)

In [None]:
print("GRID SEARCH OPTIMIZATION - APPARTEMENTS SEULEMENT")

# Hyperparameter grid explored for the apartment model only
param_grid_apt = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
)

# Exhaustive search scored by R² under shuffled 5-fold cross-validation
cv = KFold(n_splits=5, shuffle=True, random_state=42)
_gb_base = GradientBoostingRegressor(random_state=42)
grid_apt = GridSearchCV(
    estimator=_gb_base,
    param_grid=param_grid_apt,
    cv=cv,
    scoring='r2',
    n_jobs=-1,
    verbose=1,
)

# Run the search on the apartment training split, restricted to the selected features
print("Recherche des meilleurs hyperparam√®tres pour appartements...")
X_apt_selected_features = X_apt_train[selected_features_apt]
grid_apt.fit(X_apt_selected_features, y_apt_train)

# Best estimator found by the search
best_model_apt_optimized = grid_apt.best_estimator_
print(f"Meilleurs param√®tres: {grid_apt.best_params_}")

# Score the tuned model on the held-out apartment test split
X_apt_test_selected = X_apt_test[selected_features_apt]
y_apt_pred_optimized = best_model_apt_optimized.predict(X_apt_test_selected)
r2_apt_optimized, mae_apt_optimized = (
    r2_score(y_apt_test, y_apt_pred_optimized),
    mean_absolute_error(y_apt_test, y_apt_pred_optimized),
)

print(f"Performance optimis√©e appartements: R¬≤ = {r2_apt_optimized:.4f}, MAE = {mae_apt_optimized:,.0f}‚Ç¨")

# Houses: no tuning, the previously selected model and its metrics are reused as-is
print(f"\nMaisons: AUCUNE optimisation (dataset trop petit: {len(X_mai_train)} √©chantillons)")
best_model_mai_optimized, r2_mai_optimized, mae_mai_optimized = (
    best_model_mai,
    r2_mai_best,
    mae_mai_best,
)

print(f"Performance maisons (par d√©faut): R¬≤ = {r2_mai_optimized:.4f}, MAE = {mae_mai_optimized:,.0f}‚Ç¨")

print("\nOptimisation termin√©e !")
print(f"Appartements: GradientBoosting optimis√© - R¬≤: {r2_apt_optimized:.4f}")
print("   Maisons: mod√®le par d√©faut (anti-overfitting)")

# === RETRAIN ON THE FULL DATASET ===
print("R√âENTRA√éNEMENT SUR TOUTES LES DONN√âES")

# Apartments: refit the tuned model on every sample, selected features only
X_apt_all_selected = X_apt[selected_features_apt]
best_model_apt_optimized.fit(X_apt_all_selected, y_apt)

# Houses: refit the selected default model on every sample.
# When that model comes with a scaler (the Ridge candidate is trained on
# standardized inputs), the scaler is refit on the full data as well and the
# model is trained on its output. Otherwise the model would be refit on raw
# features while the saved scaler still described the training split only.
X_mai_all_selected = X_mai[selected_features_mai]
if best_scaler_mai is not None:
    X_mai_all_fit = best_scaler_mai.fit_transform(X_mai_all_selected)
else:
    X_mai_all_fit = X_mai_all_selected
best_model_mai_optimized.fit(X_mai_all_fit, y_mai)

print(f"Mod√®les finaux entra√Æn√©s sur 100% des donn√©es")

# === SAVE THE MODELS ===
# json is imported here because it is not among the imports at the top of
# this notebook.
import json

# Location of the optional house scaler; also recorded in the metadata
maisons_scaler_path = '../models/maisons_scaler.pkl'

# Metadata describing both saved models. It is written as JSON, so it must
# only hold JSON-serializable values: metrics are cast to plain floats, and
# the house scaler is referenced by the path of its pickle file (or None when
# the selected model uses no scaler) rather than stored as an object, which
# json.dump cannot serialize.
model_metadata = {
    'appartements': {
        'model_name': 'GradientBoosting_optimized',
        'features': list(selected_features_apt),
        'r2_score': float(r2_apt_optimized),
        'mae': float(mae_apt_optimized),
        'best_params': grid_apt.best_params_,
        'scaler': None  # gradient boosting is trained on unscaled features
    },
    'maisons': {
        'model_name': f'{best_name_mai}_default',
        'features': list(selected_features_mai),
        'r2_score': float(r2_mai_optimized),
        'mae': float(mae_mai_optimized),
        'best_params': None,
        'scaler': maisons_scaler_path if best_scaler_mai is not None else None
    }
}

# Everything is written to the models/ folder one level up
os.makedirs('../models', exist_ok=True)

with open('../models/appartements_model.pkl', 'wb') as f:
    pickle.dump(best_model_apt_optimized, f)

with open('../models/maisons_model.pkl', 'wb') as f:
    pickle.dump(best_model_mai_optimized, f)

# The house scaler is only saved when the selected model needs one
if best_scaler_mai is not None:
    with open(maisons_scaler_path, 'wb') as f:
        pickle.dump(best_scaler_mai, f)

# Metadata file
with open('../models/model_config.json', 'w', encoding='utf-8') as f:
    json.dump(model_metadata, f, indent=2)

print("MOD√àLES SAUVEGARD√âS")
print(f"Appartements: GradientBoosting_optimized - R¬≤: {r2_apt_optimized:.4f}, MAE: {mae_apt_optimized:,.0f}‚Ç¨")
print(f"Maisons: {best_name_mai}_default - R¬≤: {r2_mai_best:.4f}, MAE: {mae_mai_best:,.0f}‚Ç¨")

print("\nFichiers sauvegard√©s:")
print("- ../models/appartements_model.pkl")
print("- ../models/maisons_model.pkl")
print("- ../models/model_config.json")
if best_scaler_mai is not None:
    print("- ../models/maisons_scaler.pkl")

print("Strat√©gie: Optimisation intelligente selon la taille du dataset")

 MOD√àLES SAUVEGARD√âS
üè¢ Appartements: GradientBoosting_optimized - R¬≤: 0.7781, MAE: 147,911‚Ç¨
   ‚Ä¢ Feature Selection: 6 ‚Üí 5 features
   ‚Ä¢ Grid Search: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50}

 Maisons: RandomForest_default - R¬≤: 0.7951, MAE: 285,420‚Ç¨
   ‚Ä¢ Feature Selection: AUCUNE (√©viter overfitting)
   ‚Ä¢ Grid Search: AUCUN (dataset trop petit)

 Strat√©gie: Optimisation intelligente selon la taille du dataset
