# Entrainement model Lille avec ml flow 

In [52]:
# Installation des d√©pendances si n√©cessaire
# !pip install mlflow scikit-learn pandas numpy xgboost

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
import mlflow.sklearn
import joblib

print("Imports r√©ussis")

Imports r√©ussis


## Configuration de mlflow 

In [53]:
# Configuration du tracking URI pour le registry distant
mlflow.set_tracking_uri("file:./mlruns")

# Nom de l'exp√©rience
mlflow.set_experiment("immoprice_lille_experiment")

print(" MLflow configur√©")
print(f"   Tracking URI: {mlflow.get_tracking_uri()}")
print(f"   Exp√©rience: immoprice_lille_experiment")

 MLflow configur√©
   Tracking URI: file:./mlruns
   Exp√©rience: immoprice_lille_experiment


## Chargement des donn√©es 

In [54]:
import pandas as pd

print(" Chargement des donn√©es France 2024...")
df_full = pd.read_csv("../data/Valeurs_foncieres-2024.txt", sep="|", low_memory=False)

print(f" Total transactions France : {len(df_full)}")

# Filtrer le d√©partement Nord (59)
df_nord = df_full[df_full['Code departement'] == '59'].copy()
print(f"Transactions d√©partement Nord : {len(df_nord)}")

# Trouver Lille
print(f"\n Recherche de Lille...")
df_lille_search = df_nord[df_nord['Commune'].str.contains('LILLE', case=False, na=False)]

if len(df_lille_search) > 0:
    print(f"\n Villes trouv√©es contenant 'LILLE' :")
    print(df_lille_search.groupby(['Code commune', 'Commune']).size())
    
    # Prendre le code commune de Lille
    code_lille = df_lille_search['Code commune'].mode()[0]
    print(f"\n Code commune Lille : {code_lille}")
    
    # Filtrer avec le bon code
    df_lille = df_nord[df_nord['Code commune'] == code_lille].copy()
    print(f"Transactions Lille 2024 : {len(df_lille)}")
    
    # Sauvegarder
    df_lille.to_csv("../data/lille_2024.csv", index=False)
    print("Fichier lille_2024.csv cr√©√©")
else:
    print(" Lille non trouv√© dans les donn√©es")

 Chargement des donn√©es France 2024...
 Total transactions France : 3489149
Transactions d√©partement Nord : 95099

 Recherche de Lille...

 Villes trouv√©es contenant 'LILLE' :
Code commune  Commune            
350           LILLE                  8356
386           MARQUETTE-LEZ-LILLE     289
dtype: int64

 Code commune Lille : 350
Transactions Lille 2024 : 8356
Fichier lille_2024.csv cr√©√©


##  Chargement et exploration des donn√©es

In [55]:
def load_and_explore_data():
    """Charger les donn√©es de Lille"""
    print(" Chargement des donn√©es de Lille...")
    
    df = pd.read_csv("../data/lille_2024.csv")
    print(f" Donn√©es charg√©es : {len(df)} transactions")
    
    # Explorer les colonnes disponibles
    print("\n Colonnes disponibles :")
    for i, col in enumerate(df.columns, 1):
        print(f"  {i:2d}. {col}")
    
    # Types de logements
    if 'Type local' in df.columns:
        print(f"\n  Types de logements :")
        print(df['Type local'].value_counts())
    
    # Nombres de pi√®ces
    if 'Nombre pieces principales' in df.columns:
        print(f"\n R√©partition nombre de pi√®ces :")
        print(df['Nombre pieces principales'].value_counts().sort_index())
    
    return df

# Chargement
df = load_and_explore_data()

 Chargement des donn√©es de Lille...
 Donn√©es charg√©es : 8356 transactions

 Colonnes disponibles :
   1. Identifiant de document
   2. Reference document
   3. 1 Articles CGI
   4. 2 Articles CGI
   5. 3 Articles CGI
   6. 4 Articles CGI
   7. 5 Articles CGI
   8. No disposition
   9. Date mutation
  10. Nature mutation
  11. Valeur fonciere
  12. No voie
  13. B/T/Q
  14. Type de voie
  15. Code voie
  16. Voie
  17. Code postal
  18. Commune
  19. Code departement
  20. Code commune
  21. Prefixe de section
  22. Section
  23. No plan
  24. No Volume
  25. 1er lot
  26. Surface Carrez du 1er lot
  27. 2eme lot
  28. Surface Carrez du 2eme lot
  29. 3eme lot
  30. Surface Carrez du 3eme lot
  31. 4eme lot
  32. Surface Carrez du 4eme lot
  33. 5eme lot
  34. Surface Carrez du 5eme lot
  35. Nombre de lots
  36. Code type local
  37. Type local
  38. Identifiant local
  39. Surface reelle bati
  40. Nombre pieces principales
  41. Nature culture
  42. Nature culture speciale
  43. S

## Filtrage des logements 4 pieces

In [56]:
def filter_4_pieces_data(df):
    """Filtrer les logements de 4 pi√®ces"""
    print("\n Filtrage des logements 4 pi√®ces...")
    
    # Filtrer les 4 pi√®ces
    df_4p = df[df['Nombre pieces principales'] == 4.0].copy()
    print(f"Logements 4 pi√®ces : {len(df_4p)} transactions")
    
    if len(df_4p) == 0:
        print(" Aucun logement 4 pi√®ces trouv√© !")
        return None
    
    # Types de logements 4 pi√®ces
    print(f"\n Types de logements 4 pi√®ces :")
    print(df_4p['Type local'].value_counts())
    
    return df_4p

# Filtrage
df_4p = filter_4_pieces_data(df)

# S√©paration appartements / maisons
appartements = df_4p[df_4p['Type local'] == 'Appartement'].copy()
maisons = df_4p[df_4p['Type local'] == 'Maison'].copy()

print(f"\nAppartements 4 pi√®ces : {len(appartements)}")
print(f" Maisons 4 pi√®ces : {len(maisons)}")


 Filtrage des logements 4 pi√®ces...
Logements 4 pi√®ces : 551 transactions

 Types de logements 4 pi√®ces :
Type local
Appartement    280
Maison         271
Name: count, dtype: int64

Appartements 4 pi√®ces : 280
 Maisons 4 pi√®ces : 271


## Nettoyage des donn√©es 

In [57]:
def select_features_and_clean(df, dataset_name):
    """S√©lectionner les colonnes et nettoyer"""
    print(f"\nüßπ Nettoyage donn√©es {dataset_name}...")
    
    if len(df) == 0:
        print(f" Aucune donn√©e pour {dataset_name}")
        return None, None, None
    
    # Colonnes √† conserver
    required_cols = ['Surface reelle bati', 'Nombre pieces principales', 'Type local', 'Valeur fonciere']
    optional_cols = ['Surface terrain', 'Nombre de lots']
    
    available_cols = [col for col in required_cols if col in df.columns]
    available_optional = [col for col in optional_cols if col in df.columns]
    
    selected_cols = available_cols + available_optional
    df_clean = df[selected_cols].copy()
    
    # CONVERTIR LES COLONNES NUM√âRIQUES (virgules ‚Üí points)
    numeric_cols = ['Surface reelle bati', 'Valeur fonciere', 'Surface terrain', 'Nombre de lots', 'Nombre pieces principales']
    for col in numeric_cols:
        if col in df_clean.columns:
            # Remplacer virgule par point et convertir en float
            df_clean[col] = df_clean[col].astype(str).str.replace(',', '.').replace('', '0')
            df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')
    
    # Cr√©er prix_m2
    df_clean['prix_m2'] = df_clean['Valeur fonciere'] / df_clean['Surface reelle bati']
    
    print(f"   Avant nettoyage : {len(df_clean)} lignes")
    
    # Remplacer valeurs manquantes
    if 'Surface terrain' in df_clean.columns:
        df_clean['Surface terrain'] = df_clean['Surface terrain'].fillna(0)
    if 'Nombre de lots' in df_clean.columns:
        df_clean['Nombre de lots'] = df_clean['Nombre de lots'].fillna(1)
    
    # Supprimer lignes avec donn√©es essentielles manquantes
    df_clean = df_clean.dropna(subset=['Surface reelle bati', 'Valeur fonciere', 'prix_m2'])
    
    # Retirer outliers (IQR)
    if len(df_clean) > 0:
        Q1 = df_clean['prix_m2'].quantile(0.25)
        Q3 = df_clean['prix_m2'].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        mask = (df_clean['prix_m2'] >= lower_bound) & (df_clean['prix_m2'] <= upper_bound)
        df_clean = df_clean[mask]
    
    print(f"   Apr√®s nettoyage : {len(df_clean)} lignes")
    
    # Pr√©parer X et y
    feature_cols = [col for col in df_clean.columns if col not in ['Valeur fonciere', 'prix_m2', 'Type local']]
    X = df_clean[feature_cols]
    y = df_clean['prix_m2']
    
    print(f"   Prix m¬≤ moyen: {df_clean['prix_m2'].mean():.2f} ‚Ç¨")
    print(f"   Variables : {list(X.columns)}")
    
    return X, y, df_clean

# Nettoyage
print("\n" + "="*50)
print("üßπ NETTOYAGE APPARTEMENTS")
print("="*50)
X_apt, y_apt, df_apt_clean = select_features_and_clean(appartements, "APPARTEMENTS")

print("\n" + "="*50)
print("üßπ NETTOYAGE MAISONS")
print("="*50)
X_maisons, y_maisons, df_maisons_clean = select_features_and_clean(maisons, "MAISONS")


üßπ NETTOYAGE APPARTEMENTS

üßπ Nettoyage donn√©es APPARTEMENTS...
   Avant nettoyage : 280 lignes
   Apr√®s nettoyage : 264 lignes
   Prix m¬≤ moyen: 3130.19 ‚Ç¨
   Variables : ['Surface reelle bati', 'Nombre pieces principales', 'Surface terrain', 'Nombre de lots']

üßπ NETTOYAGE MAISONS

üßπ Nettoyage donn√©es MAISONS...
   Avant nettoyage : 271 lignes
   Apr√®s nettoyage : 251 lignes
   Prix m¬≤ moyen: 2894.53 ‚Ç¨
   Variables : ['Surface reelle bati', 'Nombre pieces principales', 'Surface terrain', 'Nombre de lots']


## Division train/test 

In [58]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Division train/test
if X_apt is not None:
    X_train_apt, X_test_apt, y_train_apt, y_test_apt = train_test_split(
        X_apt, y_apt, test_size=0.2, random_state=42
    )
    print(f"Appartements - Train: {len(X_train_apt)}, Test: {len(X_test_apt)}")

if X_maisons is not None:
    X_train_maisons, X_test_maisons, y_train_maisons, y_test_maisons = train_test_split(
        X_maisons, y_maisons, test_size=0.2)

Appartements - Train: 211, Test: 53


## Standardisation 

In [59]:
# Standardisation
if X_train_apt is not None:
    scaler_apt = StandardScaler()
    X_train_apt_scaled = scaler_apt.fit_transform(X_train_apt)
    X_test_apt_scaled = scaler_apt.transform(X_test_apt)
    print("Scaler appartements cree")

if X_train_maisons is not None:
    scaler_maisons = StandardScaler()
    X_train_maisons_scaled = scaler_maisons.fit_transform(X_train_maisons)
    X_test_maisons_scaled = scaler_maisons.transform(X_test_maisons)
    print("Scaler maisons cree")

Scaler appartements cree
Scaler maisons cree


## Entrainement appartements avec MlFlow

In [60]:
print("\n" + "="*60)
print("ENTRAINEMENT APPARTEMENTS AVEC MLFLOW")
print("="*60)


    # Demarrage du run MLflow
with mlflow.start_run(run_name="appartements_random_forest_lille_2024") as run:
        
        print("\nLogging parametres...")
        # Log des parametres du dataset
        mlflow.log_param("dataset", "appartements_lille_4pieces_2024")
        mlflow.log_param("n_samples_train", len(X_train_apt))
        mlflow.log_param("n_samples_test", len(X_test_apt))
        mlflow.log_param("n_features", X_train_apt.shape[1])
        mlflow.log_param("test_size", 0.2)
        
        # Parametres GridSearch
        rf_params = {
            'n_estimators': [50, 100],
            'max_depth': [10, 15, None],
            'min_samples_split': [2, 5],
            'min_samples_leaf': [1, 2]
        }
        
        mlflow.log_params({f"grid_{k}": str(v) for k, v in rf_params.items()})
        
        print("GridSearchCV en cours...")
        grid_rf_apt = GridSearchCV(
            RandomForestRegressor(random_state=42, n_jobs=-1),
            rf_params,
            cv=5,
            scoring='neg_mean_squared_error',
            n_jobs=-1,
            verbose=1
        )
        
        grid_rf_apt.fit(X_train_apt_scaled, y_train_apt)
        
        # Meilleur modele
        best_model_apt = grid_rf_apt.best_estimator_
        
        # Predictions
        y_pred_train_apt = best_model_apt.predict(X_train_apt_scaled)
        y_pred_test_apt = best_model_apt.predict(X_test_apt_scaled)
        
        # Metriques
        train_mse_apt = mean_squared_error(y_train_apt, y_pred_train_apt)
        test_mse_apt = mean_squared_error(y_test_apt, y_pred_test_apt)
        train_rmse_apt = np.sqrt(train_mse_apt)
        test_rmse_apt = np.sqrt(test_mse_apt)
        train_r2_apt = r2_score(y_train_apt, y_pred_train_apt)
        test_r2_apt = r2_score(y_test_apt, y_pred_test_apt)
        
        print("\nLogging metriques...")
        # Log metriques
        mlflow.log_metric("train_mse", train_mse_apt)
        mlflow.log_metric("test_mse", test_mse_apt)
        mlflow.log_metric("train_rmse", train_rmse_apt)
        mlflow.log_metric("test_rmse", test_rmse_apt)
        mlflow.log_metric("train_r2", train_r2_apt)
        mlflow.log_metric("test_r2", test_r2_apt)
        
        # Log meilleurs parametres
        mlflow.log_params({f"best_{k}": v for k, v in grid_rf_apt.best_params_.items()})
        
        print("\nSauvegarde du modele...")
        # Enregistrement du modele dans MLflow
        mlflow.sklearn.log_model(
            best_model_apt,
            "model",
            registered_model_name="immoprice_appartements_lille"
        )
        
        # Sauvegarde du scaler
        mlflow.sklearn.log_model(
            scaler_apt,
            "scaler"
        )
        
        # Affichage resultats
        print("\n" + "="*60)
        print("RESULTATS APPARTEMENTS")
        print("="*60)
        print(f"Meilleurs parametres: {grid_rf_apt.best_params_}")
        print(f"Train RMSE: {train_rmse_apt:>12,.2f} EUR/m2")
        print(f"Test RMSE:  {test_rmse_apt:>12,.2f} EUR/m2")
        print(f"Train R2:   {train_r2_apt:>12.3f}")
        print(f"Test R2:    {test_r2_apt:>12.3f}")
        print(f"\nRun ID: {run.info.run_id}")
        print(f"Model URI: runs:/{run.info.run_id}/model")
        
        # Sauvegarder Run ID
        apt_run_id = run.info.run_id


ENTRAINEMENT APPARTEMENTS AVEC MLFLOW

Logging parametres...
GridSearchCV en cours...
Fitting 5 folds for each of 24 candidates, totalling 120 fits

Logging metriques...

Sauvegarde du modele...


Registered model 'immoprice_appartements_lille' already exists. Creating a new version of this model...
Created version '2' of model 'immoprice_appartements_lille'.



RESULTATS APPARTEMENTS
Meilleurs parametres: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
Train RMSE:       748.87 EUR/m2
Test RMSE:      1,022.54 EUR/m2
Train R2:          0.447
Test R2:          -0.405

Run ID: cc34f04d13ee4fd3b984a7d16ad7d919
Model URI: runs:/cc34f04d13ee4fd3b984a7d16ad7d919/model


## Entrainement maison avec MlFlow 

In [None]:
print("\n" + "="*60)
print("ENTRAINEMENT MAISONS AVEC MLFLOW")
print("="*60)

# D√©marage du run mlflow
with mlflow.start_run(run_name="maisons_random_forest_lille_2024") as run:
    # Logs des param√®tres du dataset
    print("\nLogging parametres...")
    mlflow.log_param("dataset", "maisons_lille_4pieces_2024")
    mlflow.log_param("n_samples_train", len(X_train_maisons))
    mlflow.log_param("n_samples_test", len(X_test_maisons))
    mlflow.log_param("n_features", X_train_maisons.shape[1])
    mlflow.log_param("test_size", 0.2)

    # Logs des param√®tres grid search 
    rf_params = {
        'n_estimators': [50, 100],
        'max_depth': [10, 15, None],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    }
    
    mlflow.log_params({f"grid_{k}": str(v) for k, v in rf_params.items()})
    
    print("GridSearchCV en cours...")
    grid_rf_maisons = GridSearchCV(
        RandomForestRegressor(random_state=42, n_jobs=-1),
        rf_params,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        verbose=1
    )
    
    grid_rf_maisons.fit(X_train_maisons_scaled, y_train_maisons)
    
    # Meilleur mod√®l
    best_model_maisons = grid_rf_maisons.best_estimator_
    
    # Pr√©dictions
    y_pred_train_maisons = best_model_maisons.predict(X_train_maisons_scaled)
    y_pred_test_maisons = best_model_maisons.predict(X_test_maisons_scaled)
    
    # Log m√©triques 
    train_mse_maisons = mean_squared_error(y_train_maisons, y_pred_train_maisons)
    test_mse_maisons = mean_squared_error(y_test_maisons, y_pred_test_maisons)
    train_rmse_maisons = np.sqrt(train_mse_maisons)
    test_rmse_maisons = np.sqrt(test_mse_maisons)
    train_r2_maisons = r2_score(y_train_maisons, y_pred_train_maisons)
    test_r2_maisons = r2_score(y_test_maisons, y_pred_test_maisons)
    
    #Log m√©triques 
    print("\nLogging metriques...")
    mlflow.log_metric("train_mse", train_mse_maisons)
    mlflow.log_metric("test_mse", test_mse_maisons)
    mlflow.log_metric("train_rmse", train_rmse_maisons)
    mlflow.log_metric("test_rmse", test_rmse_maisons)
    mlflow.log_metric("train_r2", train_r2_maisons)
    mlflow.log_metric("test_r2", test_r2_maisons)
    
    #Meilleur Param√®tres 
    mlflow.log_params({f"best_{k}": v for k, v in grid_rf_maisons.best_params_.items()})
    
    # Enregistrments  du mod√®le dans MlFlow
    print("\nSauvegarde du modele...")
    mlflow.sklearn.log_model(
        best_model_maisons,
        "model",
        registered_model_name="immoprice_maisons_lille"
    )
    
    # Sauvegarde du scaler 
    mlflow.sklearn.log_model(
        scaler_maisons,
        "scaler"
    )
    
    print("\n" + "="*60)
    print("RESULTATS MAISONS")
    print("="*60)
    print(f"Meilleurs parametres: {grid_rf_maisons.best_params_}")
    print(f"Train RMSE: {train_rmse_maisons:>12,.2f} EUR/m2")
    print(f"Test RMSE:  {test_rmse_maisons:>12,.2f} EUR/m2")
    print(f"Train R2:   {train_r2_maisons:>12.3f}")
    print(f"Test R2:    {test_r2_maisons:>12.3f}")
    print(f"\nRun ID: {run.info.run_id}")
    print(f"Model URI: runs:/{run.info.run_id}/model")
    
    maisons_run_id = run.info.run_id


ENTRAINEMENT MAISONS AVEC MLFLOW

Logging parametres...
GridSearchCV en cours...
Fitting 5 folds for each of 24 candidates, totalling 120 fits

Logging metriques...

Sauvegarde du modele...


Successfully registered model 'immoprice_maisons_lille'.
Created version '1' of model 'immoprice_maisons_lille'.



RESULTATS MAISONS
Meilleurs parametres: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
Train RMSE:       557.60 EUR/m2
Test RMSE:        876.63 EUR/m2
Train R2:          0.560
Test R2:          -0.023

Run ID: a414d25447634108971c44dbf5e80c93
Model URI: runs:/a414d25447634108971c44dbf5e80c93/model


Exception ignored in: <function ResourceTracker.__del__ at 0x108a93f60>
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.12/3.12.12/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/resource_tracker.py", line 77, in __del__
  File "/opt/homebrew/Cellar/python@3.12/3.12.12/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/resource_tracker.py", line 86, in _stop
  File "/opt/homebrew/Cellar/python@3.12/3.12.12/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/resource_tracker.py", line 111, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x108907f60>
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.12/3.12.12/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/resource_tracker.py", line 77, in __del__
  File "/opt/homebrew/Cellar/python@3.12/3.12.12/Frameworks/Python.framework/

In [67]:
# Pointer vers notebooks/mlruns
mlflow.set_tracking_uri("file:./mlruns")  # Au lieu de ../mlruns

client = mlflow.tracking.MlflowClient()

# Lister les runs de l'exp√©rience 273489206835104918
runs = client.search_runs(experiment_ids=["273489206835104918"])

print(f"Nombre de runs : {len(runs)}")
print("\nRuns disponibles :")
print("="*80)

for run in runs:
    print(f"\nRun ID: {run.info.run_id}")
    print(f"Run Name: {run.data.tags.get('mlflow.runName', 'N/A')}")
    print(f"Dataset: {run.data.params.get('dataset', 'N/A')}")
    test_rmse = run.data.metrics.get('test_rmse')
    print(f"Test RMSE: {test_rmse:.2f}" if test_rmse else "Test RMSE: N/A")
    print("-"*80)

Nombre de runs : 5

Runs disponibles :

Run ID: a414d25447634108971c44dbf5e80c93
Run Name: maisons_random_forest_lille_2024
Dataset: maisons_lille_4pieces_2024
Test RMSE: 876.63
--------------------------------------------------------------------------------

Run ID: cc34f04d13ee4fd3b984a7d16ad7d919
Run Name: appartements_random_forest_lille_2024
Dataset: appartements_lille_4pieces_2024
Test RMSE: 1022.54
--------------------------------------------------------------------------------

Run ID: 115dd16d4ff540b39509084da7b2a475
Run Name: appartements_random_forest_lille_2024
Dataset: appartements_lille_4pieces_2024
Test RMSE: 1022.54
--------------------------------------------------------------------------------

Run ID: d4a9e87b8c0a40839f385f783772ead7
Run Name: appartements_random_forest_lille_2024
Dataset: appartements_lille_4pieces_2024
Test RMSE: N/A
--------------------------------------------------------------------------------

Run ID: cb18af1224ae4613ab3d9ca4ceaaace1
Run Name: 