In [1]:
# ================================================
# 1. Imports
# ================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor
import time

In [2]:
# MLflow configuration: where runs are sent and which experiment groups them.
TRACKING_URI = "http://127.0.0.1:8000"
EXPERIMENT_NAME = "Experiment Tracking - House Price Prediction"

# Point the MLflow client at the tracking server.
mlflow.set_tracking_uri(uri=TRACKING_URI)

# Select the experiment for all subsequent runs; kept as the last expression
# so the notebook still displays the returned Experiment object.
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='mlflow-artifacts:/289806203540587791', creation_time=1764829293310, experiment_id='289806203540587791', last_update_time=1764829293310, lifecycle_stage='active', name='Experiment Tracking - House Price Prediction', tags={}>

In [3]:
# Read the train and eval CSVs from ../data/processed
# (paths are relative to the notebook's working directory).
train_df, eval_df = (
    pd.read_csv(csv_path)
    for csv_path in (r"../data/processed/train.csv", r"../data/processed/eval.csv")
)

In [4]:
# ================================================
# 3. Define target & features
# ================================================
target = "price"


def _split_features_target(frame, target_col):
    """Return (features, target): every column except `target_col`, and `target_col` itself."""
    return frame.drop(columns=[target_col]), frame[target_col]


X_train, y_train = _split_features_target(train_df, target)
X_eval, y_eval = _split_features_target(eval_df, target)

# Report the feature-matrix dimensions of each split.
for label, feats in (("Train shape:", X_train), ("Eval shape:", X_eval)):
    print(label, feats.shape)

Train shape: (576860, 34)
Eval shape: (148449, 34)


In [5]:
def train_and_track(run_name, model_obj, *, train_data=None, eval_data=None):
    """
    Train a regressor inside an MLflow run and log its hyperparameters and metrics.

    The estimator is placed behind a median imputer and wrapped in a
    TransformedTargetRegressor, so it is fitted on log1p(target) and its
    predictions are mapped back with expm1 before scoring. RMSE and R2 are
    therefore computed on the original target scale.

    Parameters
    ----------
    run_name : str
        Name of the MLflow run; also used in the progress messages.
    model_obj : estimator
        scikit-learn compatible regressor used as the final pipeline step.
    train_data : tuple (X, y), optional
        Training features and target. When omitted, the notebook-level
        ``X_train`` / ``y_train`` are used.
    eval_data : tuple (X, y), optional
        Evaluation features and target. When omitted, the notebook-level
        ``X_eval`` / ``y_eval`` are used.

    Returns
    -------
    None
        Results are printed and logged to MLflow. Nothing is returned, so a
        call placed last in a notebook cell produces no extra cell output.

    Logged to MLflow
    ----------------
    params  : n_estimators, learning_rate, max_depth (only those the model exposes)
    metrics : rmse, r2_score, training_time_seconds
    """
    # Default to the notebook-level splits so existing two-argument calls behave as before.
    X_fit, y_fit = (X_train, y_train) if train_data is None else train_data
    X_val, y_val = (X_eval, y_eval) if eval_data is None else eval_data

    print(f"\n STARTING: {run_name}")

    with mlflow.start_run(run_name=run_name):
        # 1. Log hyperparameters first, so a run that fails during training
        #    still records the configuration that was attempted.
        for param_name in ("n_estimators", "learning_rate", "max_depth"):
            if hasattr(model_obj, param_name):
                mlflow.log_param(param_name, getattr(model_obj, param_name))

        # 2. Setup pipeline: fill missing feature values with the column median,
        #    then hand the result to the regressor.
        pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('regressor', model_obj)
        ])

        # Fit on log1p(y) and invert with expm1 at predict time.
        # NOTE(review): log1p needs y > -1; assumes non-negative prices - confirm.
        final_model = TransformedTargetRegressor(
            regressor=pipeline,
            func=np.log1p,
            inverse_func=np.expm1
        )

        # 3. Train & time it. perf_counter is monotonic, so the measured duration
        #    cannot be skewed by system clock adjustments (unlike time.time()).
        start_time = time.perf_counter()
        final_model.fit(X_fit, y_fit)
        duration = time.perf_counter() - start_time
        print(f" Training Time: {duration:.2f} seconds ({duration/60:.2f} minutes)")

        # 4. Evaluate on the held-out split (predictions are already back on the
        #    original scale thanks to inverse_func).
        preds = final_model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, preds))
        r2 = r2_score(y_val, preds)

        print(f"📊 RMSE: {rmse:,.0f} | R2: {r2:.4f}")

        # 5. Log metrics to MLflow
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2_score", r2)
        mlflow.log_metric("training_time_seconds", duration)

    print(f"FINISHED: {run_name}")

In [8]:
# ==========================================
# MODEL 1: Random Forest
# ==========================================
# Hyperparameters are collected in one mapping so they are easy to scan and tweak.
rf_params = dict(
    n_estimators=100,
    max_depth=20,
    max_samples=0.5,
    random_state=42,
    n_jobs=-1,
)
rf_model = RandomForestRegressor(**rf_params)

train_and_track(run_name="RF_Optimized", model_obj=rf_model)


 STARTING: RF_Optimized
 Training Time: 176.88 seconds (2.95 minutes)
📊 RMSE: 74,203 | R2: 0.9574




🏃 View run RF_Optimized at: http://127.0.0.1:8000/#/experiments/289806203540587791/runs/30d866c768124ff5b9e17c9baac6ce6c
🧪 View experiment at: http://127.0.0.1:8000/#/experiments/289806203540587791
FINISHED: RF_Optimized


In [None]:
# ==========================================
# MODEL 2: XGBoost Baseline
# ==========================================
# Only the seed and parallelism are set; every other hyperparameter is left
# at the library default to serve as a reference point.
xgb_base_params = dict(random_state=42, n_jobs=-1)
xgb_base = XGBRegressor(**xgb_base_params)

train_and_track(run_name="XGB_Baseline", model_obj=xgb_base)


 STARTING: XGB_Baseline
 Training Time: 5.56 seconds (0.09 minutes)
📊 RMSE: 82,582 | R2: 0.9473
🏃 View run XGB_Baseline at: http://127.0.0.1:8000/#/experiments/289806203540587791/runs/66ce91d2a1bf48c8866e3c2292195ef1
🧪 View experiment at: http://127.0.0.1:8000/#/experiments/289806203540587791
FINISHED: XGB_Baseline


In [None]:
# ==========================================
# MODEL 3: XGBoost Tuned
# ==========================================
xgb_tuned_params = dict(
    n_estimators=500,  # lowered from 1000 to be safer
    learning_rate=0.05,
    max_depth=6,
    random_state=42,
    n_jobs=-1,
)
xgb_tuned = XGBRegressor(**xgb_tuned_params)

train_and_track(run_name="XGB_Tuned_500", model_obj=xgb_tuned)


 STARTING: XGB_Tuned_500
 Training Time: 13.73 seconds (0.23 minutes)
📊 RMSE: 75,922 | R2: 0.9555
🏃 View run XGB_Tuned_500 at: http://127.0.0.1:8000/#/experiments/289806203540587791/runs/6556f4d287294fc2bcb61b4c54fa8327
🧪 View experiment at: http://127.0.0.1:8000/#/experiments/289806203540587791
FINISHED: XGB_Tuned_500
