In [None]:
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from mlflow.models import infer_signature

In [None]:
# ------------------------------------------
# 1. MLFLOW SETUP
# ------------------------------------------
# Tracking server address and experiment name are named here so they are
# easy to spot and change in one place.
TRACKING_URI = "http://127.0.0.1:8000"
EXPERIMENT_NAME = "Experiment Tracking - House Price Prediction"

# Point the MLflow client at the tracking server, then select the experiment
# that the runs started later in this notebook are recorded under.
mlflow.set_tracking_uri(uri=TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

In [None]:
## Loading the Data

# Locations of the three pre-split CSV files, keyed by split name.
processed_csvs = {
    "train": r"../data/processed/train.csv",
    "eval": r"../data/processed/eval.csv",
    "holdout": r"../data/processed/holdout.csv",
}

# Read each split into its own DataFrame (train first, holdout last).
train_df = pd.read_csv(processed_csvs["train"])
eval_df = pd.read_csv(processed_csvs["eval"])
holdout_df = pd.read_csv(processed_csvs["holdout"])

In [None]:
# Define Feature Selection
target = "price"
# We drop the target AND the raw identifiers (date, text columns).
# We keep 'year' out of training to avoid overfitting to specific timelines.
drop_cols = [target, "date", "year", "city", "state_id", "zipcode"]

# Prepare X and y (numeric columns only).
# The feature list is derived ONCE, from the training split. Running
# select_dtypes independently on every split can yield different column sets
# (e.g. a column parsed as object in one file), and pd.concat would then
# silently outer-join them and pad the gaps with NaN.
X_train = train_df.drop(columns=drop_cols, errors='ignore').select_dtypes(include=[np.number])
feature_cols = X_train.columns.tolist()
y_train = train_df[target]

# Selecting by the training feature list guarantees identical columns and
# column order across splits; a column missing from a split raises KeyError
# here instead of surfacing later as a silently imputed NaN column.
X_eval = eval_df[feature_cols]
y_eval = eval_df[target]

X_test = holdout_df[feature_cols]
y_test = holdout_df[target]

# CRITICAL: For final production model, we learn from ALL history (Train + Eval)
X_full_train = pd.concat([X_train, X_eval], ignore_index=True)
y_full_train = pd.concat([y_train, y_eval], ignore_index=True)

print(f"Training on {len(X_full_train)} rows (Train+Eval).")
print(f"Testing on {len(X_test)} rows (Holdout).")

# ------------------------------------------
# 2. THE CHAMPION PARAMETERS (Restored)
# ------------------------------------------
# Keyword arguments that are unpacked into XGBRegressor further down.
best_params = dict(
    n_estimators=766,
    max_depth=8,
    learning_rate=0.056659883160228804,
    subsample=0.6028207008279798,
    colsample_bytree=0.8388551767066131,
    min_child_weight=1,
    reg_alpha=2.716993750938802e-05,
    reg_lambda=8.658796440498847,
    random_state=42,
    n_jobs=-1,
    tree_method="hist",
)

# ==========================================
# 3. TRAIN & LOG CHAMPION
# ==========================================
# Fits the final model on Train+Eval, scores it on the holdout split, and
# records parameters, metrics and the fitted model in a single MLflow run.
with mlflow.start_run(run_name="Champion_XGBoost_Final"):

    print("🚀 Training Final Production Model...")

    # Pipeline: median-impute missing feature values, then gradient-boosted trees.
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('regressor', XGBRegressor(**best_params))
    ])

    # Log-Transform Wrapper: the regressor is fit on log1p(price) and
    # predictions are mapped back with expm1, so metrics below are in
    # the original price units.
    final_model = TransformedTargetRegressor(
        regressor=pipeline,
        func=np.log1p,
        inverse_func=np.expm1
    )

    # Train on Full History
    final_model.fit(X_full_train, y_full_train)

    # Evaluate on Holdout (Final Test)
    preds = final_model.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, preds))
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)

    print("\n📊 FINAL HOLDOUT SCORES:")
    print(f"   RMSE: ${rmse:,.0f}")
    print(f"   MAE:  ${mae:,.0f}")
    print(f"   R2:   {r2:.4f}")

    # Log to MLflow. Every metric that is printed above is also logged, so
    # the tracked run and the notebook output cannot drift apart (MAE was
    # previously computed and printed but never recorded).
    mlflow.log_params(best_params)
    mlflow.log_metrics({
        "holdout_rmse": rmse,
        "holdout_mae": mae,
        "holdout_r2": r2,
    })

    # Save Model together with an input/output signature inferred from the
    # holdout features and the predictions made on them.
    signature = infer_signature(X_test, preds)
    mlflow.sklearn.log_model(final_model, "champion_model", signature=signature)

    print("\n✅ Champion Model Saved to MLflow.")