# Phase 2: Model Training and Experimentation

This notebook will focus on training and optimizing our model. Our objectives include
1. Experiment tracking with MLflow
2. Cross-validation
3. Hyperparameter optimization
4. Model evaluation and selection
5. Model versioning and registration

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import mlflow
import joblib
import optuna
import xgboost as xgb
import lightgbm as lgb
from pathlib import Path

# Import the project's custom transformers.
# The parent of the current working directory is appended to sys.path (only if
# it is not already there) so that the `models` package can be imported from it.
# This assumes the kernel was started in a direct subdirectory of the project
# root — TODO confirm.
import os, sys
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)
# NOTE(review): FeatureEngineer and LogTransformer are not referenced by name in
# the cells visible here; presumably the import is needed so joblib can unpickle
# the saved preprocessors — confirm before removing it.
from models.custom_transformers import FeatureEngineer, LogTransformer

# Set the active MLflow experiment for the runs started below.
EXPERIMENT_NAME = "ames-housing-price-prediction"
mlflow.set_experiment(EXPERIMENT_NAME)

# Load the saved feature preprocessor and target transformer.
# If loading fails, try running phase1_eda.ipynb first.
models_dir = "../models"
feature_preprocessor, target_transformer = (
    joblib.load(f'{models_dir}/{artifact_name}.joblib')
    for artifact_name in ('feature_preprocessor', 'target_transformer')
)

# Read the train and test CSV files and report their shapes.
print("Loading data...")
data_dir = '../data/house-prices-advanced-regression-techniques'
train_data, test_data = (
    pd.read_csv(f'{data_dir}/{split_name}.csv')
    for split_name in ('train', 'test')
)
for split_label, split_frame in (("Train", train_data), ("Test", test_data)):
    print(f"{split_label} shape:", split_frame.shape)

  from .autonotebook import tqdm as notebook_tqdm


Loading data...
Train shape: (1460, 81)
Test shape: (1459, 80)


In [2]:
# Separate the features from the target column.
X = train_data.drop('SalePrice', axis=1)
y = train_data['SalePrice']

# Apply the preprocessing pipelines loaded earlier. The status message is
# printed before the work it announces.
# NOTE(review): only transform() is called here, so both transformers were
# fitted elsewhere. If they were fitted on the full training set, the
# validation rows below influenced the preprocessing — confirm in phase 1.
print("Applying preprocessing pipeline...")
X_processed = feature_preprocessor.transform(X)
y_processed = target_transformer.transform(y)

# Split data into train and validation sets.
# The `_val` suffix avoids confusion with the separate test dataset.
X_train, X_val, y_train, y_val = train_test_split(
    X_processed, y_processed,
    test_size=0.2,
    random_state=42
)

print("X_train shape:", X_train.shape)
print("X_val shape:", X_val.shape)

Applying preprocessing pipeline...
X_train shape: (1168, 242)
X_val shape: (292, 242)


## Baseline Model Development

Let's start with a simple baseline model using XGBoost with default parameters. This will give us a reference point for further improvements.

In [3]:
def evaluate_model(model, X, y, prefix=''):
    """Score a fitted model's predictions against known targets.

    Args:
        model: Estimator exposing ``predict(X)``.
        X: Feature matrix passed to ``model.predict``.
        y: Target values, on the same scale as the model's predictions.
        prefix: String prepended to every metric name (e.g. ``'train_'``).

    Returns:
        dict mapping ``'<prefix>rmse'``, ``'<prefix>mae'`` and ``'<prefix>r2'``
        to the corresponding score.
    """
    predictions = model.predict(X)

    scores = {
        'rmse': np.sqrt(mean_squared_error(y, predictions)),
        'mae': mean_absolute_error(y, predictions),
        'r2': r2_score(y, predictions),
    }

    return {f'{prefix}{name}': score for name, score in scores.items()}

In [4]:
# Train the baseline XGBoost model and record the run in MLflow.
with mlflow.start_run(run_name="xgboost-baseline"):
    # Default hyperparameters; only the seed and parallelism are set.
    model = xgb.XGBRegressor(
        random_state=42,
        n_jobs=-1
    )

    # eval_set makes XGBoost report validation RMSE while training
    # (progress is printed every 100 rounds).
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        verbose=100
    )

    train_metrics = evaluate_model(model, X_train, y_train, prefix='train_')
    val_metrics = evaluate_model(model, X_val, y_val, prefix='val_')

    mlflow.log_params(model.get_params())
    mlflow.log_metrics({**train_metrics, **val_metrics})
    # Pass the model name by keyword, matching the optimized-model cell; a
    # positional second argument is bound to the deprecated `artifact_path`.
    mlflow.xgboost.log_model(model, name="model")

    print("\nTraining Metrics:")
    for metric, value in train_metrics.items():
        print(f"{metric}: {value:.4f}")

    print("\nValidation Metrics:")
    for metric, value in val_metrics.items():
        print(f"{metric}: {value:.4f}")

# Save the baseline model so it can be reloaded after `model` is reassigned.
joblib.dump(model, f'{models_dir}/baseline_xgboost.joblib')
print(f"\nBaseline model saved to {models_dir}/baseline_xgboost.joblib")

[0]	validation_0-rmse:0.33810
[99]	validation_0-rmse:0.15271


  self.get_booster().save_model(fname)



Training Metrics:
train_rmse: 0.0076
train_mae: 0.0055
train_r2: 0.9996

Validation Metrics:
val_rmse: 0.1527
val_mae: 0.1064
val_r2: 0.8750

Baseline model saved to ../models/baseline_xgboost.joblib


## Hyperparameter Optimization with Optuna

Now that we have a baseline model, let's use Optuna to find better hyperparameters for our XGBoost model. We'll define an objective function that Optuna will optimize using cross-validation scores.

In [5]:
# objective function for Optuna
def objective(trial):
    """Optuna objective: cross-validated RMSE of an XGBoost candidate.

    Samples one hyperparameter configuration from ``trial``, builds an
    ``XGBRegressor`` with it, and scores it with 5-fold cross-validation on the
    notebook-level ``X_train`` / ``y_train``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean RMSE across the folds (positive; lower is better).
    """
    search_space = dict(
        max_depth=trial.suggest_int('max_depth', 3, 10),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        n_estimators=trial.suggest_int('n_estimators', 100, 1000),
        min_child_weight=trial.suggest_int('min_child_weight', 1, 7),
        subsample=trial.suggest_float('subsample', 0.6, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.6, 1.0),
        reg_alpha=trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        reg_lambda=trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
        random_state=42,
    )
    candidate = xgb.XGBRegressor(**search_space)

    # scikit-learn reports this scorer as negative RMSE (higher is better).
    fold_scores = cross_val_score(
        estimator=candidate,
        X=X_train,
        y=y_train,
        cv=5,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
    )

    # Flip the sign to get positive RMSE, which the study minimizes.
    fold_rmse = -fold_scores
    return fold_rmse.mean()

# Lower Optuna's log level before the study is created, so the
# "A new study created" INFO message is suppressed along with per-trial logs.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Create and run the Optuna study. The sampler is seeded so that repeated runs
# propose the same sequence of trials.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42)
)
study.optimize(objective, n_trials=50, show_progress_bar=True)

# Report the best trial: its objective value (mean CV RMSE) and parameters.
print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2025-10-30 16:23:27,701] A new study created in memory with name: no-name-cafbf307-a3bc-403a-a313-acd705a8e278
Best trial: 20. Best value: 0.124841: 100%|██████████| 50/50 [00:28<00:00,  1.78it/s]

Best trial:
  Value:  0.12484070609617366
  Params: 
    max_depth: 4
    learning_rate: 0.047588753062449786
    n_estimators: 355
    min_child_weight: 2
    subsample: 0.84936860390881
    colsample_bytree: 0.6186138763393498
    reg_alpha: 1.8046864107985947e-05
    reg_lambda: 1.4720283701765382e-07





In [6]:
# Train the model with the best hyperparameters found by Optuna and record the
# run in MLflow.
with mlflow.start_run(run_name="xgboost-optimized"):
    # Best parameters from the study, plus the fixed seed used during the search.
    best_params = {**study.best_params, 'random_state': 42}
    model = xgb.XGBRegressor(**best_params)

    # eval_set makes XGBoost report validation RMSE while training
    # (progress is printed every 100 rounds).
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        verbose=100
    )

    train_metrics = evaluate_model(model, X_train, y_train, prefix='train_')
    val_metrics = evaluate_model(model, X_val, y_val, prefix='val_')

    mlflow.log_params(model.get_params())
    mlflow.log_metrics({**train_metrics, **val_metrics})
    mlflow.xgboost.log_model(model, name="model")

    print("\nTraining Metrics:")
    for metric, value in train_metrics.items():
        print(f"{metric}: {value:.4f}")

    print("\nValidation Metrics:")
    for metric, value in val_metrics.items():
        print(f"{metric}: {value:.4f}")

# Save the optimized model next to the baseline, using the shared `models_dir`
# instead of a second hard-coded copy of the directory.
joblib.dump(model, f'{models_dir}/optimized_xgboost.joblib')
print(f"\nOptimized model saved to {models_dir}/optimized_xgboost.joblib")

[0]	validation_0-rmse:0.41864
[100]	validation_0-rmse:0.14093
[200]	validation_0-rmse:0.13675
[300]	validation_0-rmse:0.13689
[354]	validation_0-rmse:0.13676


  self.get_booster().save_model(fname)



Training Metrics:
train_rmse: 0.0459
train_mae: 0.0343
train_r2: 0.9862

Validation Metrics:
val_rmse: 0.1368
val_mae: 0.0911
val_r2: 0.8998

Optimized model saved to ../models/optimized_xgboost.joblib


## Model Comparison

Let's compare the performance of our baseline and optimized models to see the improvement from hyperparameter optimization.

In [7]:
# Load both models from their saved artifacts. The global `model` is assigned
# by both training cells, so relying on it here would make the comparison
# depend on which training cell was executed last.
baseline_model = joblib.load(f'{models_dir}/baseline_xgboost.joblib')
optimized_model = joblib.load(f'{models_dir}/optimized_xgboost.joblib')

# Compare models on the validation set.
baseline_metrics = evaluate_model(baseline_model, X_val, y_val)
optimized_metrics = evaluate_model(optimized_model, X_val, y_val)

print("Baseline Model Metrics:")
for metric, value in baseline_metrics.items():
    print(f"{metric}: {value:.4f}")

print("\nOptimized Model Metrics:")
for metric, value in optimized_metrics.items():
    print(f"{metric}: {value:.4f}")

# Relative reduction in validation RMSE, as a percentage of the baseline RMSE.
improvement = (baseline_metrics['rmse'] - optimized_metrics['rmse']) / baseline_metrics['rmse'] * 100
print(f"\nRMSE Improvement: {improvement:.2f}%")

Baseline Model Metrics:
rmse: 0.1527
mae: 0.1064
r2: 0.8750

Optimized Model Metrics:
rmse: 0.1368
mae: 0.0911
r2: 0.8998

RMSE Improvement: 10.45%


## Generate Predictions for Test Set

Finally, we can use our optimized model to generate predictions for the test set. We'll transform the predictions back to the original scale and create a submission file which can be uploaded to Kaggle.

In [8]:
# Make predictions on the test data, using the same feature preprocessor that
# was applied to the training data.
# NOTE: `model` is whichever estimator was assigned last (the optimized model
# when the cells are run in order).
X_test_processed = feature_preprocessor.transform(test_data)
y_test_transformed = model.predict(X_test_processed)

# Transform predictions back to original scale
y_test = target_transformer.inverse_transform(y_test_transformed)

submission = pd.DataFrame({
    'Id': test_data['Id'],
    'SalePrice': y_test
})

# Save the submission file. The output directory is derived from
# `submission_path`, so the path is defined in one place only, and missing
# parent directories are created as well.
submission_path = '../submissions/xgboost_submission.csv'
Path(submission_path).parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)
print(f"Submission file saved to {submission_path}")

print("\nFirst few predictions:")
print(submission.head())

Submission file saved to ../submissions/xgboost_submission.csv

First few predictions:
     Id      SalePrice
0  1461  122708.265625
1  1462  162753.484375
2  1463  183156.562500
3  1464  195766.843750
4  1465  186826.921875


