In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
from typing import Any, Dict, List, Optional, Tuple

In [2]:
# Single seed reused by the data generation and model-construction cells
random_seed = 42

In [3]:
# Synthetic regression problem: 250000 samples, 20 features (15 of them informative),
# a small amount of Gaussian noise and a constant offset of 5.0 on the target.
dataset, target = make_regression(
    n_samples=250000,
    n_features=20,
    n_informative=15,
    noise=0.1,
    bias=5.0,
    random_state=random_seed,
)

# Wrap the feature matrix in a DataFrame and cast the column labels to strings.
X = pd.DataFrame(dataset)
X.columns = X.columns.astype(str)
y = target

In [4]:
def create_train_validation_test_sets(
    X: pd.DataFrame,
    y: np.ndarray,
    val_size: float = 0.2,
    test_size: float = 0.2,
    random_state: Optional[int] = 42,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, np.ndarray, np.ndarray, np.ndarray]:
    """
    Split the dataset into training, validation and testing sets.

    The split is done in two steps: first the (validation + test) hold-out is
    separated from the training data, then the hold-out is divided into the
    validation and test parts. With the default arguments this yields a
    60% / 20% / 20% split.

    Args:
        X (pd.DataFrame): Feature matrix.
        y (np.ndarray): Targets, one per row of X.
        val_size (float): Proportion of the full dataset to put in the validation split.
        test_size (float): Proportion of the full dataset to put in the test split.
        random_state (Optional[int]): Seed forwarded to both train_test_split calls.
    Returns:
        Tuple, in this order:
        X_train, X_val, X_test: Feature matrices of the three splits.
        y_train, y_val, y_test: Matching targets of the three splits.
    Raises:
        ValueError: If val_size or test_size is not strictly between 0 and 1,
            if together they leave no data for training, or if X and y have
            a different number of rows.
    """
    if not 0.0 < val_size < 1.0 or not 0.0 < test_size < 1.0:
        raise ValueError(
            f"val_size and test_size must be in (0, 1); got val_size={val_size}, test_size={test_size}"
        )
    holdout_size = val_size + test_size
    if holdout_size >= 1.0:
        raise ValueError(
            f"val_size + test_size must be < 1 to leave data for training; got {holdout_size}"
        )
    if len(X) != len(y):
        raise ValueError(f"X and y must have the same number of rows; got {len(X)} and {len(y)}")

    # Step 1: carve the combined validation + test hold-out off the training data.
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=holdout_size, random_state=random_state
    )
    # Step 2: divide the hold-out; the test share is expressed relative to the hold-out size.
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=test_size / holdout_size, random_state=random_state
    )
    return X_train, X_val, X_test, y_train, y_val, y_test

In [5]:
# Materialise the train / validation / test splits used by both experiments below
X_train, X_val, X_test, y_train, y_val, y_test = create_train_validation_test_sets(X, y)

In [9]:
def train_model(
    X_train: pd.DataFrame,
    y_train: np.ndarray,
    categorical_features: Optional[List[str]] = None,
    n_estimators: int = 100,
) -> xgb.XGBRegressor:
    """
    Train an XGBoost regressor from scratch.

    Args:
        X_train (pd.DataFrame): Feature matrix to fit on.
        y_train (np.ndarray): Regression targets, one per row of X_train.
        categorical_features (Optional[List[str]]): Accepted for interface
            compatibility but currently ignored; every column is handed to
            XGBoost unchanged.
        n_estimators (int): Number of boosting rounds (trees) to fit.
    Returns:
        xgb.XGBRegressor: The fitted model (squared-error objective, RMSE eval metric).
    """
    xgb_model = xgb.XGBRegressor(
        n_estimators=n_estimators,
        objective="reg:squarederror",
        eval_metric="rmse",
        # `random_state` is the documented scikit-learn-API seed parameter; `seed` is only
        # accepted through **kwargs, which XGBoost does not guarantee to work with sklearn tooling.
        random_state=random_seed,
    )
    xgb_model.fit(X_train, y_train)
    return xgb_model

In [19]:
def refit_model(
    X_train: pd.DataFrame,
    y_train: np.ndarray,
    init_model: xgb.XGBRegressor,
    n_estimators: int = 100,
) -> xgb.XGBRegressor:
    """
    Continue boosting from an already fitted model on a new batch of data.

    A fresh XGBRegressor is created and fitted with ``xgb_model=init_model``,
    which XGBoost documents as training continuation: boosting starts from the
    trees of ``init_model`` and ``n_estimators`` further rounds are fitted on
    the data passed here.

    NOTE: the returned model therefore holds the rounds of ``init_model`` plus
    the newly added ones, i.e. more trees than a single ``train_model`` call
    with the same ``n_estimators``. Keep this in mind when comparing metrics.

    Args:
        X_train (pd.DataFrame): Feature matrix to continue training on.
        y_train (np.ndarray): Regression targets, one per row of X_train.
        init_model (xgb.XGBRegressor): Fitted model to start from.
        n_estimators (int): Number of additional boosting rounds to fit.
    Returns:
        xgb.XGBRegressor: A new fitted regressor continuing from init_model.
    """
    xgb_refit = xgb.XGBRegressor(
        n_estimators=n_estimators,
        objective="reg:squarederror",
        eval_metric="rmse",
        # `random_state` is the documented scikit-learn-API seed parameter (see XGBRegressor docs).
        random_state=random_seed,
    )
    xgb_refit.fit(X_train, y_train, xgb_model=init_model)
    return xgb_refit

In [11]:
def validation(predicted_y: np.ndarray, y_true: np.ndarray) -> Dict[str, Any]:
    """
    Score regression predictions against the ground truth.

    Args:
        predicted_y (np.ndarray): Predicted target values.
        y_true (np.ndarray): Ground-truth target values.
    Returns:
        dict: Mean squared error under "mse" and the R^2 score under "r2".
    """
    mse_value = mean_squared_error(y_true, predicted_y)
    r2_value = r2_score(y_true, predicted_y)
    return {"mse": mse_value, "r2": r2_value}

# Experiment 2:

## Part 1:
1. Train a regression model on the combined train + validation sets.
2. Evaluate on the test set.

In [12]:
# Fit one model on the union of the training and validation splits.
model = train_model(
    X_train=pd.concat([X_train, X_val]),
    y_train=np.concatenate([y_train, y_val]),
)

In [13]:
# Test-set predictions of the model fitted on train + validation.
predicted_target_train_val = model.predict(X_test)

In [14]:
# Test-set metrics for the model fitted on train + validation
validation(predicted_target_train_val, y_test)

{'mse': 2239.2417893487086, 'r2': 0.9680739169420416}

## Part 2:
1. Train the model on the train set only.
2. Continue training (refit) that model on the validation set.
3. Evaluate on the test set, both before and after the refit.

In [15]:
# Baseline for Part 2: fit on the training split only
model_train = train_model(X_train=X_train, y_train=y_train)

In [16]:
# Test-set predictions of the train-only model
predicted_target_train = model_train.predict(X_test)

In [17]:
# Test-set metrics for the model fitted on the training split only
validation(predicted_target_train, y_test)

{'mse': 2348.197238667183, 'r2': 0.9665204800862697}

In [20]:
# Continue training the train-only model, this time on the validation split.
model_refit_val = refit_model(
    X_train=X_val,
    y_train=y_val,
    init_model=model_train,
)

In [21]:
# Test-set predictions of the model refitted on the validation split.
predicted_target_refit_val = model_refit_val.predict(X_test)

In [22]:
# Test-set metrics after continuing training on the validation split
validation(predicted_target_refit_val, y_test)

{'mse': 2170.069863097486, 'r2': 0.9690601385610199}