In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
from typing import Any, Dict, List, Optional, Tuple

In [2]:
# Seed passed as random_state to the synthetic-data generator and to the LightGBM models.
random_seed = 42

In [3]:
# Synthetic regression data: 250,000 rows, 20 features (15 informative), some noise,
# and a constant bias of 5.0 added to the target.
dataset, target = make_regression(
    n_samples=250_000,
    n_features=20,
    n_informative=15,
    noise=0.1,
    bias=5.0,
    random_state=random_seed,
)
# Column labels are the positional indices rendered as strings ("0", "1", ...).
X = pd.DataFrame(dataset, columns=[str(i) for i in range(dataset.shape[1])])
y = target

In [4]:
def create_train_validation_test_sets(
    X: pd.DataFrame,
    y: np.ndarray,
    test_size: float = 0.2,
    val_size: float = 0.2,
    random_state: int = 42,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, np.ndarray, np.ndarray, np.ndarray]:
    """
    Split the dataset into training, validation and testing sets.

    The split is done in two steps: the validation and test rows are first held out
    together, then that hold-out is divided between validation and test.

    Args:
        X (pd.DataFrame): Feature matrix.
        y (np.ndarray): Target values, aligned row-for-row with ``X``.
        test_size (float): Proportion of the dataset to include in the test split.
        val_size (float): Proportion of the dataset to include in the validation split.
        random_state (int): Seed passed to both ``train_test_split`` calls.
    Returns:
        Tuple: ``(X_train, X_val, X_test, y_train, y_val, y_test)``. With the default
        sizes the training set holds 60% of the rows and the validation and test
        sets hold 20% each.
    Raises:
        ValueError: If either size is not positive, or if the two sizes together
            leave no rows for training.
    """
    if test_size <= 0 or val_size <= 0:
        raise ValueError("test_size and val_size must both be greater than 0")
    holdout_size = val_size + test_size
    if holdout_size >= 1:
        raise ValueError("val_size + test_size must be less than 1")

    # Step 1: separate the training rows from the combined validation + test hold-out.
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=holdout_size, random_state=random_state
    )
    # Step 2: share of the hold-out that becomes the test set (0.5 with the defaults).
    test_share = test_size / holdout_size
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=test_share, random_state=random_state
    )
    return X_train, X_val, X_test, y_train, y_val, y_test

In [5]:
# Partition the synthetic data into train / validation / test sets (60% / 20% / 20%).
X_train, X_val, X_test, y_train, y_val, y_test = create_train_validation_test_sets(X, y)

In [6]:
def train_model(
    X_train: pd.DataFrame, y_train: np.ndarray, categorical_features: Optional[List[str]] = None
) -> lgb.LGBMRegressor:
    """
    Train a LightGBM regressor from scratch.

    The model is created with ``objective="regression"``, the ``"l2"`` metric and
    the notebook-wide ``random_seed``.

    Args:
        X_train (pd.DataFrame): Training features. The column labels are passed to
            LightGBM as the feature names.
        y_train (np.ndarray): Training targets, one per row of ``X_train``.
        categorical_features (Optional[List[str]]): Columns to treat as categorical.
            The value, including the default ``None``, is forwarded unchanged as
            ``categorical_feature``.
            NOTE(review): LightGBM's own default for that argument is ``"auto"``,
            not ``None`` - confirm that forwarding ``None`` is intended.
    Returns:
        lgb.LGBMRegressor: The fitted model.
    """
    lgbm = lgb.LGBMRegressor(objective="regression", metric="l2", random_state=random_seed)
    # NOTE(review): no eval_set is supplied, so eval_metric presumably has nothing to
    # be evaluated on - confirm whether it is still needed.
    lgbm.fit(
        X_train,
        y_train,
        feature_name=X_train.columns.to_list(),
        categorical_feature=categorical_features,
        eval_metric="l2"
    )
    return lgbm

In [7]:
def refit_model(
    X_train: pd.DataFrame,
    y_train: np.ndarray,
    init_model: lgb.LGBMRegressor,
    categorical_features: Optional[List[str]] = None,
) -> lgb.LGBMRegressor:
    """
    Continue training a LightGBM regressor from an already fitted model.

    A new regressor is created with the same settings as the initial training
    (``objective="regression"``, ``"l2"`` metric, notebook-wide ``random_seed``)
    and fitted on the given data with ``init_model`` as its starting point.

    Args:
        X_train (pd.DataFrame): Features to continue training on. The column labels
            are passed to LightGBM as the feature names.
        y_train (np.ndarray): Targets, one per row of ``X_train``.
        init_model (lgb.LGBMRegressor): Previously fitted model, passed to ``fit``
            as ``init_model``.
        categorical_features (Optional[List[str]]): Columns to treat as categorical.
            The value, including the default ``None``, is forwarded unchanged as
            ``categorical_feature``.
    Returns:
        lgb.LGBMRegressor: The newly fitted regressor. A separate estimator object
        is returned; ``init_model`` is not the object that ``fit`` is called on.
    """
    lgbm_refit = lgb.LGBMRegressor(objective="regression", metric="l2", random_state=random_seed)
    lgbm_refit.fit(
        X_train,
        y_train,
        feature_name=X_train.columns.to_list(),
        categorical_feature=categorical_features,
        eval_metric="l2",
        init_model=init_model
    )
    return lgbm_refit

In [8]:
def validation(predicted_y: np.ndarray, y_true: np.ndarray) -> Dict[str, Any]:
    """
    Score regression predictions against the ground truth.

    Args:
        predicted_y (np.ndarray): Predicted target values.
        y_true (np.ndarray): Ground-truth target values.
    Returns:
        dict: The mean squared error under ``"mse"`` and the R² score under ``"r2"``.
    """
    # Both scorers are called with the ground truth first, then the predictions.
    mse_value = mean_squared_error(y_true, predicted_y)
    r2_value = r2_score(y_true, predicted_y)
    return {"mse": mse_value, "r2": r2_value}

# Experiment 2:

## Part 1:
1. Train a regression model on the combined train + validation sets.
2. Evaluate it on the test set.

In [9]:
# Part 1 model: fit on the training and validation splits stacked together.
lgbm = train_model(X_train=pd.concat([X_train, X_val]), y_train=np.concatenate([y_train, y_val]))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.026067 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5100
[LightGBM] [Info] Number of data points in the train set: 200000, number of used features: 20
[LightGBM] [Info] Start training from score 4.707040


In [10]:
# Test-set predictions from the model fitted on train + validation.
predicted_target_train_val= lgbm.predict(X_test)

In [11]:
# Test-set MSE and R² for the train + validation model.
validation(predicted_target_train_val, y_test)

{'mse': 2047.151040756386, 'r2': 0.9708126587891235}

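## Part 2:
1. Train the model on the training set only.
2. Continue training that model on the validation set (the first model is passed as `init_model`).
3. Evaluate on the test set.

Note: continued training adds new boosting rounds on top of the trees already in the initial model, so the refit model contains more trees than either single-stage model. Keep this in mind when comparing the test metrics with Part 1.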

In [12]:
# Part 2, stage 1: fit on the training split only.
lgbm_train = train_model(X_train=X_train, y_train=y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017877 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5100
[LightGBM] [Info] Number of data points in the train set: 150000, number of used features: 20
[LightGBM] [Info] Start training from score 5.074552


In [13]:
# Test-set predictions from the train-only model.
predicted_target_train = lgbm_train.predict(X_test)

In [14]:
# Test-set MSE and R² for the train-only model, before any continued training.
validation(predicted_target_train, y_test)

{'mse': 2078.161304588738, 'r2': 0.9703705286612069}

In [15]:
# Part 2, stage 2: continue training from lgbm_train using only the validation split.
lgbm_refit_val = refit_model(X_train=X_val, y_train=y_val, init_model=lgbm_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009967 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5100
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 20


In [16]:
# Test-set predictions from the model after continued training on the validation split.
predicted_target_refit_val= lgbm_refit_val.predict(X_test)

In [17]:
# Test-set MSE and R² after continued training on the validation split.
validation(predicted_target_refit_val, y_test)

{'mse': 951.9842164776783, 'r2': 0.9864270453911224}