After evaluating dataset-balanced and dataset-aware variants, we further investigate whether balancing class frequencies within each dataset improves performance. This experiment maintains a single unified model while adjusting the training objective to reduce bias toward majority classes.

In [1]:
# Imports
import pandas as pd
import numpy as np

import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Load Training Data
# The three training tables are read from CSV files in the working directory,
# in the order HELOC, COVTYPE, HIGGS.
heloc, covtype, higgs = (
    pd.read_csv(csv_path)
    for csv_path in ("heloc_train.csv", "covtype_train.csv", "higgs_train.csv")
)

In [4]:
# Dataset Identifiers
# Tag every row with an integer code for its source table:
# 0 = HELOC, 1 = COVTYPE, 2 = HIGGS. The column is added in place.
for source_frame, source_code in ((heloc, 0), (covtype, 1), (higgs, 2)):
    source_frame["dataset_id"] = source_code

In [5]:
# Global Target Encoding
#
# Every dataset's labels are shifted into its own slice of one shared label space:
#   HELOC   -> 0 ("Bad"), 1 ("Good")
#   HIGGS   -> 2 ("b"),   3 ("s")
#   COVTYPE -> Cover_Type + 3
# Labels missing from a mapping dict come out of Series.map as NaN.
# NOTE(review): presumably Cover_Type takes the values 1..7, which would place
# COVTYPE at 4..10 with no overlap with HIGGS — TODO confirm against the data.

# HELOC: binary risk label, no offset.
heloc["target"] = heloc["RiskPerformance"].map({"Bad": 0, "Good": 1})

# HIGGS: background/signal label, shifted past the two HELOC classes.
higgs["target"] = higgs["Label"].map({"b": 0, "s": 1}).add(2)

# COVTYPE: numeric cover type, shifted by a constant 3.
covtype["target"] = covtype["Cover_Type"].add(3)

In [6]:
# Remove Original Label Columns
# "target" now carries the label for every table, so the raw label column of
# each dataset is dropped (each call returns a new frame).
heloc = heloc.drop(columns="RiskPerformance")
higgs = higgs.drop(columns="Label")
covtype = covtype.drop(columns="Cover_Type")

In [7]:
# Align Feature Spaces
# Per-dataset feature lists: every column except the two bookkeeping columns,
# kept in the column order of the source frame.
heloc_features = [
    name for name in heloc.columns
    if name != "target" and name != "dataset_id"
]
covtype_features = [
    name for name in covtype.columns
    if name != "target" and name != "dataset_id"
]
higgs_features = [
    name for name in higgs.columns
    if name != "target" and name != "dataset_id"
]

# Union of all feature names across the three datasets, de-duplicated and
# sorted so the unified column order is deterministic.
ALL_FEATURES = sorted({*heloc_features, *covtype_features, *higgs_features})

In [8]:
def align_features(df):
    """Return a copy of `df` laid out on the unified training schema.

    Any name in the module-level ALL_FEATURES list that `df` lacks is added as
    a column filled with 0. The result contains exactly the ALL_FEATURES
    columns (in that order) followed by "dataset_id" and "target"; any other
    column of `df` is left out. The input frame itself is not modified.

    Raises KeyError (from the final column selection) if `df` has no
    "dataset_id" or "target" column.
    """
    aligned = df.copy()
    absent = [name for name in ALL_FEATURES if name not in aligned.columns]
    for name in absent:
        # Features that belong to another dataset are represented as a constant 0.
        aligned[name] = 0
    return aligned[ALL_FEATURES + ["dataset_id", "target"]]

# Put all three training tables on the shared column layout (same order as before:
# HELOC, COVTYPE, HIGGS).
heloc, covtype, higgs = (
    align_features(table) for table in (heloc, covtype, higgs)
)

In [9]:
# Combine datasets
# Stack the three tables into one frame with a fresh 0..n-1 index.
full_data = pd.concat((heloc, covtype, higgs), ignore_index=True)

# Only "target" is removed from the model inputs, so "dataset_id" remains a
# feature of X in addition to being kept separately below.
X = full_data.drop(columns="target")
y, dataset_id = full_data["target"], full_data["dataset_id"]

In [10]:
# Dataset + Class-Balanced Weights
#
# Each row receives the weight
#     (1 / rows in its dataset) * (1 / rows sharing its global class label)
# and the weights are then rescaled to have mean 1.
#
# NOTE(review): because the two factors are multiplied, a dataset's total
# weight (before rescaling) works out to (number of its classes) / (its row
# count), so smaller datasets receive more total weight than larger ones rather
# than an equal share. If equal total weight per dataset is the goal,
# 1 / (classes_in_dataset * class_count) would give it — confirm the intent
# before changing, since that alters the trained model.

# Dataset counts
dataset_counts = full_data["dataset_id"].value_counts().to_dict()

# Class counts (global label space)
class_counts = full_data["target"].value_counts().to_dict()

# Look the counts up for all rows at once instead of iterating row by row;
# the arithmetic is the same as the per-row product, just vectorised.
rows_in_dataset = full_data["dataset_id"].map(dataset_counts)
rows_in_class = full_data["target"].map(class_counts)

weights = ((1.0 / rows_in_dataset) * (1.0 / rows_in_class)).to_numpy(dtype=float)

# A missing dataset_id/target (e.g. a label that was not covered by the target
# mapping) has no count and would silently produce NaN weights; fail loudly.
if not np.isfinite(weights).all():
    raise ValueError(
        "Cannot compute sample weights: some rows have a missing dataset_id or target."
    )

weights = weights / weights.mean()

In [11]:
# Train/validation split
# 80/20 hold-out with a fixed seed. Stratification is on the source dataset
# (dataset_id), not on the class label.
X_train, X_val, y_train, y_val, d_train, d_val = train_test_split(
    X, y, dataset_id, test_size=0.2, stratify=dataset_id, random_state=42
)

# `weights` is a positional numpy array, so slicing it with the frames' index
# labels relies on X carrying a default 0..n-1 index.
w_train, w_val = (weights[subset.index] for subset in (X_train, X_val))

In [12]:
# Train Unified LightGBM Model
# Both splits carry their per-row sample weights.
train_data = lgb.Dataset(data=X_train, label=y_train, weight=w_train)
val_data = lgb.Dataset(data=X_val, label=y_val, weight=w_val)

# One multiclass model over the shared label space; num_class=11 means the
# labels must lie in 0..10.
params = dict(
    objective="multiclass",
    num_class=11,
    metric="multi_error",
    learning_rate=0.05,
    num_leaves=63,
    min_data_in_leaf=50,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    lambda_l2=1.0,
    verbosity=-1,
    seed=42,
)

# Up to 2000 boosting rounds, stopping once the validation metric has not
# improved for 50 consecutive rounds.
model = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=2000,
    valid_sets=[val_data],
    callbacks=[lgb.early_stopping(stopping_rounds=50)],
)

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[171]	valid_0's multi_error: 0.2162


In [13]:
# Validation Accuracy per Dataset
# The predicted label is the arg-max over all global classes, so a row may be
# assigned a class that belongs to a different dataset; such rows simply count
# as errors here.
# NOTE(review): the recorded run reports HIGGS accuracy of 1.0000, which is
# unusually high — worth checking whether a HIGGS feature column encodes the label.
val_preds = model.predict(X_val).argmax(axis=1)

results = {}
for dataset_name, dataset_code in (("HELOC", 0), ("COVTYPE", 1), ("HIGGS", 2)):
    in_dataset = d_val == dataset_code
    results[dataset_name] = accuracy_score(y_val[in_dataset], val_preds[in_dataset])

# Unweighted mean over the three datasets, regardless of their sizes.
mean_acc = np.mean([*results.values()])

print("Validation accuracy per dataset:")
for dataset_name, accuracy in results.items():
    print(f"{dataset_name}: {accuracy:.4f}")
print(f"Mean accuracy: {mean_acc:.4f}")


Validation accuracy per dataset:
HELOC: 0.7270
COVTYPE: 0.8448
HIGGS: 1.0000
Mean accuracy: 0.8573


In [14]:
# Load & Prepare Test Sets
def prepare_test(df, dataset_id):
    """Return a copy of the test frame `df` laid out like the model inputs.

    The copy gets a constant "dataset_id" column set to `dataset_id`, and any
    name in the module-level ALL_FEATURES list that is missing is added as a
    column filled with 0. The result contains exactly the ALL_FEATURES columns
    (in that order) followed by "dataset_id"; other columns of `df` are left
    out. The input frame itself is not modified.
    """
    prepared = df.copy()
    prepared["dataset_id"] = dataset_id
    absent = [name for name in ALL_FEATURES if name not in prepared.columns]
    for name in absent:
        # Features that belong to another dataset are represented as a constant 0.
        prepared[name] = 0
    return prepared[ALL_FEATURES + ["dataset_id"]]

# Read each test CSV and prepare it with the same dataset code used in training
# (0 = HELOC, 1 = COVTYPE, 2 = HIGGS).
heloc_test, covtype_test, higgs_test = (
    prepare_test(pd.read_csv(csv_path), dataset_code)
    for csv_path, dataset_code in (
        ("heloc_test.csv", 0),
        ("covtype_test.csv", 1),
        ("higgs_test.csv", 2),
    )
)

In [15]:
# Predict & Decode
#
# The model scores all 11 global classes for every row. Taking the arg-max over
# the full score vector can select a class that belongs to another dataset,
# which decodes to an invalid label after the offset is subtracted (for example
# a negative HIGGS label). The arg-max is therefore restricted to the score
# columns of the dataset the rows come from; whenever the unrestricted arg-max
# already falls inside that range the result is unchanged.
def _predict_in_range(test_frame, first_label, stop_label):
    """Return global class labels for `test_frame`, limited to [first_label, stop_label).

    `model.predict` is expected to return one score column per global class;
    only the columns first_label..stop_label-1 compete in the arg-max, and the
    winning position is shifted back into the global label space.
    """
    class_scores = model.predict(test_frame)
    return np.argmax(class_scores[:, first_label:stop_label], axis=1) + first_label


# Global label ranges follow the training encoding: HELOC 0-1, HIGGS 2-3, and
# COVTYPE = Cover_Type + 3.
# NOTE(review): the COVTYPE range 4..10 assumes Cover_Type takes the values
# 1..7 (consistent with num_class=11) — confirm against the data.
heloc_preds = _predict_in_range(heloc_test, 0, 2)
covtype_preds = _predict_in_range(covtype_test, 4, 11)
higgs_preds = _predict_in_range(higgs_test, 2, 4)

# Undo the per-dataset offsets to recover each dataset's original label coding.
heloc_final = heloc_preds
higgs_final = higgs_preds - 2
covtype_final = covtype_preds - 3

In [16]:
# Kaggle submission
# One ID block per dataset, stacked in the order COVTYPE, HELOC, HIGGS; each
# block numbers its rows consecutively from a fixed first ID.
# NOTE(review): the hard-coded first IDs (1, 3501, 4547) only tile without gaps
# or overlaps if there are 3500 COVTYPE and 1046 HELOC test rows — confirm
# against the competition's ID scheme.
final_submission = pd.concat(
    [
        pd.DataFrame({
            "ID": np.arange(first_id, first_id + len(decoded)),
            "Prediction": decoded,
        })
        for first_id, decoded in (
            (1, covtype_final),
            (3501, heloc_final),
            (4547, higgs_final),
        )
    ],
    ignore_index=True,
)

final_submission.to_csv("final_submission_unified_lgbm_class_balanced.csv", index=False)