We reformulate three heterogeneous tabular classification tasks (HELOC, Covertype, and Higgs) as a single unified multi-class learning problem and train one LightGBM model across all datasets. After selecting the number of boosting rounds using a validation split and early stopping (all other hyperparameters are fixed), we retrain the model on the full training data for that number of rounds to maximize test performance.

In [1]:
# Imports

import pandas as pd
import numpy as np
import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [2]:
# Load datasets + dataset_id
# Each training file is tagged with an integer dataset_id as it is read:
# heloc -> 0, covtype -> 1, higgs -> 2.
heloc = pd.read_csv("heloc_train.csv").assign(dataset_id=0)
covtype = pd.read_csv("covtype_train.csv").assign(dataset_id=1)
higgs = pd.read_csv("higgs_train.csv").assign(dataset_id=2)

In [3]:
# Global target encoding
# Every dataset gets its own slice of one shared label space:
#   heloc   -> 0 (Bad), 1 (Good)
#   higgs   -> 2 (b),   3 (s)
#   covtype -> Cover_Type shifted up by 3
heloc["target"] = heloc["RiskPerformance"].map({"Bad": 0, "Good": 1})

# Offset of 2 is folded directly into the mapping.
higgs["target"] = higgs["Label"].map({"b": 2, "s": 3})

covtype["target"] = covtype["Cover_Type"].add(3)

In [4]:
# Drop original labels
# The raw label columns are now redundant with "target" and must not be used as features.
heloc = heloc.drop(labels="RiskPerformance", axis="columns")
higgs = higgs.drop(labels="Label", axis="columns")
covtype = covtype.drop(labels="Cover_Type", axis="columns")

In [5]:
# Unified feature space
# Per-dataset feature lists: every column except the bookkeeping ones.
heloc_features, covtype_features, higgs_features = (
    [name for name in frame.columns if name not in ("target", "dataset_id")]
    for frame in (heloc, covtype, higgs)
)

# Sorted union of all feature names, giving one deterministic column order.
ALL_FEATURES = sorted({*heloc_features, *covtype_features, *higgs_features})

In [6]:
# Align features
def align_features(df):
    """Return a copy of ``df`` laid out on the unified feature space.

    Features listed in ``ALL_FEATURES`` that ``df`` lacks are added and filled
    with 0. The result contains exactly ``ALL_FEATURES`` followed by
    ``dataset_id`` and ``target``, in that order; any other column is dropped.
    The input frame is not modified.
    """
    absent = {name: 0 for name in ALL_FEATURES if name not in df.columns}
    # assign() works on a copy, so the caller's frame is left untouched.
    padded = df.assign(**absent)
    return padded[ALL_FEATURES + ["dataset_id", "target"]]

# Put all three training frames on the same column layout.
heloc, covtype, higgs = (
    align_features(frame) for frame in (heloc, covtype, higgs)
)

In [7]:
# Lock features
# Model inputs are every aligned column except the label; dataset_id stays in as a feature.
FEATURE_COLS = heloc.columns[heloc.columns != "target"].tolist()

In [8]:
# Combine datasets
# Stack the aligned frames; ignore_index gives a fresh 0..n-1 row index.
full_data = pd.concat((heloc, covtype, higgs), axis=0, ignore_index=True)

X, y = full_data[FEATURE_COLS], full_data["target"]
dataset_id = full_data["dataset_id"]


In [9]:
# Dataset-balanced weights
# Each row is weighted by the inverse of its dataset's row count, so the three
# datasets carry equal total weight regardless of how many rows each has.
counts = full_data["dataset_id"].value_counts().to_dict()

# Vectorised lookup of each row's dataset size (replaces a per-row Python loop).
weights = 1.0 / dataset_id.map(counts).to_numpy(dtype=float)
# Rescale so the mean weight is 1.
weights = weights / weights.mean()


In [10]:
# Train/val split
# Hold out 20% for validation, stratified by dataset_id so each source dataset
# keeps its share in both partitions. The weight vector is split in the same
# call, which keeps it row-aligned with the features and labels.
X_train, X_val, y_train, y_val, w_train, w_val = train_test_split(
    X,
    y,
    weights,
    test_size=0.2,
    random_state=42,
    stratify=dataset_id,
)

In [11]:
# Train with early stopping
# Weighted LightGBM datasets for the training and validation partitions.
train_data = lgb.Dataset(data=X_train, label=y_train, weight=w_train)
val_data = lgb.Dataset(data=X_val, label=y_val, weight=w_val)

params = dict(
    objective="multiclass",
    # One class per global label 0..10.
    # NOTE(review): presumes Cover_Type + 3 tops out at 10 - confirm against the data.
    num_class=11,
    metric="multi_error",
    learning_rate=0.05,
    num_leaves=63,
    min_data_in_leaf=50,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    lambda_l2=1.0,
    verbosity=-1,
    seed=42,
)

# Up to 2000 rounds; stop once the validation metric has not improved for 50 rounds.
model = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=2000,
    valid_sets=[val_data],
    callbacks=[lgb.early_stopping(stopping_rounds=50)],
)

# Kept so the final retrain can run for this number of rounds.
best_iter = model.best_iteration
print("Best iteration:", best_iter)


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[365]	valid_0's multi_error: 0.132649
Best iteration: 365


In [12]:
# Final retrain on all data
# No validation set here: train and validation rows are used together, and the
# round count is fixed to the value found by early stopping.
final_train = lgb.Dataset(data=X, label=y, weight=weights)

final_model = lgb.train(params=params, train_set=final_train, num_boost_round=best_iter)


In [13]:
# Load & prepare test sets
def prepare_test(df, dataset_id):
    """Return a copy of a raw test frame laid out like the training features.

    The frame is tagged with ``dataset_id``, any column of ``FEATURE_COLS`` it
    lacks is added and filled with 0, and the result is restricted to
    ``FEATURE_COLS`` in that order. The input frame is not modified.
    """
    tagged = df.assign(dataset_id=dataset_id)
    absent = {name: 0 for name in FEATURE_COLS if name not in tagged.columns}
    return tagged.assign(**absent)[FEATURE_COLS]

# Test files are listed in dataset_id order: heloc -> 0, covtype -> 1, higgs -> 2.
heloc_test, covtype_test, higgs_test = (
    prepare_test(pd.read_csv(path), ds_id)
    for ds_id, path in enumerate(("heloc_test.csv", "covtype_test.csv", "higgs_test.csv"))
)

In [14]:
# Predict & decode
# The model scores all 11 global classes for every row, but each test set can
# only take labels from its own dataset's slice of the global label space
# (heloc: 0-1, higgs: 2-3, covtype: 4-10). Taking the argmax inside that slice
# always yields a valid label; an unrestricted argmax could pick a class owned
# by another dataset and decode to an out-of-range value (e.g. a negative one).
# NOTE(review): the covtype slice presumes Cover_Type runs 1..7, i.e. global
# classes 4..10 - confirm against the training data.

# Columns 0-1 are already the heloc labels (0 = Bad, 1 = Good).
heloc_final = np.argmax(final_model.predict(heloc_test)[:, 0:2], axis=1)

# Position within columns 4-10 is 0..6; +1 restores the original Cover_Type
# (same result as "global class - 3").
covtype_final = np.argmax(final_model.predict(covtype_test)[:, 4:11], axis=1) + 1

# Position within columns 2-3 is the higgs label (0 = b, 1 = s), i.e. "global class - 2".
higgs_final = np.argmax(final_model.predict(higgs_test)[:, 2:4], axis=1)

In [15]:
# Kaggle submission
# Each prediction array gets a consecutive ID range starting at a fixed offset:
# covtype from 1, heloc from 3501, higgs from 4547.
final_submission = pd.concat([
    pd.DataFrame({
        "ID": np.arange(first_id, first_id + len(predictions)),
        "Prediction": predictions,
    })
    for first_id, predictions in (
        (1, covtype_final),
        (3501, heloc_final),
        (4547, higgs_final),
    )
])

final_submission.to_csv("submission_retrain_all_data.csv", index=False)