In this project, we study whether a single machine learning model can be trained to perform well across multiple heterogeneous tabular datasets (HELOC, COVTYPE, and HIGGS). Instead of training separate domain-specific models, we reformulate the problem as a single unified multi-class classification task: each dataset's labels are mapped to a disjoint range of one shared label space, and all rows are stacked into one table over the union of their features. This approach aligns with recent work on representation learning for tabular data and allows us to evaluate how well one model can generalize across different domains.

In [11]:
# Basic libraries
import pandas as pd
import numpy as np

# LightGBM
import lightgbm as lgb

# Sklearn utilities
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [12]:
# Read the three training tables and tag every row with the id of the
# dataset it came from (0 = HELOC, 1 = COVTYPE, 2 = HIGGS).
heloc = pd.read_csv("heloc_train.csv").assign(dataset_id=0)
covtype = pd.read_csv("covtype_train.csv").assign(dataset_id=1)
higgs = pd.read_csv("higgs_train.csv").assign(dataset_id=2)

In [13]:
# Global target encoding
#
# All three tasks share one label space; each dataset owns a disjoint
# block of class ids:
#
# HELOC:
#   0 = Bad
#   1 = Good
#
# HIGGS:
#   2 = Background
#   3 = Signal
#
# COVTYPE:
#   4–10 = Forest cover types 1–7

# HELOC: classes 0-1
heloc["target"] = heloc["RiskPerformance"].map({
    "Bad": 0,
    "Good": 1
})

# HIGGS: "b"/"s" -> 0/1, then shifted into classes 2-3
higgs["target"] = higgs["Label"].map({
    "b": 0,
    "s": 1
}) + 2

# COVTYPE: cover types 1-7 shifted into classes 4-10
covtype["target"] = covtype["Cover_Type"] + 3

# Series.map turns any label that is not a key of the dict into NaN, and the
# COVTYPE shift only lands in 4-10 when Cover_Type is 1-7. Fail loudly here
# instead of training on missing or overlapping class ids.
assert heloc["target"].notna().all(), "HELOC: unexpected RiskPerformance value"
assert higgs["target"].notna().all(), "HIGGS: unexpected Label value"
assert covtype["target"].between(4, 10).all(), "COVTYPE: Cover_Type outside 1-7"

In [14]:
# Discard the raw label columns so that only "target" carries the label and
# the originals cannot be picked up as features.
heloc = heloc.drop("RiskPerformance", axis="columns")
higgs = higgs.drop("Label", axis="columns")
covtype = covtype.drop("Cover_Type", axis="columns")

In [15]:
# Per-dataset feature names: every column except the bookkeeping ones
_non_features = ("target", "dataset_id")
heloc_features = [name for name in heloc.columns if name not in _non_features]
covtype_features = [name for name in covtype.columns if name not in _non_features]
higgs_features = [name for name in higgs.columns if name not in _non_features]

# Sorted union of the feature names of all three datasets
ALL_FEATURES = sorted({*heloc_features, *covtype_features, *higgs_features})

In [16]:
# Align features across datasets
def align_features(df):
    """Return a copy of ``df`` laid out on the shared feature schema.

    Every name in ``ALL_FEATURES`` that ``df`` lacks is added as a constant
    column of zeros. The result holds the ``ALL_FEATURES`` columns in order,
    followed by ``dataset_id`` and ``target``; the input frame is not modified.
    """
    aligned = df.copy()
    absent = [name for name in ALL_FEATURES if name not in aligned.columns]
    for name in absent:
        aligned[name] = 0
    layout = [*ALL_FEATURES, "dataset_id", "target"]
    return aligned[layout]

# Put all three training tables on the same column layout
heloc, covtype, higgs = map(align_features, (heloc, covtype, higgs))

In [17]:
# Freeze the model's input columns: every column of the aligned layout except
# the label. Note that dataset_id stays in and is therefore a model feature.
FEATURE_COLS = list(filter(lambda name: name != "target", heloc.columns))

print("Number of features:", len(FEATURE_COLS))

Number of features: 110


In [18]:
# Stack the three aligned tables into one frame with a fresh 0..n-1 index
full_data = pd.concat((heloc, covtype, higgs), ignore_index=True)

# Model inputs, global class labels, and per-row dataset membership
X = full_data.loc[:, FEATURE_COLS]
y = full_data.loc[:, "target"]
dataset_id = full_data.loc[:, "dataset_id"]

In [19]:
# Give each dataset equal total weight
#
# Each row is weighted by 1 / (number of rows in its dataset), so the three
# datasets contribute the same total weight regardless of their size. The
# weights are then rescaled to have mean 1.
counts = full_data["dataset_id"].value_counts().to_dict()

# Vectorised lookup of every row's dataset size (replaces a per-row Python loop)
rows_in_dataset = full_data["dataset_id"].map(counts)

weights = (1.0 / rows_in_dataset).to_numpy(dtype=float)
weights = weights / weights.mean()

In [21]:
# Train/validation split
#
# 80/20 split, stratified by dataset_id so each dataset keeps the same share
# of rows in both parts. The sample weights are split in the same call, which
# keeps them row-aligned with X / y without relying on the frame's index
# labels being the positions 0..n-1.
X_train, X_val, y_train, y_val, d_train, d_val, w_train, w_val = train_test_split(
    X,
    y,
    dataset_id,
    weights,
    test_size=0.2,
    random_state=42,
    stratify=dataset_id
)

In [22]:
# Train one unified LightGBM model
params = {
    # Task: one softmax classifier over the 11 global classes (ids 0-10)
    "objective": "multiclass",
    "num_class": 11,
    # Evaluation metric only (classification error = 1 - accuracy); it is
    # reported on the validation set and drives early stopping, while the
    # model itself is fitted with the multiclass objective.
    "metric": "multi_error",
    # Tree shape and step size
    "learning_rate": 0.05,
    "num_leaves": 63,
    "min_data_in_leaf": 50,
    # Column / row subsampling (rows are re-sampled every iteration)
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 1,
    # Regularisation, logging and reproducibility
    "lambda_l2": 1.0,
    "verbosity": -1,
    "seed": 42
}

# Both datasets carry the per-row weights that balance the three sources
train_data = lgb.Dataset(X_train, label=y_train, weight=w_train)
val_data = lgb.Dataset(X_val, label=y_val, weight=w_val)

# Stop once the validation metric has not improved for 50 rounds, and log
# the metric every 50 rounds.
training_callbacks = [
    lgb.early_stopping(50),
    lgb.log_evaluation(50)
]

model = lgb.train(
    params,
    train_data,
    num_boost_round=2000,
    valid_sets=[val_data],
    callbacks=training_callbacks
)

Training until validation scores don't improve for 50 rounds
[50]	valid_0's multi_error: 0.151063
[100]	valid_0's multi_error: 0.146745
[150]	valid_0's multi_error: 0.143274
[200]	valid_0's multi_error: 0.139238
[250]	valid_0's multi_error: 0.138489
[300]	valid_0's multi_error: 0.136007
[350]	valid_0's multi_error: 0.134376
[400]	valid_0's multi_error: 0.133252
Early stopping, best iteration is:
[365]	valid_0's multi_error: 0.132649


In [23]:
# Validation results per dataset
#
# Predicted class = highest-scoring of the global classes for each row
val_preds = np.argmax(model.predict(X_val), axis=1)

# Dataset name -> dataset_id, listed in reporting order
_dataset_ids = {"HELOC": 0, "COVTYPE": 1, "HIGGS": 2}

results = {}
for name, did in _dataset_ids.items():
    in_dataset = d_val == did  # validation rows that belong to this dataset
    results[name] = accuracy_score(y_val[in_dataset], val_preds[in_dataset])

# Unweighted mean over datasets: each dataset counts equally, whatever its size
mean_acc = np.mean(list(results.values()))

print("Validation accuracy per dataset:")
for k, v in results.items():
    print(f"{k}: {v:.4f}")
print(f"Mean accuracy: {mean_acc:.4f}")

Validation accuracy per dataset:
HELOC: 0.7212
COVTYPE: 0.8809
HIGGS: 1.0000
Mean accuracy: 0.8674


In [24]:
# Read the three test tables
heloc_test, covtype_test, higgs_test = map(
    pd.read_csv,
    ("heloc_test.csv", "covtype_test.csv", "higgs_test.csv"),
)

In [25]:
# Prepare test sets
def prepare_test(df, dataset_id):
    """Return a copy of ``df`` restricted to the model's input columns.

    The copy gets a ``dataset_id`` column set to the given value, every name
    in ``FEATURE_COLS`` that is still missing is added as a constant column
    of zeros, and the result contains exactly the ``FEATURE_COLS`` columns in
    that order. The input frame is not modified.
    """
    prepared = df.copy()
    prepared["dataset_id"] = dataset_id
    absent = [name for name in FEATURE_COLS if name not in prepared.columns]
    for name in absent:
        prepared[name] = 0
    return prepared[FEATURE_COLS]

# Bring each test table onto the training column layout, tagged with the same
# dataset ids as the training data (0 = HELOC, 1 = COVTYPE, 2 = HIGGS)
heloc_test, covtype_test, higgs_test = (
    prepare_test(frame, did)
    for frame, did in ((heloc_test, 0), (covtype_test, 1), (higgs_test, 2))
)

In [26]:
# Predict test sets
#
# The model scores all 11 global classes for every row. The argmax is taken
# only over the block of classes that belongs to the row's own dataset, so a
# HELOC row can never be labelled with a HIGGS or COVTYPE class (and vice
# versa). Adding the block's first class id back keeps the results in the
# global label space:
#   HELOC 0-1, HIGGS 2-3, COVTYPE 4-10
heloc_proba = model.predict(heloc_test)
covtype_proba = model.predict(covtype_test)
higgs_proba = model.predict(higgs_test)

heloc_preds = np.argmax(heloc_proba[:, 0:2], axis=1)
covtype_preds = np.argmax(covtype_proba[:, 4:11], axis=1) + 4
higgs_preds = np.argmax(higgs_proba[:, 2:4], axis=1) + 2

In [27]:
# Decode predictions: shift global class ids back to each dataset's own labels
heloc_final = heloc_preds                       # HELOC block starts at 0: no shift
higgs_final = np.subtract(higgs_preds, 2)       # HIGGS block starts at 2
covtype_final = np.subtract(covtype_preds, 3)   # target was Cover_Type + 3

In [28]:
# Kaggle submission
def _numbered_predictions(first_id, predictions):
    """Build an ID / Prediction table whose IDs count up from ``first_id``."""
    ids = np.arange(first_id, first_id + len(predictions))
    return pd.DataFrame({"ID": ids, "Prediction": predictions})


# NOTE(review): the start ids are hard-coded. The three ID ranges are
# back-to-back only if the COVTYPE and HELOC test sets have 3500 and 1046
# rows respectively - confirm against the competition's ID layout.
covtype_sub = _numbered_predictions(1, covtype_final)
heloc_sub = _numbered_predictions(3501, heloc_final)
higgs_sub = _numbered_predictions(4547, higgs_final)

# One file, in ID order: COVTYPE, then HELOC, then HIGGS
final_submission = pd.concat(
    [covtype_sub, heloc_sub, higgs_sub],
    ignore_index=True
)

final_submission.to_csv(
    "final_submission_unified_lgbm_balanced_g9.csv",
    index=False
)