In this experiment, we further refine our unified tabular learning approach by explicitly encoding dataset identity using one-hot features.
While we still train a single LightGBM model across all datasets, this allows the model to learn dataset-specific patterns in a principled and interpretable way.

This approach remains fully compliant with the assignment requirement of not training separate domain-specific models.

In [1]:
# Imports

import pandas as pd
import numpy as np

import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [2]:
# Load Training Data
# One CSV per source dataset, addressed by relative path; the frames are
# bound in the order HELOC, COVTYPE, HIGGS.
heloc, covtype, higgs = map(
    pd.read_csv,
    ("heloc_train.csv", "covtype_train.csv", "higgs_train.csv"),
)

In [3]:
# Dataset Identifiers
# Tag every row with the integer code of the dataset it came from:
# 0 = HELOC, 1 = COVTYPE, 2 = HIGGS. The column is added in place.
for source_frame, source_code in ((heloc, 0), (covtype, 1), (higgs, 2)):
    source_frame["dataset_id"] = source_code

In [4]:
# Global Target Encoding
# All three label sets are placed in one shared class-index space so a single
# multiclass model can be trained: HELOC -> 0-1, HIGGS -> 2-3, COVTYPE -> 4-10.

# HELOC: "Bad" -> 0, "Good" -> 1 (no shift needed).
heloc_codes = {"Bad": 0, "Good": 1}
heloc["target"] = heloc["RiskPerformance"].map(heloc_codes)

# HIGGS: "b" -> 0, "s" -> 1, then shifted by 2 to land on 2/3.
higgs_codes = {"b": 0, "s": 1}
higgs["target"] = higgs["Label"].map(higgs_codes).add(2)

# COVTYPE: shifted by 3, so the 1-7 labels land on 4-10.
covtype["target"] = covtype["Cover_Type"].add(3)

In [5]:
# Remove Original Label Columns
# "target" now carries the label, so the raw label columns are dropped to
# keep them out of the feature set.
heloc = heloc.drop("RiskPerformance", axis="columns")
higgs = higgs.drop("Label", axis="columns")
covtype = covtype.drop("Cover_Type", axis="columns")

In [6]:
# Align Feature Spaces
# Per-dataset feature lists exclude the two bookkeeping columns; the unified
# feature space is the sorted union of all three lists.
non_feature_columns = ("target", "dataset_id")

heloc_features = [name for name in heloc.columns if name not in non_feature_columns]
covtype_features = [name for name in covtype.columns if name not in non_feature_columns]
higgs_features = [name for name in higgs.columns if name not in non_feature_columns]

ALL_FEATURES = sorted(
    set().union(heloc_features, covtype_features, higgs_features)
)

In [7]:
def align_features(df):
    """Return a copy of ``df`` laid out on the unified feature space.

    Every name in ``ALL_FEATURES`` that ``df`` lacks is added as a column
    filled with 0. The result holds the ``ALL_FEATURES`` columns in order,
    followed by ``dataset_id`` and ``target``; any other column is dropped.
    The input frame is not modified.
    """
    aligned = df.copy()
    present = set(aligned.columns)
    for feature in ALL_FEATURES:
        if feature in present:
            continue
        # Feature belongs to another dataset: pad with a constant 0.
        aligned[feature] = 0
    ordered_columns = [*ALL_FEATURES, "dataset_id", "target"]
    return aligned[ordered_columns]

# Re-bind each training frame to its aligned version (same column layout for all).
heloc, covtype, higgs = (
    align_features(source_frame) for source_frame in (heloc, covtype, higgs)
)

In [8]:
# Combine Datasets
# Stack the aligned frames row-wise (HELOC, COVTYPE, HIGGS) with a fresh 0..n-1 index.
aligned_frames = [heloc, covtype, higgs]
full_data = pd.concat(aligned_frames, axis=0, ignore_index=True)

In [9]:
# One-Hot Encode Dataset Identity
# The feature matrix is everything except the label. The integer dataset_id
# is replaced by three 0/1 indicator columns, one per dataset.
X = full_data.drop(columns=["target"])

for indicator_name, ds_code in (("ds_heloc", 0), ("ds_covtype", 1), ("ds_higgs", 2)):
    X[indicator_name] = X["dataset_id"].eq(ds_code).astype(int)

X = X.drop(columns=["dataset_id"])

  X["ds_heloc"] = (X["dataset_id"] == 0).astype(int)
  X["ds_covtype"] = (X["dataset_id"] == 1).astype(int)
  X["ds_higgs"] = (X["dataset_id"] == 2).astype(int)


In [10]:
# Label vector, plus the per-row dataset code kept aside for stratification
# and per-dataset scoring (it is no longer a column of X).
y, dataset_id = full_data["target"], full_data["dataset_id"]

In [11]:
# Train / Validation Split
# 80/20 hold-out, stratified on dataset_id; the dataset codes are split
# alongside X and y so validation rows can later be grouped by dataset.
holdout_split = train_test_split(
    X, y, dataset_id,
    test_size=0.2,
    random_state=42,
    stratify=dataset_id,
)
(X_train, X_val, y_train, y_val, d_train, d_val) = holdout_split

In [12]:
# Train Unified LightGBM Model
# A single multiclass booster over the shared label space:
# 11 classes = 2 (HELOC) + 2 (HIGGS) + 7 (COVTYPE).
train_data = lgb.Dataset(data=X_train, label=y_train)
val_data = lgb.Dataset(data=X_val, label=y_val)

params = dict(
    objective="multiclass",
    num_class=11,
    metric="multi_error",
    learning_rate=0.05,
    num_leaves=31,
    verbosity=-1,
    seed=42,
)

# Stop once the validation metric has not improved for 50 consecutive rounds,
# within a budget of at most 500 boosting rounds.
early_stop = lgb.early_stopping(stopping_rounds=50)

model = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=500,
    valid_sets=[val_data],
    callbacks=[early_stop],
)

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[118]	valid_0's multi_error: 0.055646


In [13]:
# Validation Accuracy per Dataset
# Predicted class = highest-scoring column over all classes of the shared label space.
val_preds = model.predict(X_val).argmax(axis=1)

# Score each dataset's validation rows separately, then average the three
# accuracies with equal weight per dataset.
# NOTE(review): the recorded run reports HIGGS accuracy of 1.0000; a perfect
# score may indicate a feature that leaks the label - check the HIGGS columns
# before trusting it.
results = {}
for ds_label, ds_code in (("HELOC", 0), ("COVTYPE", 1), ("HIGGS", 2)):
    in_dataset = d_val == ds_code
    results[ds_label] = accuracy_score(y_val[in_dataset], val_preds[in_dataset])

mean_acc = np.mean(list(results.values()))

print("Validation accuracy per dataset:")
for ds_label, ds_accuracy in results.items():
    print(f"{ds_label}: {ds_accuracy:.4f}")
print(f"Mean accuracy: {mean_acc:.4f}")


Validation accuracy per dataset:
HELOC: 0.7169
COVTYPE: 0.8136
HIGGS: 1.0000
Mean accuracy: 0.8435


In [14]:
# Load Test Data
# One CSV per dataset, addressed by relative path, bound in the order
# HELOC, COVTYPE, HIGGS.
heloc_test, covtype_test, higgs_test = map(
    pd.read_csv,
    ("heloc_test.csv", "covtype_test.csv", "higgs_test.csv"),
)

In [15]:
# Prepare Test Sets
def prepare_test(df, dataset_id):
    """Build the model input matrix for one test set.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw test frame for a single dataset. It is not modified.
    dataset_id : int
        Code of the dataset the rows belong to (0 = HELOC, 1 = COVTYPE,
        2 = HIGGS).

    Returns
    -------
    pandas.DataFrame
        The ``ALL_FEATURES`` columns in order (features absent from ``df`` are
        filled with 0), followed by the integer 0/1 indicator columns
        ``ds_heloc``, ``ds_covtype`` and ``ds_higgs``.
    """
    padded = df.copy()
    for col in ALL_FEATURES:
        if col not in padded.columns:
            padded[col] = 0

    # Take an explicit copy of the selection so the indicator columns are
    # written to an independent frame (no chained-assignment warning).
    X = padded[ALL_FEATURES].copy()

    # Store the indicators as integers, matching the `.astype(int)` encoding
    # used for the training matrix, rather than as Python booleans.
    X["ds_heloc"] = int(dataset_id == 0)
    X["ds_covtype"] = int(dataset_id == 1)
    X["ds_higgs"] = int(dataset_id == 2)

    return X

# Re-bind each raw test frame to its prepared model-input matrix
# (dataset codes: 0 = HELOC, 1 = COVTYPE, 2 = HIGGS).
heloc_test, covtype_test, higgs_test = (
    prepare_test(raw_frame, ds_code)
    for raw_frame, ds_code in ((heloc_test, 0), (covtype_test, 1), (higgs_test, 2))
)

In [16]:
# Predict Test Sets
def _predict_in_class_range(features, first_class, last_class):
    """Predict shared-space class indices limited to one dataset's classes.

    The model scores every class of the shared label space. Taking the argmax
    over all of them could assign a row to a class that belongs to a different
    dataset, which would decode to an invalid label. Only the columns
    ``first_class..last_class`` (inclusive) are therefore compared, and the
    winning position is shifted back into the shared index space.
    """
    class_scores = model.predict(features)[:, first_class:last_class + 1]
    return np.argmax(class_scores, axis=1) + first_class


# Class ranges follow the global target encoding:
# HELOC 0-1, HIGGS 2-3, COVTYPE 4-10.
heloc_preds = _predict_in_class_range(heloc_test, 0, 1)
covtype_preds = _predict_in_class_range(covtype_test, 4, 10)
higgs_preds = _predict_in_class_range(higgs_test, 2, 3)

In [17]:
# Decode Predictions
# Undo the label shifts applied when the shared target column was built.
HIGGS_SHIFT = 2      # HIGGS classes 0/1 were stored as 2/3
COVTYPE_SHIFT = 3    # COVTYPE classes 1-7 were stored as 4-10

heloc_final = heloc_preds                       # HELOC classes are already 0/1
higgs_final = higgs_preds - HIGGS_SHIFT
covtype_final = covtype_preds - COVTYPE_SHIFT

In [18]:
# Kaggle submission
def _submission_part(first_id, predictions):
    """Return an ID/Prediction frame with consecutive IDs starting at ``first_id``."""
    ids = np.arange(first_id, first_id + len(predictions))
    return pd.DataFrame({"ID": ids, "Prediction": predictions})


# ID blocks start at 1 (COVTYPE), 3501 (HELOC) and 4547 (HIGGS).
# NOTE(review): these offsets presumably assume 3500 COVTYPE and 1046 HELOC
# test rows so that the IDs are contiguous - confirm against the test files.
covtype_sub = _submission_part(1, covtype_final)
heloc_sub = _submission_part(3501, heloc_final)
higgs_sub = _submission_part(4547, higgs_final)

# Stack the parts in ID order and write them out without the frame index.
submission_parts = [covtype_sub, heloc_sub, higgs_sub]
final_submission = pd.concat(submission_parts, ignore_index=True)

final_submission.to_csv("final_submission_group9_onehot_datasetid.csv", index=False)