### Alternative Unified Label Space (9-class)

In early experiments, we also evaluated a 9-class unified formulation in which the binary targets of HELOC and HIGGS were both mapped to the same two labels (0 and 1), while the 7 CoverType classes occupied the remaining labels (2–8). This reduced the total number of classes, but it introduced semantic ambiguity: a shared label now stands for unrelated outcomes in datasets with very different meanings (credit risk vs. particle physics).

While this formulation produced reasonable performance, we ultimately adopted an 11-class setup in the final model to preserve dataset-specific target semantics and avoid unintended label collisions. The 9-class experiment is included for completeness as an ablation of the label design choice.


In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split


# Read the three training sets

heloc = pd.read_csv("heloc_train.csv")
covtype = pd.read_csv("covtype_train.csv")
higgs = pd.read_csv("higgs_train.csv")

# Strip proxy / ID columns from HIGGS; names that are not in the frame are skipped
DROP_HIGGS_COLS = ["Weight", "weight", "EventId", "EventID"]
higgs = higgs.drop(columns=DROP_HIGGS_COLS, errors="ignore")

# Tag every row with the dataset it came from (0 = HELOC, 1 = CoverType, 2 = HIGGS)
heloc = heloc.assign(dataset_id=0)
covtype = covtype.assign(dataset_id=1)
higgs = higgs.assign(dataset_id=2)


# Experimental 9-class label layout: HELOC and HIGGS share 0/1, CoverType takes 2..8.
# Each raw label column is removed from its frame as it is converted.

# HELOC: "Bad" -> 0, "Good" -> 1
heloc["target"] = heloc.pop("RiskPerformance").map({"Bad": 0, "Good": 1})

# HIGGS reuses the same two labels (collapsed): "b" -> 0, "s" -> 1
higgs["target"] = higgs.pop("Label").map({"b": 0, "s": 1})

# CoverType is shifted by one so it starts right after the shared binary labels
covtype["target"] = covtype.pop("Cover_Type") + 1  # 1..7 → 2..8


# Align features

def feature_cols(df):
    """Return the column names of *df* that are model features.

    The bookkeeping columns ``target`` and ``dataset_id`` are left out; the
    remaining names keep the order they have in ``df.columns``.
    """
    bookkeeping = {"target", "dataset_id"}
    return [name for name in df.columns if name not in bookkeeping]

# Sorted union of every feature name that appears in any of the three datasets
ALL_FEATURES = sorted(
    {name for frame in (heloc, covtype, higgs) for name in feature_cols(frame)}
)

def align(df, features=None):
    """Return a copy of *df* laid out as ``features + ["dataset_id", "target"]``.

    Feature columns missing from *df* are created and filled with 0; columns
    that are not requested are dropped. *df* itself is not modified.

    Args:
        df: Frame that already carries ``dataset_id`` and ``target`` columns.
        features: Ordered feature names to align to. Defaults to the
            module-level ``ALL_FEATURES``.

    Returns:
        A new DataFrame with exactly the requested columns, in order.

    Raises:
        KeyError: If ``dataset_id`` or ``target`` is absent from *df*.
    """
    if features is None:
        features = ALL_FEATURES

    meta_cols = ["dataset_id", "target"]
    absent = [c for c in meta_cols if c not in df.columns]
    if absent:
        # Only features may be zero-filled; a missing id/label column is an error
        raise KeyError(f"{absent} not in columns")

    # A single reindex adds every missing feature at once, instead of inserting
    # columns one by one (which fragments the frame)
    return df.reindex(columns=list(features) + meta_cols, fill_value=0)

# Bring all three frames onto the shared column layout
heloc, covtype, higgs = align(heloc), align(covtype), align(higgs)

# Stack them into one table with a fresh 0..n-1 index
full = pd.concat((heloc, covtype, higgs), ignore_index=True)

# Only the label is split off; dataset_id stays among the model inputs
y = full["target"]
X = full.drop(columns="target")

print("Unique targets:", sorted(y.unique()))
print("Number of classes:", y.nunique())  # should be 9


# Weights (same logic as final model)
#
# Each row is weighted by 1 / (rows in its dataset) * 1 / sqrt(rows in its class),
# and the weights are then rescaled so that their mean is 1.

dataset_counts = full["dataset_id"].value_counts().to_dict()
class_counts = full["target"].value_counts().to_dict()

# Fail fast on unlabeled rows: a NaN target would otherwise turn every weight
# into NaN through the mean normalization below
if full["target"].isna().any():
    raise ValueError("target contains NaN; check the label mappings")

# Vectorized per-row lookups replace the row-by-row iterrows() loop
rows_per_dataset = full["dataset_id"].map(dataset_counts).to_numpy(dtype=float)
rows_per_class = full["target"].map(class_counts).to_numpy(dtype=float)

weights = (1 / rows_per_dataset) * (1 / np.sqrt(rows_per_class))
weights /= weights.mean()


# Hold out 20% for validation, stratified so each dataset keeps its share in both parts

X_tr, X_va, y_tr, y_va = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=full["dataset_id"],
    random_state=42,
)

# X carries the 0..n-1 index of `full`, so index labels double as positions in `weights`
w_tr, w_va = weights[X_tr.index], weights[X_va.index]

train_data = lgb.Dataset(X_tr, label=y_tr, weight=w_tr)
val_data = lgb.Dataset(X_va, label=y_va, weight=w_va)

# Multiclass setup over the 9 unified labels, scored by (weighted) error rate
params = dict(
    objective="multiclass",
    num_class=9,
    metric="multi_error",
    learning_rate=0.05,
    num_leaves=63,
    min_data_in_leaf=50,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    verbosity=-1,
    seed=42,
)

# Up to 2000 rounds; stop once the validation metric has not improved for 50 rounds
model = lgb.train(
    params,
    train_data,
    num_boost_round=2000,
    valid_sets=[val_data],
    callbacks=[lgb.early_stopping(50)],
)

best_iteration = model.best_iteration
print("Best iteration:", best_iteration)


Unique targets: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8)]
Number of classes: 9
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[331]	valid_0's multi_error: 0.176267
Best iteration: 331


In [None]:
# Read the three test sets in one go
heloc_test, covtype_test, higgs_test = map(
    pd.read_csv,
    ("heloc_test.csv", "covtype_test.csv", "higgs_test.csv"),
)


In [9]:
# Remove proxy / ID-like columns from the HIGGS test set; absent names are skipped
higgs_test = higgs_test.drop(
    columns=["Weight", "weight", "EventId", "EventID"],
    errors="ignore",
)


In [10]:
# Run each test frame through prepare_test together with its dataset id
# (0 = HELOC, 1 = CoverType, 2 = HIGGS — the ids given to the training frames).
# NOTE(review): prepare_test is defined outside the visible cells; presumably it
# aligns the columns to the training layout — confirm.
heloc_test, covtype_test, higgs_test = (
    prepare_test(heloc_test, 0),
    prepare_test(covtype_test, 1),
    prepare_test(higgs_test, 2),
)


In [11]:
# Predict in the unified 9-class space, but let each dataset choose only among
# its own labels. A plain argmax over all 9 columns could assign, say, a HELOC
# row a CoverType label, which would end up in the submission as an invalid
# class. Whenever the overall argmax is already in range the result is unchanged.
# NOTE(review): final_model is not defined in the visible cells — assumed to be
# the 9-class LightGBM model whose predict() returns one column per class; confirm.
BINARY_CLASSES = slice(0, 2)   # labels 0/1, shared by HELOC and HIGGS
COVTYPE_CLASSES = slice(2, 9)  # labels 2..8, CoverType

heloc_pred = final_model.predict(heloc_test)[:, BINARY_CLASSES].argmax(axis=1)
# argmax is relative to the slice, so shift it back into the unified 2..8 range
covtype_pred = (
    final_model.predict(covtype_test)[:, COVTYPE_CLASSES].argmax(axis=1)
    + COVTYPE_CLASSES.start
)
higgs_pred = final_model.predict(higgs_test)[:, BINARY_CLASSES].argmax(axis=1)

In [None]:
# Map unified-space predictions back to each dataset's own Kaggle labels
heloc_final, higgs_final = heloc_pred, higgs_pred  # 0/1 are used as-is
covtype_final = np.subtract(covtype_pred, 1)       # undo the +1 shift: 2..8 → 1..7

In [15]:
# Show which decoded labels occur in each dataset's predictions
print("Decoded CovType labels:", np.unique(covtype_final))
# Dataset name was misspelled "HEL0C" (digit zero) in this message
print("Decoded HELOC labels:", np.unique(heloc_final))
print("Decoded HIGGS labels:", np.unique(higgs_final))

Decoded CovType labels: [1 2 3 4 5 6 7]
Decoded HEL0C labels: [0 1]
Decoded HIGGS labels: [0 1]


In [19]:
# Assemble the submission: one consecutive ID range per dataset, stacked in this order.
# NOTE(review): the start IDs 1 / 3501 / 4547 are hard-coded — presumably they follow
# the competition's ID layout; confirm they match the test-set sizes.
id_start_and_preds = [
    (1, covtype_final),
    (3501, heloc_final),
    (4547, higgs_final),
]

final_submission = pd.concat(
    [
        pd.DataFrame(
            {
                "ID": np.arange(start, start + len(preds)),
                "Prediction": preds,
            }
        )
        for start, preds in id_start_and_preds
    ],
    ignore_index=True,
)

final_submission.to_csv("AML_ablation_exp2_classes9_group09.csv", index=False)

final_submission.head()


Unnamed: 0,ID,Prediction
0,1,1
1,2,1
2,3,1
3,4,1
4,5,1


In [21]:
# Sanity check: which unified-space labels each dataset's predictions actually use
for caption, preds in (
    ("Unique covtype predictions:", covtype_pred),
    ("Unique heloc predictions:", heloc_pred),
    ("Unique higgs predictions:", higgs_pred),
):
    print(caption, np.unique(preds))


Unique covtype predictions: [2 3 4 5 6 7 8]
Unique heloc predictions: [0 1]
Unique higgs predictions: [0 1]
