In [1]:
# Imports

import pandas as pd
import numpy as np
import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Load training datasets
# Each table is read from disk and immediately tagged with the integer id of
# its source: 0 = HELOC, 1 = COVTYPE, 2 = HIGGS.
heloc = pd.read_csv("heloc_train.csv").assign(dataset_id=0)
covtype = pd.read_csv("covtype_train.csv").assign(dataset_id=1)
higgs = pd.read_csv("higgs_train.csv").assign(dataset_id=2)


In [3]:
# Global target encoding (unified label space)
# All three tasks share one multiclass label space:
#   HELOC   -> 0 ("Bad"), 1 ("Good")
#   HIGGS   -> 2 ("b"),   3 ("s")
#   COVTYPE -> Cover_Type + 3
def _encode_labels(labels, mapping, offset, source):
    """Map raw string labels to integer class ids shifted by `offset`.

    Parameters
    ----------
    labels : pandas.Series
        Raw label column.
    mapping : dict
        Raw label value -> local integer class id.
    offset : int
        Shift that places the local ids inside the global label space.
    source : str
        Column description used in the error message.

    Raises
    ------
    ValueError
        If any label (including a missing value) is not a key of `mapping`.
        Without this check `Series.map` would silently produce NaN targets.
    """
    encoded = labels.map(mapping)
    unmapped = labels[encoded.isna()]
    if not unmapped.empty:
        raise ValueError(
            f"Unexpected label values in {source}: {unmapped.unique().tolist()}"
        )
    return encoded.astype("int64") + offset


# HELOC
heloc["target"] = _encode_labels(
    heloc["RiskPerformance"], {"Bad": 0, "Good": 1}, 0, "heloc.RiskPerformance"
)

# HIGGS
higgs["target"] = _encode_labels(
    higgs["Label"], {"b": 0, "s": 1}, 2, "higgs.Label"
)

# COVTYPE
# NOTE(review): presumably Cover_Type starts at 1, so classes land on 4..10
# without colliding with HIGGS class 3 -- confirm against the data.
covtype["target"] = covtype["Cover_Type"] + 3


In [4]:
# Remove original label columns
# Their information now lives in "target", so the raw columns must not be
# left behind as model inputs.
heloc = heloc.drop(columns="RiskPerformance")
covtype = covtype.drop(columns="Cover_Type")
higgs = higgs.drop(columns="Label")

In [5]:
# Build unified feature space
_NON_FEATURE_COLS = ("target", "dataset_id")


def _feature_columns(frame):
    """Return the column names of `frame` that are neither label nor dataset tag."""
    return [name for name in frame.columns if name not in _NON_FEATURE_COLS]


heloc_features = _feature_columns(heloc)
covtype_features = _feature_columns(covtype)
higgs_features = _feature_columns(higgs)

# Sorted union, so the column order is deterministic across runs.
ALL_FEATURES = sorted(
    set().union(heloc_features, covtype_features, higgs_features)
)

In [6]:
# Align features across datasets
def align_features(df, features=None):
    """Return a new frame laid out on the unified feature space.

    Columns listed in `features` that `df` lacks are created and filled with
    0; columns of `df` outside `features` (other than the bookkeeping columns)
    are dropped. The result's columns are `features` followed by
    "dataset_id" and "target". `df` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must already contain "dataset_id" and "target".
    features : sequence of str, optional
        Feature column names, in output order. Defaults to the module-level
        ALL_FEATURES.

    Raises
    ------
    KeyError
        If "dataset_id" or "target" is missing from `df`.
    """
    if features is None:
        features = ALL_FEATURES
    bookkeeping = ["dataset_id", "target"]

    # reindex would silently invent these columns (filled with 0), so keep the
    # explicit failure that plain column selection gives.
    absent = [col for col in bookkeeping if col not in df.columns]
    if absent:
        raise KeyError(f"{absent} not in index")

    # A single reindex adds every missing feature at once instead of inserting
    # columns one by one.
    return df.reindex(columns=list(features) + bookkeeping, fill_value=0)

# Project every training table onto the shared column layout.
heloc, covtype, higgs = [
    align_features(frame) for frame in (heloc, covtype, higgs)
]

In [7]:
# Lock feature columns
# Every column of the aligned frame except the label is a model input; note
# that this keeps "dataset_id" as a feature.
FEATURE_COLS = list(filter(lambda name: name != "target", heloc.columns))
print("Number of features:", len(FEATURE_COLS))

Number of features: 110


In [8]:
# Combine datasets into one unified table
# ignore_index=True gives the stacked table a fresh 0..n-1 index.
_aligned_frames = (heloc, covtype, higgs)
full_data = pd.concat(_aligned_frames, axis=0, ignore_index=True)

X = full_data[FEATURE_COLS]
y, dataset_id = full_data["target"], full_data["dataset_id"]

In [9]:
# HELOC-WEIGHTED sample weights
# Each row is weighted importance / n_rows_of_its_dataset, so a dataset's
# total weight equals its importance factor regardless of its size:
#   HELOC (hardest) 1.5, COVTYPE 1.0, HIGGS (easiest) 0.7.
counts = full_data["dataset_id"].value_counts().to_dict()

# The fallback of 1 only avoids a KeyError for a dataset with no rows; such a
# value is never selected by the masks below.
_heloc_w = 1.5 / counts.get(0, 1)
_covtype_w = 1.0 / counts.get(1, 1)
_higgs_w = 0.7 / counts.get(2, 1)

# Vectorized lookup instead of a Python loop over every row; any id other
# than 0 or 1 takes the HIGGS weight.
_ids = dataset_id.to_numpy()
weights = np.where(
    _ids == 0,
    _heloc_w,
    np.where(_ids == 1, _covtype_w, _higgs_w),
).astype(float)
weights = weights / weights.mean()   # normalize so the mean weight is 1

In [10]:
# Train / validation split
# 80/20 hold-out, stratified on dataset membership so every source keeps its
# share of rows in both parts.
_split_parts = train_test_split(
    X, y, dataset_id,
    test_size=0.2,
    random_state=42,
    stratify=dataset_id,
)
X_train, X_val, y_train, y_val, d_train, d_val = _split_parts

# `weights` is a positional array, so the split's index labels are used as
# positions here.
w_train, w_val = weights[X_train.index], weights[X_val.index]

In [11]:
# Train unified LightGBM model
train_data = lgb.Dataset(data=X_train, label=y_train, weight=w_train)
val_data = lgb.Dataset(data=X_val, label=y_val, weight=w_val)

# One multiclass model over the 11 global class ids; also reused for the
# final retrain.
params = dict(
    objective="multiclass",
    num_class=11,
    metric="multi_error",
    learning_rate=0.05,
    num_leaves=63,
    min_data_in_leaf=50,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    lambda_l2=1.0,
    verbosity=-1,
    seed=42,
)

# Stop after 50 rounds without improvement on the validation set and log the
# metric every 50 rounds.
_callbacks = [
    lgb.early_stopping(stopping_rounds=50),
    lgb.log_evaluation(period=50),
]

model = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=2000,
    valid_sets=[val_data],
    callbacks=_callbacks,
)

best_iter = model.best_iteration
print("Best iteration:", best_iter)

Training until validation scores don't improve for 50 rounds
[50]	valid_0's multi_error: 0.186357
[100]	valid_0's multi_error: 0.182428
[150]	valid_0's multi_error: 0.178912
[200]	valid_0's multi_error: 0.177541
[250]	valid_0's multi_error: 0.174597
[300]	valid_0's multi_error: 0.173615
[350]	valid_0's multi_error: 0.170205
[400]	valid_0's multi_error: 0.168645
[450]	valid_0's multi_error: 0.167038
[500]	valid_0's multi_error: 0.166917
Early stopping, best iteration is:
[498]	valid_0's multi_error: 0.165699
Best iteration: 498


In [12]:
# Final retrain on ALL data
# There is no validation set at this stage, so the number of boosting rounds
# is fixed to the value recorded in `best_iter`.
final_train = lgb.Dataset(data=X, label=y, weight=weights)
final_model = lgb.train(
    params=params,
    train_set=final_train,
    num_boost_round=best_iter,
)

In [13]:
# Load and prepare test datasets
def prepare_test(df, dataset_id, feature_cols=None):
    """Return a test frame with exactly the model's input columns.

    The frame is tagged with `dataset_id`, any model column it lacks is
    created and filled with 0, and columns the model does not use are
    dropped. Column order follows `feature_cols`. `df` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw test table.
    dataset_id : int
        Value written to the "dataset_id" column of every row.
    feature_cols : sequence of str, optional
        Model input columns, in order. Defaults to the module-level
        FEATURE_COLS.
    """
    if feature_cols is None:
        feature_cols = FEATURE_COLS
    tagged = df.assign(dataset_id=dataset_id)
    # A single reindex selects, orders and zero-fills the columns at once
    # instead of inserting missing columns one by one.
    return tagged.reindex(columns=list(feature_cols), fill_value=0)

# Read each test file and tag it with its dataset id:
# 0 = HELOC, 1 = COVTYPE, 2 = HIGGS.
heloc_test, covtype_test, higgs_test = (
    prepare_test(pd.read_csv(path), ds_id)
    for ds_id, path in enumerate(
        ("heloc_test.csv", "covtype_test.csv", "higgs_test.csv")
    )
)

In [14]:
# Predict test sets
def _predict_within(model, features, first_class, last_class):
    """Return the most probable global class id within one dataset's range.

    The model scores all global classes for every row. Taking the argmax over
    the full row could select a class that belongs to a different dataset,
    which would decode to an invalid label. Only the columns
    `first_class..last_class` (inclusive) are therefore considered.
    """
    proba = np.asarray(model.predict(features))
    in_range = proba[:, first_class:last_class + 1]
    # Shift back so the result is still a global class id.
    return np.argmax(in_range, axis=1) + first_class


# Global class ranges follow the training-time target encoding:
# HELOC 0..1, HIGGS 2..3, COVTYPE = Cover_Type + 3.
# NOTE(review): 4..10 assumes Cover_Type takes the values 1..7 -- confirm.
heloc_preds = _predict_within(final_model, heloc_test, 0, 1)
covtype_preds = _predict_within(final_model, covtype_test, 4, 10)
higgs_preds = _predict_within(final_model, higgs_test, 2, 3)

In [15]:
# Decode predictions
# Undo the global label offsets: HELOC has none, HIGGS was shifted by 2 and
# COVTYPE by 3.
heloc_final = heloc_preds
covtype_final = np.subtract(covtype_preds, 3)
higgs_final = np.subtract(higgs_preds, 2)

In [16]:
# Kaggle submission
def _submission_part(first_id, predictions):
    """Build an ID/Prediction table with consecutive IDs starting at `first_id`."""
    return pd.DataFrame({
        "ID": np.arange(first_id, first_id + len(predictions)),
        "Prediction": predictions,
    })


# The start IDs are fixed, so each block only fits if the preceding test set
# has the row count these offsets assume.
covtype_sub = _submission_part(1, covtype_final)
heloc_sub = _submission_part(3501, heloc_final)
higgs_sub = _submission_part(4547, higgs_final)

final_submission = pd.concat(
    [covtype_sub, heloc_sub, higgs_sub],
    ignore_index=True
)

# If a test set is longer than its fixed ID block, its IDs run into the next
# block. Fail loudly instead of writing a file with duplicate IDs.
_clashing_ids = final_submission.loc[final_submission["ID"].duplicated(), "ID"]
if not _clashing_ids.empty:
    raise ValueError(
        f"Submission IDs overlap between datasets ({len(_clashing_ids)} duplicates, "
        f"first at ID {int(_clashing_ids.iloc[0])}); check the hard-coded start IDs"
    )

final_submission.to_csv(
    "final_submission_unified_lgbm_HELOC_weighted.csv",
    index=False
)
