In [3]:
# %% [markdown]
# # 0. Imports & basic settings

# %%
import os

import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.metrics import average_precision_score
from sklearn.model_selection import StratifiedKFold

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# Seed used as the default for the CV splitter and passed to the model parameter dicts below.
RANDOM_STATE = 42
# Default number of stratified folds for out-of-fold prediction.
N_SPLITS = 10


In [1]:
import numpy as np
import pandas as pd

def add_eda_rule_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add three ordinal, rule-based risk features derived from EDA thresholds.

    The input frame is left untouched; a copy with these extra columns is
    returned:

    - ``chem_01_range``: 0 when chem_01 <= 2.0, 1 when in (2.0, 4.0], 2 above 4.0.
    - ``trace_metal_range``: 0 when trace_metal <= 90, 1 when in (90, 130],
      2 above 130.
    - ``risk_segment``: 1 ("target-1 suspect") when swelling == 'S' and
      2.0 <= chem_01 < 5.0; otherwise 2 ("high risk") when swelling == 'Y'
      or chem_01 >= 5.0; otherwise 0 ("normal").

    The features are emitted as numbers because the downstream boosters
    cannot consume object columns.

    Missing values: a NaN in ``chem_01`` / ``trace_metal`` yields NaN in the
    corresponding ``*_range`` column (that column is then float instead of
    int) rather than raising; NaN never satisfies a ``risk_segment`` rule.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``chem_01``, ``trace_metal`` and ``swelling``.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with the three columns above added (or replaced).

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    """
    df = df.copy()

    def _ordinal_bin(values: pd.Series, edges: list) -> pd.Series:
        # labels=False makes pd.cut return the bin position (0, 1, 2, ...)
        # directly, so no label -> int mapping step is needed.
        codes = pd.cut(values, bins=edges, labels=False)
        # Keep a plain integer column whenever every row landed in a bin;
        # with missing inputs the codes stay float so NaN can be represented.
        return codes if codes.isna().any() else codes.astype(int)

    # 1) chem_01 binned at 2.0 and 4.0 (right-closed intervals)
    df['chem_01_range'] = _ordinal_bin(df['chem_01'], [-np.inf, 2.0, 4.0, np.inf])

    # 2) trace_metal binned at 90 and 130 (right-closed intervals)
    df['trace_metal_range'] = _ordinal_bin(df['trace_metal'], [-np.inf, 90, 130, np.inf])

    # 3) swelling + chem_01 combined into risk_segment.
    #    Vectorized replacement for a row-wise apply; np.select honours the
    #    same rule priority (the "suspect" rule is checked before "high risk").
    chem = df['chem_01']
    swelling = df['swelling']
    is_suspect = ((swelling == 'S') & (chem >= 2.0) & (chem < 5.0)).to_numpy(dtype=bool)
    is_high_risk = ((swelling == 'Y') | (chem >= 5.0)).to_numpy(dtype=bool)
    df['risk_segment'] = np.select([is_suspect, is_high_risk], [1, 2], default=0).astype(int)

    return df


In [4]:
# %% [markdown]
# # 1. Evaluation metric & OOF utility functions

# %%
def map_macro(y_true, proba):
    """Return the macro-averaged Average Precision (mAP) of ``proba`` against ``y_true``.

    Thin wrapper that forwards both arguments to
    ``sklearn.metrics.average_precision_score`` with ``average="macro"``.
    """
    macro_ap = average_precision_score(y_true, proba, average="macro")
    return macro_ap


def get_oof_proba(model, X, y, n_splits=N_SPLITS, random_state=RANDOM_STATE, name="MODEL"):
    """Produce out-of-fold class probabilities with a shuffled StratifiedKFold.

    A fresh clone of ``model`` is fitted on every training split and its
    ``predict_proba`` output is written into the rows of the matching
    validation split. The per-fold macro mAP is printed as it is computed,
    followed by the mean of the fold scores.

    Parameters
    ----------
    model : estimator
        Anything supporting ``clone``, ``fit`` and ``predict_proba``
        (the notebook passes XGBoost / LightGBM / CatBoost classifiers).
    X : DataFrame or ndarray
        Feature matrix; converted with ``np.asarray`` before splitting.
    y : 1-D array-like
        Class labels.
    n_splits, random_state : int
        Forwarded to ``StratifiedKFold`` (shuffle is always on).
    name : str
        Tag used in the printed progress lines.

    Returns
    -------
    (oof_proba, scores)
        ``oof_proba`` has shape (n_samples, n_distinct_labels);
        ``scores`` is the list of per-fold mAP values.
    """
    features = np.asarray(X)
    labels = np.asarray(y)

    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # One probability column per distinct label seen in y.
    oof_proba = np.zeros((len(labels), len(np.unique(labels))), dtype=float)
    scores = []

    fold = 0
    for train_rows, valid_rows in splitter.split(features, labels):
        fold += 1

        # Clone so that folds never share fitted state with each other
        # or with the estimator passed in by the caller.
        estimator = clone(model)
        estimator.fit(features[train_rows], labels[train_rows])

        fold_proba = estimator.predict_proba(features[valid_rows])
        oof_proba[valid_rows] = fold_proba

        fold_score = map_macro(labels[valid_rows], fold_proba)
        print(f"[{name}] Fold {fold}/{n_splits} mAP = {fold_score:.6f}")
        scores.append(fold_score)

    # Mean of the fold scores (not a score recomputed on the pooled OOF matrix).
    print(f"[{name}] OOF mean mAP = {np.mean(scores):.6f}")
    return oof_proba, scores


In [None]:
# %% [markdown]
# # 2. Feature engineering (unified for the boosting models)

# %%
def prepare_raw(df: pd.DataFrame, reference_year: int = 2025) -> pd.DataFrame:
    """
    Light clean-up of the raw frame before feature engineering.

    - ``birth_date`` -> parsed to datetime, and ``age`` is added as
      ``reference_year - birth year``
    - ``gender``: "M"/"F" -> 0/1 (any other value becomes NaN, as per
      ``Series.map``)

    Every other column is left as is and processed by later functions.
    Both steps are skipped when their source column is absent.

    Parameters
    ----------
    df : pd.DataFrame
        Raw input; not modified (a copy is returned).
    reference_year : int, default 2025
        Year that ages are measured against. The default keeps the value
        that used to be hard-coded, so existing callers are unaffected.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with ``birth_date`` parsed, ``age`` added and
        ``gender`` encoded where applicable.
    """
    df = df.copy()

    # birth_date -> age
    if "birth_date" in df.columns:
        df["birth_date"] = pd.to_datetime(df["birth_date"])
        df["age"] = reference_year - df["birth_date"].dt.year

    # gender: M/F -> 0/1
    if "gender" in df.columns:
        df["gender"] = df["gender"].map({"M": 0, "F": 1})

    return df


def make_boosting_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build the wide engineered-feature table used by the boosting models.

    Works on a copy of ``df`` and adds, in order:

    1. ``health_index`` / ``health_per_stage`` (only when all seven lab
       columns listed below are present)
    2. pairwise products, ratios, sums and differences of lab values
    3. time / age normalised rates
    4. Y/N symptom flags encoded as 1/0, ``symptom_count``,
       ``swelling_ord`` (N/S/Y -> 0/1/2) and ``symptom_severity``
    5. rank features (average rank divided by the number of rows)
    6. ``age_group`` plus per-group diff / ratio / z-score / min-max
       statistics for disease_stage, gender and age_group
    7. per-age / per-observation ratios and balance features
    8. stage / treatment interaction features
    9. +/-inf replaced by NaN
    10. rule-based ``chem_01_range``, ``trace_metal_range`` and
        ``risk_segment`` -- created only when the column is not already in
        the frame, so encodings computed upstream (for example by
        ``add_eda_rule_features``) are preserved instead of overwritten.

    Rank and group statistics are computed from the rows of ``df`` itself,
    so they depend on which rows are passed in.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain chem_01, chem_02, trace_metal, enzyme_A, enzyme_B,
        lipid_index, blood_cells, protein_level, immune_index, obs_days,
        age, behavior_index, disease_stage, gender and treatment.
        fluid_accum, organ_enlarge, vascular_marks and swelling are optional.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with the engineered columns added.
    """
    df = df.copy()

    # 1. health_index & health_per_stage
    high_is_healthy = ["protein_level", "blood_cells"]
    low_is_healthy  = ["enzyme_A", "enzyme_B", "lipid_index", "trace_metal", "clot_time"]

    def minmax(s):
        # Scale a column by its own min/max; a constant column maps to 0.5.
        s = s.astype(float)
        mn, mx = s.min(), s.max()
        if mx == mn:
            return pd.Series(0.5, index=s.index)
        return (s - mn) / (mx - mn + 1e-6)

    if all(col in df.columns for col in high_is_healthy + low_is_healthy):
        df["health_index"] = (
            sum(minmax(df[col]) for col in high_is_healthy)
            - sum(minmax(df[col]) for col in low_is_healthy)
        )
        df["health_per_stage"] = df["health_index"] / (df["disease_stage"] + 1e-6)

    # 2. interactions / ratios / sums and differences
    df["chem01_trace_combo"]   = df["chem_01"] * df["trace_metal"]
    df["chem01_chem02_combo"]  = df["chem_01"] * df["chem_02"]
    df["chem01_enzymeB_combo"] = df["chem_01"] * df["enzyme_B"]

    # A zero denominator becomes NaN first, so the ratio is NaN rather than huge.
    df["enzyme_ratio"]       = df["enzyme_A"] / (df["enzyme_B"].replace(0, np.nan) + 1e-6)
    df["lipid_blood_ratio"]  = df["lipid_index"] / (df["blood_cells"].replace(0, np.nan) + 1e-6)
    df["chem02_trace_ratio"] = df["chem_02"] / (df["trace_metal"].replace(0, np.nan) + 1e-6)

    df["sum_chem"]   = df["chem_01"] + df["chem_02"]
    df["diff_chem"]  = df["chem_01"] - df["chem_02"]
    df["sum_enzyme"] = df["enzyme_A"] + df["enzyme_B"]
    df["diff_enzyme"] = df["enzyme_A"] - df["enzyme_B"]

    # 3. time / age normalisation
    df["obs_per_age"]      = df["obs_days"] / (df["age"] + 1e-6)
    df["behavior_per_age"] = df["behavior_index"] / (df["age"] + 1e-6)
    df["disease_velocity"] = df["disease_stage"] / (df["obs_days"] + 1e-6)

    # 4. symptom count & severity
    yn_codes = {"Y": 1, "N": 0}
    symptom_cols = ["fluid_accum", "organ_enlarge", "vascular_marks"]
    present_symptoms = [col for col in symptom_cols if col in df.columns]
    for col in present_symptoms:
        # Element-wise lookup instead of Series.replace: "Y"/"N" become 1/0,
        # every other value (including already-encoded 0/1) passes through,
        # and the replace() downcasting FutureWarning is not triggered.
        df[col] = df[col].map(lambda v: yn_codes.get(v, v))
    # Sum only the symptom columns that exist so an absent column is treated
    # as "no symptom" instead of raising KeyError.
    df["symptom_count"] = df[present_symptoms].sum(axis=1)

    # swelling: N/S/Y -> 0/1/2
    has_swelling = "swelling" in df.columns
    if has_swelling:
        df["swelling_ord"] = df["swelling"].map({"N": 0, "S": 1, "Y": 2})
    else:
        df["swelling_ord"] = 0

    df["symptom_severity"] = df["symptom_count"] + df["swelling_ord"]

    # 5. rank based
    for col in ["chem_01", "trace_metal", "enzyme_A", "enzyme_B", "lipid_index"]:
        df[f"{col}_rank"] = df[col].rank(method="average") / len(df)

    # 6. age_group & group statistics
    bins   = [60, 80, 90, 100, 130]
    labels = ["60-79", "80-89", "90-99", "100+"]

    # Left-closed bins; ages outside [60, 130) get a missing age_group.
    df["age_group"] = pd.cut(
        df["age"], bins=bins, labels=labels,
        right=False, include_lowest=True
    )

    group_cols = ['disease_stage', 'gender', 'age_group']
    agg_cols = ['enzyme_A', 'enzyme_B', 'protein_level', 'immune_index',
                'lipid_index', 'blood_cells']

    for group_col in group_cols:
        for col in agg_cols:
            # Build the grouping once per (group, column) pair and reuse it
            # for all four statistics. observed=False is today's default for
            # categorical keys; passing it explicitly silences the pandas
            # FutureWarning without changing the transform results.
            by_group = df.groupby(group_col, observed=False)[col]
            mean_val = by_group.transform('mean')
            std_val  = by_group.transform('std')
            max_val  = by_group.transform('max')
            min_val  = by_group.transform('min')

            prefix = f'{col}_by_{group_col}'
            df[f'{prefix}_diff_mean']  = df[col] - mean_val
            df[f'{prefix}_ratio_mean'] = df[col] / (mean_val + 1e-6)
            df[f'{prefix}_zscore']     = (df[col] - mean_val) / (std_val + 1e-6)
            df[f'{prefix}_minmax']     = (df[col] - min_val) / (max_val - min_val + 1e-6)

    # 7. normalised ratios & balance
    df['enzyme_A_per_age'] = df['enzyme_A'] / (df['age'] + 1)
    df['enzyme_B_per_age'] = df['enzyme_B'] / (df['age'] + 1)
    df['immune_per_age']   = df['immune_index'] / (df['age'] + 1)
    df['lipid_per_age']    = df['lipid_index'] / (df['age'] + 1)

    df['enzyme_A_per_obs'] = df['enzyme_A'] / (df['obs_days'] + 1)
    df['immune_per_obs']   = df['immune_index'] / (df['obs_days'] + 1)

    df['chem_balance'] = df['chem_01'] / (df['chem_01'] + df['chem_02'] + 1e-6)
    df['immune_behavior_balance'] = df['immune_index'] / (df['immune_index'] + df['behavior_index'] + 1e-6)

    # 8. compound interactions
    df['stage_symptom_interaction'] = df['disease_stage'] * (df['symptom_count'] + 1)
    df['stage_immune_ratio']        = df['disease_stage'] / (df['immune_index'] + 1)
    df['treatment_immune_ratio']    = df['treatment'] / (df['immune_index'] + 1)
    df['treatment_symptom_ratio']   = df['treatment'] / (df['symptom_count'] + 1)

    # 9. inf handling: +/-inf -> NaN
    df = df.replace([np.inf, -np.inf], np.nan)

    # 10. rule-based risk factors.
    # Each column is derived only when missing, so numeric encodings that an
    # earlier step already produced are not replaced by string labels.
    if 'chem_01_range' not in df.columns:
        df['chem_01_range'] = pd.cut(
            df['chem_01'],
            bins=[-np.inf, 2.0, 4.0, np.inf],
            labels=['Normal_Low', 'Warning_Zone', 'Danger_High']
        )

    # trace_metal bins: up to 90 safe, 90-130 caution, above 130 risk
    if 'trace_metal_range' not in df.columns:
        df['trace_metal_range'] = pd.cut(
            df['trace_metal'],
            bins=[-np.inf, 90, 130, np.inf],
            labels=['Safe', 'Caution', 'Risk']
        )

    # "Ambiguous symptom + warning-level lab value" composite:
    # slight swelling (S) together with chem_01 in [2, 5) is flagged as a
    # target-1 suspect; swelling Y or chem_01 >= 5 is high risk.
    if 'risk_segment' not in df.columns:
        chem = df['chem_01']
        if has_swelling:
            swelling = df['swelling']
        else:
            # No swelling information: only the chem_01 rule can fire.
            swelling = pd.Series(np.nan, index=df.index, dtype=object)
        is_suspect = ((swelling == 'S') & (chem >= 2.0) & (chem < 5.0)).to_numpy(dtype=bool)
        is_high_risk = ((swelling == 'Y') | (chem >= 5.0)).to_numpy(dtype=bool)
        # np.select checks the conditions in order, matching the original
        # if / elif / else priority of the row-wise rule.
        df['risk_segment'] = np.select(
            [is_suspect, is_high_risk],
            ['Target_1_Suspect', 'High_Risk'],
            default='Normal'
        )

    return df


def add_discrete_caution_features(X: pd.DataFrame) -> pd.DataFrame:
    """
    Add discrete flags marking candidate "caution (class 1)" zones.

    Works on a copy of ``X``. Y/N symptom columns are encoded as 1/0 when
    present, ``swelling_ord`` is derived from ``swelling`` when needed, and
    any of symptom_count, disease_stage, behavior_index, swelling_ord or
    organ_enlarge that is still missing is filled with 0, so the function
    never fails on an absent optional column.

    Columns added:

    - ``stage_mid``: disease_stage in [2, 3]
    - ``beh_mid``: behavior_index == 1
    - ``caution_symptom_zone``: 1-2 symptoms together with swelling_ord >= 1
    - ``caution_stage_beh``: stage_mid and beh_mid both set
    - ``caution_organ_only``: organ_enlarge == 1, disease_stage == 1 and
      behavior_index <= 1
    - ``caution_score``: symptom_count clipped to [0, 3] + stage_mid + beh_mid

    Returns
    -------
    pd.DataFrame
        Copy of ``X`` with the columns above added.
    """
    X = X.copy()

    yn_codes = {"Y": 1, "N": 0}
    for c in ["fluid_accum", "organ_enlarge", "vascular_marks"]:
        if c in X.columns:
            # Element-wise lookup: "Y"/"N" -> 1/0, anything else (including
            # values that are already 0/1) is passed through unchanged. This
            # avoids the Series.replace downcasting FutureWarning.
            X[c] = X[c].map(lambda v: yn_codes.get(v, v))

    if "swelling_ord" not in X.columns and "swelling" in X.columns:
        X["swelling_ord"] = X["swelling"].map({"N": 0, "S": 1, "Y": 2})

    # Default every input the rules below read. swelling_ord and
    # organ_enlarge are included because both are used unconditionally
    # further down and would otherwise raise KeyError.
    for c in ["symptom_count", "disease_stage", "behavior_index",
              "swelling_ord", "organ_enlarge"]:
        if c not in X.columns:
            X[c] = 0

    # 1) mid disease stage (2-3), mid behavior (== 1)
    X["stage_mid"] = X["disease_stage"].between(2, 3).astype(int)
    X["beh_mid"]   = (X["behavior_index"] == 1).astype(int)

    # 2) 1-2 symptoms plus swelling (>= 1): the ambiguous "not great" zone
    X["caution_symptom_zone"] = (
        X["symptom_count"].between(1, 2) & (X["swelling_ord"] >= 1)
    ).astype(int)

    # 3) disease stage 2-3 plus behavior index 1
    X["caution_stage_beh"] = (
        (X["stage_mid"] == 1) & (X["beh_mid"] == 1)
    ).astype(int)

    # 4) organ enlargement present, but stage is 1 and behavior is not severe
    X["caution_organ_only"] = (
        (X["organ_enlarge"] == 1)
        & (X["disease_stage"] == 1)
        & (X["behavior_index"] <= 1)
    ).astype(int)

    # 5) additive score
    sym_score = X["symptom_count"].clip(0, 3)
    stage_score = X["stage_mid"]
    beh_score = X["beh_mid"]

    X["caution_score"] = sym_score + stage_score + beh_score

    return X


def add_lab_qbins(X: pd.DataFrame, n_bins: int = 4) -> pd.DataFrame:
    """
    Append a quantile-bin column ``<col>_qbin`` for each numeric lab column.

    Candidate columns: chem_01, chem_02, protein_level, trace_metal,
    enzyme_A, enzyme_B, lipid_index, blood_cells, clot_time. Columns that
    are not in ``X`` are ignored. Bin codes come from ``pd.qcut`` with
    ``labels=False`` and ``duplicates="drop"``; if ``pd.qcut`` raises
    ``ValueError`` for a column, no qbin column is created for it.

    The quantile edges are computed from the rows of ``X`` itself.

    Returns
    -------
    pd.DataFrame
        Copy of ``X`` with the qbin columns added.
    """
    binned = X.copy()

    candidates = (
        "chem_01", "chem_02", "protein_level", "trace_metal",
        "enzyme_A", "enzyme_B", "lipid_index", "blood_cells", "clot_time",
    )

    for lab in candidates:
        if lab in binned.columns:
            try:
                # integer bin position per row, starting at 0
                binned[f"{lab}_qbin"] = pd.qcut(
                    binned[lab], q=n_bins, labels=False, duplicates="drop"
                )
            except ValueError:
                # best effort: leave this column without a qbin feature
                pass

    return binned


def build_features(df: pd.DataFrame, is_train: bool = True):
    """
    Full feature pipeline shared by train and test.

    Steps: raw clean-up -> boosting features -> caution features -> qbin
    features (4 bins). Identifier-like columns (index, name, birth_date,
    geo_code) are dropped when present.

    Returns ``(X, y)`` when ``is_train`` is true -- ``y`` is the ``target``
    column as an array and ``target`` is removed from ``X`` -- otherwise
    just ``X``.
    """
    staged = df.copy()
    for step in (prepare_raw, make_boosting_features, add_discrete_caution_features):
        staged = step(staged)
    staged = add_lab_qbins(staged, n_bins=4)

    id_like = ("index", "name", "birth_date", "geo_code")
    drop_cols = [col for col in id_like if col in staged.columns]

    if not is_train:
        return staged.drop(columns=drop_cols, errors="ignore")

    y = staged["target"].values
    X = staged.drop(columns=drop_cols + ["target"])
    return X, y


In [None]:
# %% [markdown]
# # 3. Data loading & full feature generation

# %%
# Data location. Defaults to the original local folder; set the DATA_DIR
# environment variable to run the notebook elsewhere without editing code.
DATA_DIR = os.environ.get("DATA_DIR", r"C:\Users\abc01\OneDrive\Î∞îÌÉï ÌôîÎ©¥")
train_path = os.path.join(DATA_DIR, "train.csv")
test_path  = os.path.join(DATA_DIR, "test.csv")

train_df = pd.read_csv(train_path)
test_df  = pd.read_csv(test_path)

# Rule-based EDA features are added to the raw frames first ...
train_df = add_eda_rule_features(train_df)
test_df  = add_eda_rule_features(test_df)

# ... then the full engineered feature matrices are built from them.
X_all, y = build_features(train_df, is_train=True)
X_all_test = build_features(test_df, is_train=False)

print("X_all shape:", X_all.shape)
print("X_all_test shape:", X_all_test.shape)


X_all shape: (6500, 143)
X_all_test shape: (1405, 143)


  df[col] = df[col].replace({"Y": 1, "N": 0})
  mean_val = df.groupby(group_col)[col].transform('mean')
  std_val  = df.groupby(group_col)[col].transform('std')
  max_val  = df.groupby(group_col)[col].transform('max')
  min_val  = df.groupby(group_col)[col].transform('min')
  mean_val = df.groupby(group_col)[col].transform('mean')
  std_val  = df.groupby(group_col)[col].transform('std')
  max_val  = df.groupby(group_col)[col].transform('max')
  min_val  = df.groupby(group_col)[col].transform('min')
  mean_val = df.groupby(group_col)[col].transform('mean')
  std_val  = df.groupby(group_col)[col].transform('std')
  max_val  = df.groupby(group_col)[col].transform('max')
  min_val  = df.groupby(group_col)[col].transform('min')
  mean_val = df.groupby(group_col)[col].transform('mean')
  std_val  = df.groupby(group_col)[col].transform('std')
  max_val  = df.groupby(group_col)[col].transform('max')
  min_val  = df.groupby(group_col)[col].transform('min')
  mean_val = df.groupby(group_col)[col

In [7]:
# %% [markdown]
# # 4. Per-model feature subsets (XGB / LGBM / CAT)

# %%
# 4-1. LGBM: the 11 "middle" features
LGBM_FEATS = [
    'obs_days',
    'chem_02',
    'enzyme_B',
    'blood_cells',
    'lipid_index',
    'clot_time',
    'chem_01',
    'trace_metal',
    'age',
    'enzyme_A',
    'health_index',
]

# Report (without failing here) any requested feature absent from the train matrix.
missing_lgb = list(filter(lambda feat: feat not in X_all.columns, LGBM_FEATS))
print("LGBM missing feats:", missing_lgb)

# Independent copies so later edits to X_all / X_all_test do not leak in.
X_lgb, X_lgb_test = (frame.loc[:, LGBM_FEATS].copy() for frame in (X_all, X_all_test))
print("X_lgb shape:", X_lgb.shape)


LGBM missing feats: []
X_lgb shape: (6500, 11)


In [8]:
# %%
# 4-2. XGB: 44 selected features
XGB_FEATS = [
    'chem01_trace_combo', 'chem_01_rank', 'chem_01',
    'stage_symptom_interaction', 'swelling_ord', 'symptom_severity',
    'obs_days', 'health_per_stage', 'gender',
    'symptom_count', 'enzyme_A_qbin', 'chem01_enzymeB_combo',
    'fluid_accum', 'disease_velocity', 'obs_per_age',
    'age', 'clot_time_qbin', 'disease_stage',
    'clot_time', 'enzyme_B_by_gender_minmax', 'chem_01_qbin',
    'trace_metal_qbin', 'enzyme_A_by_gender_zscore', 'health_index',
    'enzyme_A_rank', 'caution_symptom_zone', 'enzyme_B_qbin',
    'enzyme_A_by_gender_diff_mean', 'chem01_chem02_combo',
    'enzyme_A_by_gender_ratio_mean', 'trace_metal', 'treatment',
    'enzyme_B_by_disease_stage_zscore', 'enzyme_A_by_age_group_minmax',
    'blood_cells', 'enzyme_B_rank', 'enzyme_B',
    'diff_chem', 'blood_cells_by_age_group_zscore',
    'enzyme_B_by_gender_diff_mean', 'enzyme_A',
    'enzyme_A_by_disease_stage_minmax', 'chem_02_qbin', 'diff_enzyme',
]

# Report (without failing here) any requested feature absent from the train matrix.
missing_xgb = list(filter(lambda feat: feat not in X_all.columns, XGB_FEATS))
print("XGB missing feats:", missing_xgb)

# Independent copies so later edits to X_all / X_all_test do not leak in.
X_xgb, X_xgb_test = (frame.loc[:, XGB_FEATS].copy() for frame in (X_all, X_all_test))
print("X_xgb shape:", X_xgb.shape)


XGB missing feats: []
X_xgb shape: (6500, 44)


In [9]:
# %%  CatBoost feature setup (for the new ipynb)

CAT_FEATS = [
    'age', 'obs_days', 'clot_time', 'disease_velocity',
    'chem01_trace_combo', 'chem_01', 'chem01_enzymeB_combo',
    'chem01_chem02_combo', 'combo_chem1_trace', 'obs_per_age',
    'health_index', 'age_enzymeA', 'health_per_stage', 'lipid_per_age',
    'enzyme_A_by_gender_zscore', 'chem_01_rank', 'chem02_trace_ratio',
    'ratio_chem1_trace', 'enzyme_A_per_obs', 'trace_metal',
    'immune_per_age', 'enzyme_ratio', 'immune_per_obs', 'sum_chem',
    'chem_balance', 'enzyme_B_by_disease_stage_minmax',
    'protein_level_by_disease_stage_minmax', 'enzyme_A_per_age',
    'treatment_immune_ratio', 'enzyme_B_by_disease_stage_diff_mean',
    'diff_chem', 'enzyme_A_by_disease_stage_diff_mean',
    'lipid_blood_ratio', 'enzyme_B_per_age', 'stage_symptom_interaction',
    'lipid_index', 'blood_cells', 'chem_02', 'disease_stage',
    'trace_metal_rank', 'enzyme_B', 'enzyme_A'
]

# 1) Create the three derived features that X_all / X_all_test lack.
#    Each entry: (new column, columns it needs, how to compute it).
#    NOTE(review): combo_chem1_trace is the same product as the existing
#    chem01_trace_combo column, and both are listed in CAT_FEATS -- confirm
#    the duplication is intended.
CAT_DERIVED_FEATS = (
    ("combo_chem1_trace", {"chem_01", "trace_metal"},
     lambda frame: frame["chem_01"] * frame["trace_metal"]),
    ("ratio_chem1_trace", {"chem_01", "trace_metal"},
     lambda frame: frame["chem_01"] / (frame["trace_metal"] + 1e-3)),
    ("age_enzymeA", {"age", "enzyme_A"},
     lambda frame: frame["age"] * frame["enzyme_A"]),
)

# The frames are extended in place; an existing column is never recomputed.
for df_name, df in [("train", X_all), ("test", X_all_test)]:
    for feat_name, needed, compute in CAT_DERIVED_FEATS:
        if feat_name not in df.columns and needed <= set(df.columns):
            df[feat_name] = compute(df)
            print(f"[{df_name}] {feat_name} ÏÉùÏÑ±")

# 2) Check whether any requested feature is still missing.
missing_cat = list(filter(lambda feat: feat not in X_all.columns, CAT_FEATS))
print("CAT missing feats (after fix):", missing_cat)

# 3) Safety net: keep only the features that exist in the train matrix.
use_cat_feats = list(filter(lambda feat: feat in X_all.columns, CAT_FEATS))
print("ÏÇ¨Ïö© Í∞ÄÎä•Ìïú CAT ÌîºÏ≤ò Ïàò:", len(use_cat_feats))

X_cat, X_cat_test = (frame.loc[:, use_cat_feats].copy() for frame in (X_all, X_all_test))
print("X_cat shape:", X_cat.shape)
print("X_cat_test shape:", X_cat_test.shape)


[train] combo_chem1_trace ÏÉùÏÑ±
[train] ratio_chem1_trace ÏÉùÏÑ±
[train] age_enzymeA ÏÉùÏÑ±
[test] combo_chem1_trace ÏÉùÏÑ±
[test] ratio_chem1_trace ÏÉùÏÑ±
[test] age_enzymeA ÏÉùÏÑ±
CAT missing feats (after fix): []
ÏÇ¨Ïö© Í∞ÄÎä•Ìïú CAT ÌîºÏ≤ò Ïàò: 42
X_cat shape: (6500, 42)
X_cat_test shape: (1405, 42)


In [10]:
# %% [markdown]
# # 5. Model definitions (best params applied)

# %%
# 5-1. XGB
best_params_xgb = dict(
    max_depth=7,
    min_child_weight=9,
    gamma=0.584258780685998,
    learning_rate=0.026313469973063427,
    n_estimators=400,
    subsample=0.7263129521191727,
    colsample_bytree=0.6546519462568459,
    reg_alpha=0.7771992619497006,
    reg_lambda=0.774928257935245,
    objective="multi:softprob",
    eval_metric="mlogloss",
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

model_xgb = XGBClassifier(**best_params_xgb)


# 5-2. LGBM (the 11 "middle" features)
best_params_lgb = dict(
    num_leaves=22,
    max_depth=9,
    min_child_samples=84,
    learning_rate=0.02556093432307773,
    n_estimators=347,
    subsample=0.9121058633121536,
    colsample_bytree=0.5175827719114479,
    reg_alpha=1.5903402527689057,
    reg_lambda=1.1779816064263182,
    objective="multiclass",
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

model_lgb = LGBMClassifier(**best_params_lgb)


# 5-3. CatBoost (to be updated later with your own tuning result)
# NOTE(review): the later `cat_final` definition uses different values
# (iterations=1325, depth=3, ...) -- confirm which set is the intended one.
best_params_cat = dict(
    iterations=1439,   # placeholder: replace with the best value you found
    depth=4,
    learning_rate=0.05302188493570401,
    l2_leaf_reg=1.09403474693172,
    bagging_temperature=0.48512743870498,
    random_strength=1.5516425976084334,
    border_count=89,
    loss_function="MultiClass",
    eval_metric="MultiClass",
    random_state=RANDOM_STATE,
    thread_count=-1,
    verbose=False,
)

model_cat = CatBoostClassifier(**best_params_cat)


In [11]:
# %% [markdown]
# # 6. Generate OOF probabilities for the three models

# %%
# Only the OOF probability matrix is kept; the per-fold score list
# (second return value) is already printed by get_oof_proba.
oof_xgb = get_oof_proba(model_xgb, X_xgb, y, name="XGB")[0]
oof_lgb = get_oof_proba(model_lgb, X_lgb, y, name="LGBM")[0]
oof_cat = get_oof_proba(model_cat, X_cat, y, name="CAT")[0]

for oof_label, oof_matrix in (("oof_xgb", oof_xgb), ("oof_lgb", oof_lgb), ("oof_cat", oof_cat)):
    print(f"{oof_label} shape:", oof_matrix.shape)


[XGB] Fold 1/10 mAP = 0.691488
[XGB] Fold 2/10 mAP = 0.682504
[XGB] Fold 3/10 mAP = 0.663161
[XGB] Fold 4/10 mAP = 0.713433
[XGB] Fold 5/10 mAP = 0.759052
[XGB] Fold 6/10 mAP = 0.729224
[XGB] Fold 7/10 mAP = 0.663906
[XGB] Fold 8/10 mAP = 0.692875
[XGB] Fold 9/10 mAP = 0.766981
[XGB] Fold 10/10 mAP = 0.659785
[XGB] OOF mean mAP = 0.702241
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000269 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2585
[LightGBM] [Info] Number of data points in the train set: 5850, number of used features: 11
[LightGBM] [Info] Start training from score -0.465705
[LightGBM] [Info] Start training from score -3.356077
[LightGBM] [Info] Start training from score -1.086380
[LGBM] Fold 1/10 mAP = 0.695395
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000201 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Inf



[LGBM] Fold 2/10 mAP = 0.675049
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000224 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2587
[LightGBM] [Info] Number of data points in the train set: 5850, number of used features: 11
[LightGBM] [Info] Start training from score -0.465705
[LightGBM] [Info] Start training from score -3.356077
[LightGBM] [Info] Start training from score -1.086380








[LGBM] Fold 3/10 mAP = 0.670266
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000271 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2586
[LightGBM] [Info] Number of data points in the train set: 5850, number of used features: 11
[LightGBM] [Info] Start training from score -0.465705
[LightGBM] [Info] Start training from score -3.356077
[LightGBM] [Info] Start training from score -1.086380
[LGBM] Fold 4/10 mAP = 0.697007
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000189 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2587
[LightGBM] [Info] Number of data points in the train set: 5850, number of used features: 11
[LightGBM] [Info] Start training from score -0.465705
[LightGBM] [Info] Start training from score -3.360991
[LightGBM] [Info] Start training from score -1.085873








[LGBM] Fold 5/10 mAP = 0.727913
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000194 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2585
[LightGBM] [Info] Number of data points in the train set: 5850, number of used features: 11
[LightGBM] [Info] Start training from score -0.465705
[LightGBM] [Info] Start training from score -3.360991
[LightGBM] [Info] Start training from score -1.085873




[LGBM] Fold 6/10 mAP = 0.703766
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000199 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2585
[LightGBM] [Info] Number of data points in the train set: 5850, number of used features: 11
[LightGBM] [Info] Start training from score -0.465705
[LightGBM] [Info] Start training from score -3.360991
[LightGBM] [Info] Start training from score -1.085873




[LGBM] Fold 7/10 mAP = 0.659506
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000471 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2587
[LightGBM] [Info] Number of data points in the train set: 5850, number of used features: 11
[LightGBM] [Info] Start training from score -0.465705
[LightGBM] [Info] Start training from score -3.360991
[LightGBM] [Info] Start training from score -1.085873




[LGBM] Fold 8/10 mAP = 0.728803
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000190 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2586
[LightGBM] [Info] Number of data points in the train set: 5850, number of used features: 11
[LightGBM] [Info] Start training from score -0.465705
[LightGBM] [Info] Start training from score -3.360991
[LightGBM] [Info] Start training from score -1.085873




[LGBM] Fold 9/10 mAP = 0.764266
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000333 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2584
[LightGBM] [Info] Number of data points in the train set: 5850, number of used features: 11
[LightGBM] [Info] Start training from score -0.465705
[LightGBM] [Info] Start training from score -3.360991
[LightGBM] [Info] Start training from score -1.085873
[LGBM] Fold 10/10 mAP = 0.670588
[LGBM] OOF mean mAP = 0.699256




[CAT] Fold 1/10 mAP = 0.681933
[CAT] Fold 2/10 mAP = 0.660296
[CAT] Fold 3/10 mAP = 0.664877
[CAT] Fold 4/10 mAP = 0.716398
[CAT] Fold 5/10 mAP = 0.743040
[CAT] Fold 6/10 mAP = 0.715236
[CAT] Fold 7/10 mAP = 0.651158
[CAT] Fold 8/10 mAP = 0.684367
[CAT] Fold 9/10 mAP = 0.729313
[CAT] Fold 10/10 mAP = 0.667796
[CAT] OOF mean mAP = 0.691441
oof_xgb shape: (6500, 3)
oof_lgb shape: (6500, 3)
oof_cat shape: (6500, 3)


In [12]:
# %% [markdown]
# # 7. Ensemble weight grid search (maximize mAP)

# %%
from itertools import product

def search_ensemble_weights(
    y_true,
    oof_xgb,
    oof_lgb,
    oof_cat,
    step=0.1
):
    """Grid-search blend weights for three OOF probability matrices.

    Every raw weight triple on the grid ``0, step, ..., 1`` (values rounded
    to two decimals) is normalised to sum to 1, the three matrices are
    blended with it, and the blend is scored with ``map_macro``. The
    all-zero triple is skipped.

    Returns
    -------
    (best_w, best_score)
        ``best_w`` is the normalised (xgb, lgb, cat) weight triple with the
        highest score (the first one found wins ties); ``best_score`` is
        that score. Defaults are equal weights and -1.0.
    """
    grid = [round(x, 2) for x in np.arange(0.0, 1.0 + 1e-9, step)]

    best_w = (1/3, 1/3, 1/3)
    best_score = -1.0

    for w1, w2, w3 in product(grid, repeat=3):
        total = w1 + w2 + w3
        if total == 0:
            # cannot normalise an all-zero triple
            continue

        share_xgb = w1/total
        share_lgb = w2/total
        share_cat = w3/total

        blended = (
            share_xgb * oof_xgb +
            share_lgb * oof_lgb +
            share_cat * oof_cat
        )

        candidate = map_macro(y_true, blended)

        # strict ">" keeps the earliest triple when scores tie
        if candidate > best_score:
            best_score, best_w = candidate, (share_xgb, share_lgb, share_cat)

    return best_w, best_score


# step=0.05 gives a finer grid than the function default (takes a bit longer).
best_w, best_score = search_ensemble_weights(
    y_true=y,
    oof_xgb=oof_xgb,
    oof_lgb=oof_lgb,
    oof_cat=oof_cat,
    step=0.05,
)

print("\n=== ÏïôÏÉÅÎ∏î weight search Í≤∞Í≥º ===")
print("Best ensemble mAP:", best_score)
print("Best weights (XGB, LGBM, CAT):", best_w)



=== ÏïôÏÉÅÎ∏î weight search Í≤∞Í≥º ===
Best ensemble mAP: 0.6932049158288831
Best weights (XGB, LGBM, CAT): (0.425531914893617, 0.3191489361702127, 0.2553191489361702)


In [19]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# -----------------------------
# 0. Data / feature preparation (assumed to already exist in the kernel)
#    - train_df, test_df
#    - y
#    - X_xgb, X_xgb_test
#    - X_lgb, X_lgb_test
#    - X_cat, X_cat_test
# -----------------------------
# These objects are reused exactly as left by the earlier preprocessing and
# feature-selection steps; this cell does not rebuild them.

# Sanity check: print the train/test matrix shapes of each model's feature set.
print("X_xgb:", X_xgb.shape, "X_xgb_test:", X_xgb_test.shape)
print("X_lgb:", X_lgb.shape, "X_lgb_test:", X_lgb_test.shape)
print("X_cat:", X_cat.shape, "X_cat_test:", X_cat_test.shape)

# -----------------------------
# 1. Final hyperparameter settings for each model
# -----------------------------

# 1) XGBoost (per the original note: Optuna best for the `final_features` set)
xgb_final = XGBClassifier(
    max_depth=7,
    min_child_weight=9,
    gamma=0.584258780685998,
    learning_rate=0.026313469973063427,
    n_estimators=400,
    subsample=0.7263129521191727,
    colsample_bytree=0.6546519462568459,
    reg_alpha=0.7771992619497006,
    reg_lambda=0.774928257935245,
    objective="multi:softprob",
    num_class=3,
    eval_metric="mlogloss",
    tree_method="hist",
    n_jobs=-1,
    random_state=42
)

# 2) LightGBM (per the original note: Optuna best on the 11 "middle" features)
# NOTE(review): in LightGBM, row subsampling is only active when subsample_freq > 0,
# which is not set here — confirm whether `subsample` is meant to have an effect.
lgb_final = LGBMClassifier(
    n_estimators=347,
    num_leaves=22,
    max_depth=9,
    min_child_samples=84,
    learning_rate=0.02556093432307773,
    subsample=0.9121058633121536,
    colsample_bytree=0.5175827719114479,
    reg_alpha=1.5903402527689057,
    reg_lambda=1.1779816064263182,
    objective="multiclass",
    num_class=3,
    n_jobs=-1,
    random_state=42
)

# 3) CatBoost (per the original note: Optuna best for the `CAT_FEATS` set)
cat_final = CatBoostClassifier(
    iterations=1325,
    depth=3,
    learning_rate=0.047163049101853795,
    l2_leaf_reg=0.5524276151256475,
    bagging_temperature=1.3500372949148198,
    random_strength=1.474723113115535,
    border_count=74,
    loss_function="MultiClass",
    eval_metric="MultiClass",
    verbose=False,
    random_state=42
)

# -----------------------------
# 2. Fit each model on the full training set
# -----------------------------

# Each model is fitted on its own feature matrix against the shared target y.
print("\n[Train] XGB full fit...")
xgb_final.fit(X_xgb, y)

print("[Train] LGBM full fit...")
lgb_final.fit(X_lgb, y)

print("[Train] CAT full fit...")
cat_final.fit(X_cat, y)

# -----------------------------
# 3. Predict class probabilities on the test set
# -----------------------------

# NOTE(review): the meta-feature cell assigns the same three names
# (proba_xgb_test / proba_lgb_test / proba_cat_test), so later cells see the
# values of whichever cell ran last.
proba_xgb_test = xgb_final.predict_proba(X_xgb_test)
proba_lgb_test = lgb_final.predict_proba(X_lgb_test)
proba_cat_test = cat_final.predict_proba(X_cat_test)

# Shape check for the three probability matrices.
print("proba_xgb_test:", proba_xgb_test.shape)
print("proba_lgb_test:", proba_lgb_test.shape)
print("proba_cat_test:", proba_cat_test.shape)

# -----------------------------
# 4. Weighted ensemble (uses the best weights found by the weight search)
# -----------------------------

# Hard-coded copies of the "Best weights (XGB, LGBM, CAT)" values printed by the
# ensemble weight grid-search cell.
# NOTE(review): those weights were tuned on OOF predictions — confirm the OOF models
# used the same hyperparameters as xgb_final / lgb_final / cat_final.
w_xgb = 0.425531914893617
w_lgb = 0.3191489361702127
w_cat = 0.2553191489361702

# Weighted sum of the three models' test probability matrices.
proba_test_ens = (
    w_xgb * proba_xgb_test +
    w_lgb * proba_lgb_test +
    w_cat * proba_cat_test
)

# Final predicted class (argmax). Not used by the rest of this cell, which
# writes out only the probabilities.
pred_test = np.argmax(proba_test_ens, axis=1)

# -----------------------------
# 5. Save CSV (probabilities only; per the original note, submission.csv on the Desktop)
# -----------------------------

# proba_test_ens: (n_samples, 3)  <- weighted ensemble probabilities
proba_df = pd.DataFrame(
    proba_test_ens,
    columns=["prob_class_0", "prob_class_1", "prob_class_2"]
)

# Prepend the test-set "index" column as the first column.
proba_df.insert(0, "index", test_df["index"].values)

# NOTE(review): absolute, user-specific path. The 2-model ensemble cell writes to
# the very same file, so whichever cell runs last determines its contents.
output_path = r"C:\Users\abc01\OneDrive\Î∞îÌÉï ÌôîÎ©¥\submission.csv"
# utf-8-sig writes a UTF-8 byte-order mark at the start of the file.
proba_df.to_csv(output_path, index=False, encoding="utf-8-sig")

print(f"‚úÖ ÌôïÎ•† Ï†úÏ∂ú ÌååÏùº Ï†ÄÏû• ÏôÑÎ£å: {output_path}")




X_xgb: (6500, 44) X_xgb_test: (1405, 44)
X_lgb: (6500, 11) X_lgb_test: (1405, 11)
X_cat: (6500, 42) X_cat_test: (1405, 42)

[Train] XGB full fit...
[Train] LGBM full fit...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000216 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2587
[LightGBM] [Info] Number of data points in the train set: 6500, number of used features: 11
[LightGBM] [Info] Start training from score -0.465705
[LightGBM] [Info] Start training from score -3.359022
[LightGBM] [Info] Start training from score -1.086076
[Train] CAT full fit...
proba_xgb_test: (1405, 3)
proba_lgb_test: (1405, 3)
proba_cat_test: (1405, 3)
‚úÖ ÌôïÎ•† Ï†úÏ∂ú ÌååÏùº Ï†ÄÏû• ÏôÑÎ£å: C:\Users\abc01\OneDrive\Î∞îÌÉï ÌôîÎ©¥\submission.csv


In [14]:
# %% [markdown]
# # 1. OOF probabilities of the 3 base models (no weights)

# %%
# Out-of-fold probabilities for each base model, all with the same 10-fold /
# seed-42 settings. get_oof_proba returns a pair; only its first element
# (the OOF matrix) is kept here.
print("===== XGB OOF (no weight) =====")
oof_xgb, _ = get_oof_proba(model_xgb, X_xgb, y, n_splits=10, random_state=42, name="XGB")

print("\n===== LGBM OOF (no weight) =====")
oof_lgb, _ = get_oof_proba(model_lgb, X_lgb, y, n_splits=10, random_state=42, name="LGBM")

print("\n===== CAT OOF (no weight) =====")
oof_cat, _ = get_oof_proba(model_cat, X_cat, y, n_splits=10, random_state=42, name="CAT")

# Shape check of the three OOF matrices.
print("oof_xgb:", oof_xgb.shape)
print("oof_lgb:", oof_lgb.shape)
print("oof_cat:", oof_cat.shape)


===== XGB OOF (no weight) =====
[XGB] Fold 1/10 mAP = 0.691488
[XGB] Fold 2/10 mAP = 0.682504
[XGB] Fold 3/10 mAP = 0.663161
[XGB] Fold 4/10 mAP = 0.713433
[XGB] Fold 5/10 mAP = 0.759052
[XGB] Fold 6/10 mAP = 0.729224
[XGB] Fold 7/10 mAP = 0.663906
[XGB] Fold 8/10 mAP = 0.692875
[XGB] Fold 9/10 mAP = 0.766981
[XGB] Fold 10/10 mAP = 0.659785
[XGB] OOF mean mAP = 0.702241

===== LGBM OOF (no weight) =====
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000197 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2585
[LightGBM] [Info] Number of data points in the train set: 5850, number of used features: 11
[LightGBM] [Info] Start training from score -0.465705
[LightGBM] [Info] Start training from score -3.356077
[LightGBM] [Info] Start training from score -1.086380




[LGBM] Fold 1/10 mAP = 0.695395
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000212 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2588
[LightGBM] [Info] Number of data points in the train set: 5850, number of used features: 11
[LightGBM] [Info] Start training from score -0.465705
[LightGBM] [Info] Start training from score -3.356077
[LightGBM] [Info] Start training from score -1.086380
[LGBM] Fold 2/10 mAP = 0.675049
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000240 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2587
[LightGBM] [Info] Number of data points in the train set: 5850, number of used features: 11
[LightGBM] [Info] Start training from score -0.465705
[LightGBM] [Info] Start training from score -3.356077
[LightGBM] [Info] Start training from score -1.086380








[LGBM] Fold 3/10 mAP = 0.670266
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000245 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2586
[LightGBM] [Info] Number of data points in the train set: 5850, number of used features: 11
[LightGBM] [Info] Start training from score -0.465705
[LightGBM] [Info] Start training from score -3.356077
[LightGBM] [Info] Start training from score -1.086380




[LGBM] Fold 4/10 mAP = 0.697007
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000209 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2587
[LightGBM] [Info] Number of data points in the train set: 5850, number of used features: 11
[LightGBM] [Info] Start training from score -0.465705
[LightGBM] [Info] Start training from score -3.360991
[LightGBM] [Info] Start training from score -1.085873




[LGBM] Fold 5/10 mAP = 0.727913
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000217 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2585
[LightGBM] [Info] Number of data points in the train set: 5850, number of used features: 11
[LightGBM] [Info] Start training from score -0.465705
[LightGBM] [Info] Start training from score -3.360991
[LightGBM] [Info] Start training from score -1.085873
[LGBM] Fold 6/10 mAP = 0.703766
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000436 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2585
[LightGBM] [Info] Number of data points in the train set: 5850, number of used features: 11
[LightGBM] [Info] Start training from score -0.465705
[LightGBM] [Info] Start training from score -3.360991
[LightGBM] [Info] Start training from score -1.085873








[LGBM] Fold 7/10 mAP = 0.659506
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000437 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2587
[LightGBM] [Info] Number of data points in the train set: 5850, number of used features: 11
[LightGBM] [Info] Start training from score -0.465705
[LightGBM] [Info] Start training from score -3.360991
[LightGBM] [Info] Start training from score -1.085873
[LGBM] Fold 8/10 mAP = 0.728803
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000201 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2586
[LightGBM] [Info] Number of data points in the train set: 5850, number of used features: 11
[LightGBM] [Info] Start training from score -0.465705
[LightGBM] [Info] Start training from score -3.360991
[LightGBM] [Info] Start training from score -1.085873








[LGBM] Fold 9/10 mAP = 0.764266
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000199 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2584
[LightGBM] [Info] Number of data points in the train set: 5850, number of used features: 11
[LightGBM] [Info] Start training from score -0.465705
[LightGBM] [Info] Start training from score -3.360991
[LightGBM] [Info] Start training from score -1.085873




[LGBM] Fold 10/10 mAP = 0.670588
[LGBM] OOF mean mAP = 0.699256

===== CAT OOF (no weight) =====
[CAT] Fold 1/10 mAP = 0.681933
[CAT] Fold 2/10 mAP = 0.660296
[CAT] Fold 3/10 mAP = 0.664877
[CAT] Fold 4/10 mAP = 0.716398
[CAT] Fold 5/10 mAP = 0.743040
[CAT] Fold 6/10 mAP = 0.715236
[CAT] Fold 7/10 mAP = 0.651158
[CAT] Fold 8/10 mAP = 0.684367
[CAT] Fold 9/10 mAP = 0.729313
[CAT] Fold 10/10 mAP = 0.667796
[CAT] OOF mean mAP = 0.691441
oof_xgb: (6500, 3)
oof_lgb: (6500, 3)
oof_cat: (6500, 3)


In [15]:
# %% [markdown]
# # 2. Build meta-input features (Z_train, Z_test)

# %%
import numpy as np
from sklearn.base import clone

# 2-1) Meta-features for the training rows: the three OOF probability
#      matrices placed side by side.
Z_train = np.hstack((oof_xgb, oof_lgb, oof_cat))
print("Z_train shape:", Z_train.shape)  # (n_samples, 9) expected

# 2-2) Refit unfitted copies of the base models on the full training set,
#      then predict test probabilities to build the matching test meta-features.
final_xgb = clone(model_xgb)
final_lgb = clone(model_lgb)
final_cat = clone(model_cat)

final_xgb.fit(X_xgb, y)
final_lgb.fit(X_lgb, y)
final_cat.fit(X_cat, y)

proba_xgb_test = final_xgb.predict_proba(X_xgb_test)
proba_lgb_test = final_lgb.predict_proba(X_lgb_test)
proba_cat_test = final_cat.predict_proba(X_cat_test)

Z_test = np.hstack((proba_xgb_test, proba_lgb_test, proba_cat_test))
print("Z_test shape:", Z_test.shape)


Z_train shape: (6500, 9)
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000281 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2587
[LightGBM] [Info] Number of data points in the train set: 6500, number of used features: 11
[LightGBM] [Info] Start training from score -0.465705
[LightGBM] [Info] Start training from score -3.359022
[LightGBM] [Info] Start training from score -1.086076
Z_test shape: (1405, 9)


In [16]:
import numpy as np
from sklearn.metrics import average_precision_score

def search_best_pair_weight(oof_a, oof_b, y, name_a="A", name_b="B", n_grid=101):
    """
    Grid-search the blend weight ``w`` in ``w * oof_a + (1 - w) * oof_b``.

    Each candidate blend is scored with
    ``average_precision_score(y, blend, average="macro")`` and the weight with
    the highest score is kept (the smallest such weight on ties). A one-line
    summary of the winner is printed.

    Parameters
    ----------
    oof_a, oof_b : array-like of identical shape
        OOF probabilities of the two models, (n_samples, n_classes).
    y : array-like
        True labels, passed unchanged to ``average_precision_score``.
    name_a, name_b : str
        Labels used only in the printed summary.
    n_grid : int
        Number of evenly spaced weights tried on [0, 1]
        (101 -> 0.00, 0.01, ..., 1.00).

    Returns
    -------
    (best_w, best_score, history)
        ``best_w`` is the weight applied to ``oof_a``; ``history`` is the list
        of every ``(w, score)`` pair in the order evaluated.

    Raises
    ------
    ValueError
        If ``n_grid`` is smaller than 1, if the two prediction arrays differ
        in shape, or if no candidate produced a comparable (non-NaN) score.
    """
    if n_grid < 1:
        raise ValueError(f"n_grid must be at least 1, got {n_grid!r}")

    # Reject mismatched inputs explicitly: shapes such as (n, k) and (1, k)
    # would otherwise broadcast and be blended without any error.
    shape_a, shape_b = np.shape(oof_a), np.shape(oof_b)
    if shape_a != shape_b:
        raise ValueError(
            f"{name_a} and {name_b} predictions must have the same shape, "
            f"got {shape_a} and {shape_b}"
        )

    best_w = None
    best_score = -1.0
    history = []

    ws = np.linspace(0, 1, n_grid)
    for w in ws:
        proba = w * oof_a + (1 - w) * oof_b
        score = average_precision_score(y, proba, average="macro")
        history.append((w, score))
        # To trace the search, print here:
        # print(f"{name_a}={w:.3f}, {name_b}={1-w:.3f}, mAP={score:.6f}")
        if score > best_score:
            best_score = score
            best_w = w

    if best_w is None:
        # Only reachable when every score failed the comparison (e.g. NaN).
        raise ValueError("no candidate weight produced a valid score")

    print(f"\nüîé [{name_a} + {name_b}] best mAP = {best_score:.6f}, "
          f"weights=({name_a}={best_w:.3f}, {name_b}={1-best_w:.3f})")

    return best_w, best_score, history


In [17]:
# Already available in the kernel:
# oof_xgb, oof_lgb, oof_cat
# y

# Search the best blend weight for each of the three model pairs.
# Each call returns (weight of the first model, best OOF mAP, full search history).
w_xl, sc_xl, hist_xl = search_best_pair_weight(oof_xgb, oof_lgb, y, "XGB", "LGBM")
w_xc, sc_xc, hist_xc = search_best_pair_weight(oof_xgb, oof_cat, y, "XGB", "CAT")
w_lc, sc_lc, hist_lc = search_best_pair_weight(oof_lgb, oof_cat, y, "LGBM", "CAT")

# Side-by-side summary: score and (first / second) weights of each pair.
print("\n===== 2-Î™®Îç∏ ÏïôÏÉÅÎ∏î ÎπÑÍµê =====")
print(f"XGB + LGBM : {sc_xl:.6f} (w={w_xl:.3f}/{1-w_xl:.3f})")
print(f"XGB + CAT  : {sc_xc:.6f} (w={w_xc:.3f}/{1-w_xc:.3f})")
print(f"LGBM + CAT : {sc_lc:.6f} (w={w_lc:.3f}/{1-w_lc:.3f})")



üîé [XGB + LGBM] best mAP = 0.691253, weights=(XGB=0.500, LGBM=0.500)

üîé [XGB + CAT] best mAP = 0.691218, weights=(XGB=0.670, CAT=0.330)

üîé [LGBM + CAT] best mAP = 0.690198, weights=(LGBM=0.510, CAT=0.490)

===== 2-Î™®Îç∏ ÏïôÏÉÅÎ∏î ÎπÑÍµê =====
XGB + LGBM : 0.691253 (w=0.500/0.500)
XGB + CAT  : 0.691218 (w=0.670/0.330)
LGBM + CAT : 0.690198 (w=0.510/0.490)


In [20]:
# Already-available test probabilities:
# proba_xgb_test, proba_lgb_test, proba_cat_test
# test_df (contains the index column)

# 1) Automatically pick the best-scoring pair.
# Each entry maps a pair code to (best OOF mAP, weight of the pair's first model).
scores_pairs = {
    "XL": (sc_xl, w_xl),  # XGB + LGBM
    "XC": (sc_xc, w_xc),  # XGB + CAT
    "LC": (sc_lc, w_lc),  # LGBM + CAT
}

# Pair code with the highest OOF mAP (first one wins on ties).
best_key = max(scores_pairs.keys(), key=lambda k: scores_pairs[k][0])
# NOTE(review): this rebinds the notebook-level names best_score / best_w that the
# 3-model weight-search cell also assigns (there best_w is a 3-tuple, here a scalar).
best_score, best_w = scores_pairs[best_key]
print(f"\nüî• ÏµúÏ¢Ö ÏÑ†ÌÉùÎêú Ï°∞Ìï©: {best_key}, mAP={best_score:.6f}, w={best_w:.3f}/{1-best_w:.3f}")

# 2) Blend the test probabilities of the selected pair.
# best_w weights the first model of the pair and (1 - best_w) the second,
# the same convention search_best_pair_weight uses for its OOF blend.
if best_key == "XL":
    proba_test_ens_2 = best_w * proba_xgb_test + (1 - best_w) * proba_lgb_test
elif best_key == "XC":
    proba_test_ens_2 = best_w * proba_xgb_test + (1 - best_w) * proba_cat_test
elif best_key == "LC":
    proba_test_ens_2 = best_w * proba_lgb_test + (1 - best_w) * proba_cat_test
else:
    # Defensive only: best_key is always one of the scores_pairs keys.
    raise ValueError("Ïïå Ïàò ÏóÜÎäî Ï°∞Ìï© ÌÇ§:", best_key)

print("proba_test_ens_2 shape:", proba_test_ens_2.shape)

# 3) Build the probability submission file (index + 3 probability columns).
proba_df_2 = pd.DataFrame(
    proba_test_ens_2,
    columns=["prob_class_0", "prob_class_1", "prob_class_2"]
)
# Prepend the test-set "index" column as the first column.
proba_df_2.insert(0, "index", test_df["index"].values)

# NOTE(review): absolute, user-specific path, identical to the one the 3-model
# ensemble cell writes to — running this cell overwrites that submission file.
output_path = r"C:\Users\abc01\OneDrive\Î∞îÌÉï ÌôîÎ©¥\submission.csv"
# utf-8-sig writes a UTF-8 byte-order mark at the start of the file.
proba_df_2.to_csv(output_path, index=False, encoding="utf-8-sig")
print(f"‚úÖ 2-Î™®Îç∏ ÏïôÏÉÅÎ∏î proba Ï†ÄÏû• ÏôÑÎ£å: {output_path}")



üî• ÏµúÏ¢Ö ÏÑ†ÌÉùÎêú Ï°∞Ìï©: XL, mAP=0.691253, w=0.500/0.500
proba_test_ens_2 shape: (1405, 3)
‚úÖ 2-Î™®Îç∏ ÏïôÏÉÅÎ∏î proba Ï†ÄÏû• ÏôÑÎ£å: C:\Users\abc01\OneDrive\Î∞îÌÉï ÌôîÎ©¥\submission.csv


ÏõêÎ≥∏ train: (6500, 24) test: (1405, 23)


NameError: name 'preprocess_all' is not defined