In [1]:
# Patrick

# 1. Import Library

In [99]:
import math
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from xgboost import XGBClassifier
from collections import Counter
import torch, gc
import shutil
import joblib

torch.cuda.empty_cache()
gc.collect()
use_gpu = shutil.which("nvidia-smi") is not None
print("Using GPU" if use_gpu else "Using CPU")

RANDOM_SEED = 1024
DATA_PATH = "../dataset.csv"
OUT_MODEL = 'output/xgb_model.json'
OUT_CSV = 'output/confusion_matrix.csv'
OUT_LE = "output/label_encoder.pkl"

Using GPU


# 2. Data Preprocess

In [83]:
# Remove Duplicate
df = (
    pd.read_csv(DATA_PATH)
      .drop_duplicates()
      .reset_index(drop=True)
)
len(df)

189647

In [84]:
# Features / Label
X = df.drop("diseases", axis=1)
y_raw = df["diseases"].str.strip()
le = LabelEncoder().fit(y_raw)
y = pd.Series(le.transform(y_raw), index=X.index)
y.head()

0    531
1    531
2    531
3    531
4    531
dtype: int64

In [85]:
# Find Rare Cases
counts = Counter(y)
common_classes = [cls for cls, cnt in counts.items() if cnt >= 2]
rare_classes   = [cls for cls, cnt in counts.items() if cnt <  2]

mask_common = np.isin(y, common_classes)
mask_rare   = ~mask_common

X_common, y_common = X[mask_common], y[mask_common]
X_rare,   y_rare   = X[mask_rare],   y[mask_rare]

In [86]:
# Test
X_comm_train_val, X_test, y_comm_train_val, y_test = train_test_split(
    X_common, y_common,
    test_size=0.2,
    stratify=y_common,
    random_state=RANDOM_SEED
)

In [87]:
# Rare Cases -> Train/Val
X_train_val = np.concatenate([X_comm_train_val, X_rare], axis=0)
y_train_val = np.concatenate([y_comm_train_val, y_rare],   axis=0)

# Random
perm = np.random.RandomState(RANDOM_SEED).permutation(len(y_train_val))
X_train_val = X_train_val[perm]
y_train_val = y_train_val[perm]

In [88]:
# Val
mask_tv_common = np.isin(y_train_val, common_classes)
X_tv_common    = X_train_val[mask_tv_common]
y_tv_common    = y_train_val[mask_tv_common]

X_train_comm, X_val, y_train_comm, y_val = train_test_split(
    X_tv_common, y_tv_common,
    test_size=0.1,
    stratify=y_tv_common,
    random_state=RANDOM_SEED
)

In [89]:
# Train
mask_tv_rare = ~mask_tv_common
X_tv_rare    = X_train_val[mask_tv_rare]
y_tv_rare    = y_train_val[mask_tv_rare]

X_train = np.concatenate([X_train_comm, X_tv_rare], axis=0)
y_train = np.concatenate([y_train_comm, y_tv_rare],   axis=0)

print(f"Train: {len(X_train)} | Val: {len(X_val)} | Test: {len(X_test)}")

Train: 136558 | Val: 15168 | Test: 37921


# 3. Training

In [90]:
def lr_scheduler(round_idx,
                 base_lr=0.03,
                 min_lr=0.003,
                 warmup_rounds=500,
                 total_rounds=15000):
    if round_idx < warmup_rounds:
        return base_lr * ( round_idx+1 )/ warmup_rounds

    decay_rounds = max(1, total_rounds - warmup_rounds)
    progress  = (round_idx - warmup_rounds) / decay_rounds
    return min_lr + 0.5 * (base_lr - min_lr) * (1 + math.cos(progress * math.pi))

In [91]:
params = {
    # basics
    "objective": "multi:softprob",
    "num_class": len(le.classes_),
    "eval_metric": ["mlogloss", "merror"],
    "random_state": RANDOM_SEED,

    # Tree
    "tree_method": "hist",
    'grow_policy': 'lossguide',
    "min_child_weight": 5,
    "max_depth": 0,
    "max_leaves": 256,
    "gamma": 1.0,

    # Reg
    'reg_alpha': 2,
    'reg_lambda': 1,

    # Sub
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "colsample_bylevel": 0.8,
    "colsample_bynode": 0.8,
    "sampling_method": "gradient_based",

    # LR
    "learning_rate": lambda i:lr_scheduler(i),
    "n_estimators": 15000,
    "early_stopping_rounds": 100,

    # Others
    "device": "cuda" if use_gpu else "cpu",
    "n_jobs": -1,
    'max_delta_step': 1,
}

clf = XGBClassifier(**params)
clf.fit(X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=True)

[0]	validation_0-mlogloss:6.27906	validation_0-merror:0.98332
[1]	validation_0-mlogloss:5.91147	validation_0-merror:0.94686
[2]	validation_0-mlogloss:5.54523	validation_0-merror:0.88898
[3]	validation_0-mlogloss:5.18421	validation_0-merror:0.82997
[4]	validation_0-mlogloss:4.83033	validation_0-merror:0.72877
[5]	validation_0-mlogloss:4.48068	validation_0-merror:0.62803
[6]	validation_0-mlogloss:4.13957	validation_0-merror:0.53198
[7]	validation_0-mlogloss:3.80746	validation_0-merror:0.44429
[8]	validation_0-mlogloss:3.48644	validation_0-merror:0.37652
[9]	validation_0-mlogloss:3.17640	validation_0-merror:0.33617
[10]	validation_0-mlogloss:2.87845	validation_0-merror:0.30083
[11]	validation_0-mlogloss:2.59870	validation_0-merror:0.27848
[12]	validation_0-mlogloss:2.33616	validation_0-merror:0.25943
[13]	validation_0-mlogloss:2.09201	validation_0-merror:0.24512
[14]	validation_0-mlogloss:1.87341	validation_0-merror:0.23800
[15]	validation_0-mlogloss:1.67788	validation_0-merror:0.22811
[1

In [95]:
clf.save_model(OUT_MODEL)
joblib.dump(le, OUT_LE)

['output/label_encoder.pkl']

# 4. Evaluation

In [96]:
test_clf = XGBClassifier()
test_clf.load_model("output/xgb_model.json")
test_clf.set_params(tree_method='hist',
               device='cuda' if use_gpu else 'cpu')

y_pred = test_clf.predict(X_test)
test_labels = np.unique(y_test)

print("Classification Report (Test Set)")
print(classification_report(
    y_test, y_pred,
    labels=test_labels,
    target_names=le.inverse_transform(test_labels),
    digits=3,
    zero_division=0
))
print("Accuracy:", accuracy_score(y_test, y_pred))
# 813

Classification Report (Test Set)
                                                          precision    recall  f1-score   support

                               abdominal aortic aneurysm      0.667     0.500     0.571         8
                                        abdominal hernia      0.939     0.868     0.902        53
                                         abscess of nose      0.905     0.633     0.745        30
                                     abscess of the lung      0.000     0.000     0.000         1
                                  abscess of the pharynx      0.818     0.818     0.818        33
                                    acanthosis nigricans      0.000     0.000     0.000         2
                                               acariasis      0.000     0.000     0.000         2
                                               achalasia      1.000     0.200     0.333         5
                                                    acne      0.656     0.656     0.

In [97]:
print("\nConfusion Matrix")
cm = confusion_matrix(y_test, y_pred, labels=test_labels)
cm_df = pd.DataFrame(
    cm,
    index=le.inverse_transform(test_labels),
    columns=le.inverse_transform(test_labels)
)
cm_df.to_csv(OUT_CSV, index=True, encoding="utf-8-sig")


Confusion Matrix
