In [1]:
# Patrick

# 1. Import Libraries

In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from xgboost import XGBClassifier
from collections import Counter

# Single seed reused by the train/test splits, the shuffle permutation and the
# XGBoost model so that a re-run reproduces the same partitions.
RANDOM_SEED = 1024
# CSV read into `df`; it must contain a "diseases" column, which is the label.
DATA_PATH = "../dataset.csv"

# 2. Data Preprocessing

In [26]:
# Remove Duplicate
# Read the CSV and drop exact duplicate rows; ignore_index=True renumbers the
# remaining rows 0..n-1 (same effect as reset_index(drop=True)).
df = pd.read_csv(DATA_PATH).drop_duplicates(ignore_index=True)
len(df)

189647

In [27]:
# Features / Label
# The "diseases" column is the target; every other column is a feature.
y_raw = df["diseases"].str.strip()
X = df.drop(columns="diseases")

# Map the whitespace-stripped disease names to integer class ids, keeping the
# same row index as X so boolean masks apply to both.
le = LabelEncoder()
y = pd.Series(le.fit_transform(y_raw), index=X.index)
y.head()

0    531
1    531
2    531
3    531
4    531
dtype: int64

In [28]:
# Find Rare Cases
# Classes with fewer than 2 samples are "rare"; all others are "common".
# The two groups are separated here so the common ones can be split with
# stratification in the following cells.
counts = Counter(y)
common_classes = [cls for cls in counts if counts[cls] >= 2]
rare_classes = [cls for cls in counts if counts[cls] < 2]

# Row-level boolean masks over y (and therefore over X).
mask_common = np.isin(y, common_classes)
mask_rare = np.logical_not(mask_common)

X_common, X_rare = X[mask_common], X[mask_rare]
y_common, y_rare = y[mask_common], y[mask_rare]

In [29]:
# Test
# Hold out 20% of the common-class rows as the test set, stratified by class
# and seeded for repeatability.
common_split = train_test_split(
    X_common,
    y_common,
    stratify=y_common,
    test_size=0.2,
    random_state=RANDOM_SEED,
)
X_comm_train_val, X_test, y_comm_train_val, y_test = common_split

In [30]:
# Rare Cases -> Train/Val
# Rare-class rows were kept out of the test split; append them to the
# train/val pool. np.concatenate yields plain NumPy arrays from here on.
X_train_val = np.concatenate((X_comm_train_val, X_rare))
y_train_val = np.concatenate((y_comm_train_val, y_rare))

# Random
# Shuffle features and labels with one shared, seeded permutation so rows and
# labels stay aligned.
perm = np.random.RandomState(RANDOM_SEED).permutation(y_train_val.shape[0])
X_train_val, y_train_val = X_train_val[perm], y_train_val[perm]

In [31]:
# Val
# Decide which rows can take part in the stratified train/val split by
# counting classes *inside the train/val pool*, not on the full dataset.
# A class that was common overall (e.g. exactly 2 samples) may have lost a
# sample to the test split; with a single remaining sample, stratify= would
# raise ValueError. Such rows are left out of this split and end up in the
# rows selected by ~mask_tv_common. When every common class still has >= 2
# samples here, the mask is identical to np.isin(y_train_val, common_classes).
tv_classes, tv_counts = np.unique(y_train_val, return_counts=True)
mask_tv_common = np.isin(y_train_val, tv_classes[tv_counts >= 2])
X_tv_common    = X_train_val[mask_tv_common]
y_tv_common    = y_train_val[mask_tv_common]

# 10% of the stratifiable train/val rows become the validation set.
X_train_comm, X_val, y_train_comm, y_val = train_test_split(
    X_tv_common, y_tv_common,
    test_size=0.1,
    stratify=y_tv_common,
    random_state=RANDOM_SEED
)

In [32]:
# Train
# Rows that were excluded from the validation split (mask_tv_common is False)
# go to the training set only.
mask_tv_rare = np.logical_not(mask_tv_common)
X_tv_rare, y_tv_rare = X_train_val[mask_tv_rare], y_train_val[mask_tv_rare]

X_train = np.concatenate((X_train_comm, X_tv_rare))
y_train = np.concatenate((y_train_comm, y_tv_rare))

print(f"Train: {len(X_train)} | Val: {len(X_val)} | Test: {len(X_test)}")

Train: 136558 | Val: 15168 | Test: 37921


# 3. Training

In [33]:
import shutil
import joblib

# Treat the presence of the `nvidia-smi` executable on PATH as the signal that
# a CUDA device can be used; otherwise fall back to CPU training.
use_gpu = shutil.which("nvidia-smi") is not None
print("Using GPU" if use_gpu else "Using CPU")

params = {
    # --- Task ---------------------------------------------------------------
    "objective": "multi:softprob",
    "num_class": len(le.classes_),
    "eval_metric": "mlogloss",
    "random_state": RANDOM_SEED,
    # --- Hardware -----------------------------------------------------------
    "tree_method": "hist",
    "device": "cuda" if use_gpu else "cpu",
    "n_jobs": -1,
    # --- Tree shape: leaf-wise growth bounded by leaf count, not by depth ----
    "grow_policy": "lossguide",
    "max_depth": 0,  # 0 = no depth limit; max_leaves is the effective bound
    "max_leaves": 96,
    "min_child_weight": 3,
    "gamma": 0.2,
    # --- Regularisation -----------------------------------------------------
    # Only the scikit-learn names are kept: "alpha"/"lambda" are aliases of
    # reg_alpha/reg_lambda, and passing both spellings was redundant.
    "reg_alpha": 0.5,
    "reg_lambda": 1.0,
    # --- Row / column sampling ----------------------------------------------
    "subsample": 0.8,
    # XGBoost documents gradient-based sampling as supported only with
    # tree_method="hist" on a CUDA device, so the CPU fallback uses the
    # default uniform sampling instead of failing at fit time.
    "sampling_method": "gradient_based" if use_gpu else "uniform",
    "colsample_bytree": 0.8,
    "colsample_bylevel": 0.8,
    "colsample_bynode": 0.8,
    # --- Boosting schedule --------------------------------------------------
    "learning_rate": 0.05,
    "n_estimators": 5000,  # upper bound; early stopping picks the real count
    "early_stopping_rounds": 100,
}

clf = XGBClassifier(**params)
# The validation set drives early stopping via eval_set.
clf.fit(X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=True)

Using GPU
[0]	validation_0-mlogloss:5.36793
[1]	validation_0-mlogloss:4.66762
[2]	validation_0-mlogloss:4.21964
[3]	validation_0-mlogloss:3.90163
[4]	validation_0-mlogloss:3.64727
[5]	validation_0-mlogloss:3.43128
[6]	validation_0-mlogloss:3.25041
[7]	validation_0-mlogloss:3.09232
[8]	validation_0-mlogloss:2.95518
[9]	validation_0-mlogloss:2.83120
[10]	validation_0-mlogloss:2.72175
[11]	validation_0-mlogloss:2.62054
[12]	validation_0-mlogloss:2.52577
[13]	validation_0-mlogloss:2.43685
[14]	validation_0-mlogloss:2.35618
[15]	validation_0-mlogloss:2.28206
[16]	validation_0-mlogloss:2.21162
[17]	validation_0-mlogloss:2.14684
[18]	validation_0-mlogloss:2.08628
[19]	validation_0-mlogloss:2.02964
[20]	validation_0-mlogloss:1.97556
[21]	validation_0-mlogloss:1.92450
[22]	validation_0-mlogloss:1.87619
[23]	validation_0-mlogloss:1.83064
[24]	validation_0-mlogloss:1.78709
[25]	validation_0-mlogloss:1.74563
[26]	validation_0-mlogloss:1.70583
[27]	validation_0-mlogloss:1.66781
[28]	validation_0-ml

In [37]:
import os

# Create the artifact directory first: on a fresh checkout "output/" may not
# exist, and both save calls below would then fail with a missing-path error.
os.makedirs("output", exist_ok=True)

# Persist the trained booster and the fitted label encoder side by side so
# predictions can later be mapped back to disease names.
clf.save_model("output/xgb_model.json")
joblib.dump(le, "output/label_encoder.pkl")

['output/label_encoder.pkl']

# 4. Evaluation

In [38]:
# Reload the saved model into a fresh estimator so the evaluation uses the
# persisted artifact rather than the in-memory `clf`.
test_clf = XGBClassifier()
test_clf.load_model("output/xgb_model.json")
test_clf.set_params(
    tree_method='hist',
    device='cuda' if use_gpu else 'cpu',
)

y_pred = test_clf.predict(X_test)

# Restrict the report to classes that actually occur in the test labels and
# show them under their original (decoded) names.
test_labels = np.unique(y_test)
test_report = classification_report(
    y_test,
    y_pred,
    labels=test_labels,
    target_names=le.inverse_transform(test_labels),
    digits=3,
    zero_division=0,
)

print("Classification Report (Test Set)")
print(test_report)
print("Accuracy:", accuracy_score(y_test, y_pred))

Classification Report (Test Set)
                                                          precision    recall  f1-score   support

                               abdominal aortic aneurysm      0.857     0.750     0.800         8
                                        abdominal hernia      0.923     0.906     0.914        53
                                         abscess of nose      0.864     0.633     0.731        30
                                     abscess of the lung      0.000     0.000     0.000         1
                                  abscess of the pharynx      0.722     0.788     0.754        33
                                    acanthosis nigricans      0.000     0.000     0.000         2
                                               acariasis      0.000     0.000     0.000         2
                                               achalasia      0.500     0.400     0.444         5
                                                    acne      0.582     0.639     0.

In [39]:
print("\nConfusion Matrix")
# Rows are true classes and columns are predicted classes, both limited to the
# classes present in the test labels and shown under their decoded names.
cm = confusion_matrix(y_test, y_pred, labels=test_labels)
cm_df = pd.DataFrame(cm)
cm_df.index = le.inverse_transform(test_labels)
cm_df.columns = le.inverse_transform(test_labels)

# The matrix is written to CSV rather than displayed in the notebook.
cm_df.to_csv("output/confusion_matrix.csv", index=True, encoding="utf-8-sig")


Confusion Matrix
