In [1]:
# Patrick

# 1. Import Libraries

In [2]:
import shutil
from collections import Counter
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import xgboost
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# Single seed reused for every split, shuffle and model random_state below,
# so that a fresh run reproduces the same partitions.
RANDOM_SEED: int = 1024
# Location of the input CSV, relative to the notebook's working directory.
DATA_PATH: str = "../../DATA/new_dataset.csv"

# 2. Data Preprocessing

In [3]:
# Load the dataset and remove duplicate rows.
# The label column is whitespace-stripped *before* de-duplicating: rows that
# differ only by stray spaces around the disease name would otherwise survive
# drop_duplicates() and could end up on both sides of a train/test split.
# NOTE(review): whether such whitespace variants actually occur in the CSV is
# not visible here; if they do not, the extra step is a no-op.
df = (
    pd.read_csv(DATA_PATH)
      .assign(diseases=lambda raw: raw["diseases"].str.strip())
      .drop_duplicates()
      .reset_index(drop=True)
)
len(df)

189647

In [4]:
# Features / Label
# The "diseases" column is the target; every other column is a feature.
y_raw = df["diseases"].str.strip()
X = df.drop(columns="diseases")

# Encode disease names as integer class ids; `le` is kept so that ids can be
# mapped back to names when reporting.
le = LabelEncoder()
le.fit(y_raw)
y = pd.Series(le.transform(y_raw), index=X.index)
y.head()

0    531
1    531
2    531
3    531
4    531
dtype: int64

In [5]:
# Find Rare Cases
# A class is "common" when it has at least 2 samples, and "rare" otherwise
# (i.e. it occurs exactly once).
counts = Counter(y)
common_classes, rare_classes = [], []
for cls, cnt in counts.items():
    bucket = common_classes if cnt >= 2 else rare_classes
    bucket.append(cls)

# Row-level masks and the corresponding feature/label partitions.
mask_common = np.isin(y, common_classes)
mask_rare = ~mask_common

X_common, X_rare = X[mask_common], X[mask_rare]
y_common, y_rare = y[mask_common], y[mask_rare]

In [6]:
# Test
# Hold out 20% of the common-class samples as the test set, stratified by
# label. Rare-class samples take no part in this split.
held_out = train_test_split(
    X_common,
    y_common,
    stratify=y_common,
    test_size=0.2,
    random_state=RANDOM_SEED,
)
X_comm_train_val, X_test, y_comm_train_val, y_test = held_out

In [7]:
# Rare Cases -> Train/Val
# Pool the non-test common samples with all rare-class samples. Going through
# np.concatenate means both results are plain NumPy arrays from here on.
X_train_val = np.concatenate((X_comm_train_val, X_rare), axis=0)
y_train_val = np.concatenate((y_comm_train_val, y_rare), axis=0)

# Random
# Shuffle features and labels with one shared, seeded permutation so the
# rows stay aligned.
shuffler = np.random.RandomState(RANDOM_SEED)
perm = shuffler.permutation(len(y_train_val))
X_train_val, y_train_val = X_train_val[perm], y_train_val[perm]

In [8]:
# Val
# Carve a stratified 10% validation set out of the train/val pool.
#
# Only classes that are flagged common AND still have at least 2 samples in
# the pool are stratified. A class with exactly 2 samples overall can lose one
# of them to the test split, and train_test_split raises ValueError when a
# stratification class has a single member. Samples outside this mask are not
# dropped: the next cell adds everything in ~mask_tv_common to the training set.
tv_classes, tv_counts = np.unique(y_train_val, return_counts=True)
stratifiable_classes = np.intersect1d(tv_classes[tv_counts >= 2], common_classes)

mask_tv_common = np.isin(y_train_val, stratifiable_classes)
X_tv_common    = X_train_val[mask_tv_common]
y_tv_common    = y_train_val[mask_tv_common]

X_train_comm, X_val, y_train_comm, y_val = train_test_split(
    X_tv_common, y_tv_common,
    test_size=0.1,
    stratify=y_tv_common,
    random_state=RANDOM_SEED
)

In [9]:
# Train
# Everything in the pool that was not eligible for the validation split is
# used for training only.
mask_tv_rare = ~mask_tv_common
X_tv_rare, y_tv_rare = X_train_val[mask_tv_rare], y_train_val[mask_tv_rare]

# Final training set: training part of the validation split, followed by the
# train-only samples.
X_train = np.concatenate((X_train_comm, X_tv_rare), axis=0)
y_train = np.concatenate((y_train_comm, y_tv_rare), axis=0)

print(f"Train: {len(X_train)} | Val: {len(X_val)} | Test: {len(X_test)}")

Train: 136558 | Val: 15168 | Test: 37921


# 3. Training

In [10]:
# Train a multi-class XGBoost classifier on the training set, monitoring
# mlogloss on the validation set, then persist the model and label encoder.

# Create the output directory up front: on a fresh checkout the save calls
# below would otherwise fail only after the whole training run has finished.
Path("output").mkdir(parents=True, exist_ok=True)

# The presence of nvidia-smi on PATH is used as a cheap proxy for a usable GPU.
use_gpu = shutil.which("nvidia-smi") is not None
print("Using GPU" if use_gpu else "Using CPU")

params = {
    "objective": "multi:softprob",
    "num_class": len(le.classes_),
    "eval_metric": "mlogloss",
    "random_state": RANDOM_SEED,
    "tree_method": "hist",
    "n_jobs": -1,
    # Leaf-wise growth: depth is left unbounded (0) and tree size is capped
    # through max_leaves instead.
    "grow_policy": "lossguide",
    "max_depth": 0,
    "max_leaves": 96,
    # Regularisation. Only the reg_* spellings are kept; the original dict
    # also passed the "alpha"/"lambda" aliases with the same values.
    "reg_alpha": 0.5,
    "reg_lambda": 1.0,
    "min_child_weight": 3,
    "gamma": 0.2,
    # Row and column subsampling.
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "colsample_bylevel": 0.8,
    "colsample_bynode": 0.8,
    "learning_rate": 0.05,
    # NOTE(review): early_stopping_rounds (200) exceeds n_estimators (20), so
    # early stopping can never trigger, and the validation loss of the last
    # recorded run was still falling at the final round. Consider raising
    # n_estimators when a longer training budget is acceptable.
    "n_estimators": 20,
    "early_stopping_rounds": 200,
}

# The "device" parameter only exists from XGBoost 2.0 on. Older releases
# ignore it with a "Parameters: { "device" } are not used" warning and select
# the GPU through tree_method="gpu_hist" instead.
xgb_major = int(xgboost.__version__.split(".")[0])
if xgb_major >= 2:
    params["device"] = "cuda" if use_gpu else "cpu"
elif use_gpu:
    params["tree_method"] = "gpu_hist"

clf = XGBClassifier(**params)
clf.fit(X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=True)

# Persist the model (native JSON and pickled estimator) and the label encoder
# needed to map predicted class ids back to disease names.
clf.save_model("output/xgb_model.json")
joblib.dump(clf, "output/xgb_model.pkl")
joblib.dump(le, "output/label_encoder.pkl")

Using CPU
Parameters: { "device" } are not used.

[0]	validation_0-mlogloss:5.60611
[1]	validation_0-mlogloss:4.90012
[2]	validation_0-mlogloss:4.43750
[3]	validation_0-mlogloss:4.09327
[4]	validation_0-mlogloss:3.82975
[5]	validation_0-mlogloss:3.60948
[6]	validation_0-mlogloss:3.41638
[7]	validation_0-mlogloss:3.24738
[8]	validation_0-mlogloss:3.10134
[9]	validation_0-mlogloss:2.96931
[10]	validation_0-mlogloss:2.85278
[11]	validation_0-mlogloss:2.74820
[12]	validation_0-mlogloss:2.65205
[13]	validation_0-mlogloss:2.56398
[14]	validation_0-mlogloss:2.48037
[15]	validation_0-mlogloss:2.40400
[16]	validation_0-mlogloss:2.33215
[17]	validation_0-mlogloss:2.26396
[18]	validation_0-mlogloss:2.19989
[19]	validation_0-mlogloss:2.13992


['output/label_encoder.pkl']

# 4. Evaluation

In [11]:
# Predict on the held-out test set.
y_pred = clf.predict(X_test)

# Report only on classes that occur in the test set, labelled with their
# original disease names.
test_labels = np.unique(y_test)
test_label_names = le.inverse_transform(test_labels)

print("Classification Report (Test Set)")
report = classification_report(
    y_test,
    y_pred,
    labels=test_labels,
    target_names=test_label_names,
    digits=3,
    zero_division=0,
)
print(report)
print("Accuracy:", accuracy_score(y_test, y_pred))

Classification Report (Test Set)
                                                          precision    recall  f1-score   support

                               abdominal aortic aneurysm      0.000     0.000     0.000         8
                                        abdominal hernia      0.951     0.736     0.830        53
                                         abscess of nose      1.000     0.433     0.605        30
                                     abscess of the lung      0.000     0.000     0.000         1
                                  abscess of the pharynx      0.846     0.667     0.746        33
                                    acanthosis nigricans      0.000     0.000     0.000         2
                                               acariasis      0.000     0.000     0.000         2
                                               achalasia      0.000     0.000     0.000         5
                                                    acne      0.685     0.607     0.

In [12]:
print("\nConfusion Matrix")
# Index the matrix by every class that occurs in y_test OR y_pred.
# confusion_matrix ignores samples whose true or predicted label is not in
# `labels`, so restricting it to the test-set classes would silently drop
# test samples predicted as a class that is absent from the test set.
# Classes that are only ever predicted show up as all-zero rows.
cm_labels = np.union1d(y_test, y_pred)
cm_names = le.inverse_transform(cm_labels)

cm = confusion_matrix(y_test, y_pred, labels=cm_labels)
cm_df = pd.DataFrame(cm, index=cm_names, columns=cm_names)
# utf-8-sig writes a BOM at the start of the file.
cm_df.to_csv("output/confusion_matrix.csv", index=True, encoding="utf-8-sig")


Confusion Matrix
