# XGBoost Model for Short MedSynth Dataset

In [1]:
import joblib
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, f1_score
from xgboost import XGBClassifier



In [2]:
df = pd.read_csv('../data/medsynth_short.csv')

df

Unnamed: 0,Note,Dialogue,ICD10,ICD10_desc,ICD_chapter
0,**1. Subjective:**\n\n**Chief Complaint (CC):*...,"[doctor]: Hello, how are you doing today?\n\n[...",K5900,"CONSTIPATION, UNSPECIFIED",Diseases of the digestive system
1,**1. Subjective:**\n\n**Chief Complaint (CC):*...,"[doctor] Good morning, how are you doing today...",K5900,"CONSTIPATION, UNSPECIFIED",Diseases of the digestive system
2,**1. Subjective:**\n\n**Chief Complaint (CC):*...,"[doctor]: Hi, good morning. How are you today?...",K5900,"CONSTIPATION, UNSPECIFIED",Diseases of the digestive system
3,**1. Subjective:**\n\n**Chief Complaint (CC):*...,"[doctor] Hi Mrs. Park, how are you today?\n\n[...",K5900,"CONSTIPATION, UNSPECIFIED",Diseases of the digestive system
4,#####\n**1. Subjective**\n\n**CHIEF COMPLAINT*...,"[doctor] Hi, how are you doing today? What bri...",K5900,"CONSTIPATION, UNSPECIFIED",Diseases of the digestive system
...,...,...,...,...,...
2265,### MEDICAL NOTE\n\n#### 1. Subjective:\n\n**C...,"[doctor]: Good afternoon, John. How are you do...",I519,"HEART DISEASE, UNSPECIFIED",Diseases of the circulatory system
2266,#####\n**1. Subjective:**\n\n**Chief Complaint...,"[doctor]: Good morning, how are you today?\n\n...",I519,"HEART DISEASE, UNSPECIFIED",Diseases of the circulatory system
2267,**1. Subjective:**\n\n**Chief Complaint (CC):*...,"[doctor] Hello, it's good to see you today. I ...",I519,"HEART DISEASE, UNSPECIFIED",Diseases of the circulatory system
2268,**1. Subjective:**\n\n**Chief Complaint (CC):*...,"[doctor]: Hello, Ms. Johnson. How are you toda...",I519,"HEART DISEASE, UNSPECIFIED",Diseases of the circulatory system


In [4]:
X = df["Dialogue"]

le = LabelEncoder()
y = le.fit_transform(df["ICD_chapter"].astype(str))
class_names = list(le.classes_)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

vectorizer = TfidfVectorizer(
    ngram_range=(1,2),
    min_df=2,           # bump to 2+ for larger corpora; keep 1 for small samples
    max_df=0.9,
    strip_accents="unicode",
    lowercase=True,
    max_features=20000
)

xgb = XGBClassifier(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    objective='multi:softprob' if len(np.unique(y)) > 2 else 'binary:logistic',
    random_state=42,
    n_jobs=-1,
    tree_method="hist"  # 'gpu_hist' if you have GPU
)

In [4]:
pipe = Pipeline([
    ("tfidf", vectorizer),
    ("xgb", xgb)
])

pipe.fit(X_train, y_train)

joblib.dump(pipe, "ex1_xgboost_short.joblib")

['ex1_xgboost_short.joblib']

In [5]:
pipe = joblib.load("ex1_xgboost_short.joblib")

y_pred = pipe.predict(X_test)

print(classification_report(
    y_test, y_pred, target_names=class_names, zero_division=0
))

print("Macro F1:", f"{f1_score(y_test, y_pred, average='macro', zero_division=0):.3f}")

                                              precision    recall  f1-score   support

          Diseases of the circulatory system       0.96      0.93      0.95       179
            Diseases of the digestive system       0.94      0.96      0.95       159
          Diseases of the respiratory system       0.94      0.97      0.96       133
Diseases of the skin and subcutaneous tissue       0.99      0.97      0.98        97

                                    accuracy                           0.96       568
                                   macro avg       0.96      0.96      0.96       568
                                weighted avg       0.96      0.96      0.96       568

Macro F1: 0.959


In [6]:
from xaicompare.runner import publish_run
publish_run(
    model=pipe,
    X_test=X_test,                          # raw texts if pipeline contains TF-IDF
    y_test=y_test,                          # optional
    raw_text=X_test,                        # so the dashboard can show the note
    class_names=getattr(pipe.named_steps["xgb"], "classes_", None),
    run_dir="../../runs/ex1_xgboost_short2",
    config={"batch_size": 2,                # tiny batches to avoid OOM
            "rows_limit_global": 200,       # compute global on first 200 rows
            "rows_limit_local": 200}, # store local top-k for first 200 rows
)


Running XAI methods:   0%|          | 0/1 [00:00<?, ?it/s]

[shap_tree] Global importance:   0%|          | 0/1 [00:00<?, ?it/s]

[WARN] Feature mismatch: SHAP=19999, TFIDF=20000. Truncating both to 19999.


[shap_tree] Local explanations:   0%|          | 0/200 [00:00<?, ?it/s]