# XGBoost Model for MedSynth Dataset

In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from transformers import AutoTokenizer, AutoModel
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from xgboost import XGBClassifier

In [2]:
df = pd.read_csv('../../data/medsynth/MedSynth_huggingface_final.csv')

df

Unnamed: 0,Note,Dialogue,ICD10,ICD10_desc
0,**1. Subjective:**\n\n **Chief Complaint (CC...,[doctor]: Hello! It’s good to see you today. H...,M25562,PAIN IN LEFT KNEE
1,**1. Subjective:**\n\n - **Chief Complaint (...,"[doctor] Hi there, how are you today?\n\n[pati...",M25562,PAIN IN LEFT KNEE
2,**1. Subjective:**\n\n**Chief Complaint (CC):*...,"[doctor] Good morning, how are you doing today...",M25562,PAIN IN LEFT KNEE
3,**1. Subjective:**\n\n**Chief Complaint (CC):*...,[doctor] Good morning! How are you feeling tod...,M25562,PAIN IN LEFT KNEE
4,#####\n**1. Subjective:**\n\n**Chief Complaint...,"[doctor]: Hello Mr. Doe, how are you doing tod...",M25562,PAIN IN LEFT KNEE
...,...,...,...,...
10235,#####\n**1. Subjective:**\n \n**Chief Compla...,[doctor]: Good morning. How are you doing toda...,B3781,CANDIDAL ESOPHAGITIS
10236,### Gastroenterologist Medical Note\n\n#### 1....,"**Doctor:** Hi there, how are you doing today?...",B3781,CANDIDAL ESOPHAGITIS
10237,**1. Subjective:**\n\n**Chief Complaint (CC):*...,"[doctor]: Hi Mr. Harris, how are you doing tod...",B3781,CANDIDAL ESOPHAGITIS
10238,#####\n**1. Subjective:**\n**Chief Complaint (...,"[doctor]: Good morning, Ms. Lee. How are you d...",B3781,CANDIDAL ESOPHAGITIS


In [3]:

ICD10_CHAPTERS = {
    "A": ("A00", "B99", "Certain infectious and parasitic diseases"),
    "B": ("A00", "B99", "Certain infectious and parasitic diseases"),
    "C": ("C00", "D49", "Neoplasms"),
    "D": ("C00", "D49", "Neoplasms"),  # D00–D49 = neoplasms
    "E": ("E00", "E89", "Endocrine, nutritional and metabolic diseases"),
    "F": ("F01", "F99", "Mental, behavioral and neurodevelopmental disorders"),
    "G": ("G00", "G99", "Diseases of the nervous system"),
    "H": ("H00", "H95", "Diseases of eye/ear/adnexa/mastoid"),  # H00–H95 split but ok as one area
    "I": ("I00", "I99", "Diseases of the circulatory system"),
    "J": ("J00", "J99", "Diseases of the respiratory system"),
    "K": ("K00", "K95", "Diseases of the digestive system"),
    "L": ("L00", "L99", "Diseases of the skin and subcutaneous tissue"),
    "M": ("M00", "M99", "Diseases of musculoskeletal system"),
    "N": ("N00", "N99", "Diseases of the genitourinary system"),
    "O": ("O00", "O9A", "Pregnancy, childbirth and puerperium"),
    "P": ("P00", "P96", "Perinatal conditions"),
    "Q": ("Q00", "Q99", "Congenital malformations"),
    "R": ("R00", "R99", "Symptoms, signs, abnormal findings"),
    "S": ("S00", "T88", "Injury and poisoning"),
    "T": ("S00", "T88", "Injury and poisoning"),
    "V": ("V00", "Y99", "External causes of morbidity"),
    "W": ("V00", "Y99", "External causes of morbidity"),
    "X": ("V00", "Y99", "External causes of morbidity"),
    "Y": ("V00", "Y99", "External causes of morbidity"),
    "Z": ("Z00", "Z99", "Factors influencing health status"),
    "U": ("U00", "U85", "Special purposes")
}

def map_icd10_to_chapter(code):
    """Return official ICD-10 chapter name from any ICD-10 code."""
    if pd.isna(code):
        return None
    code = str(code).strip()
    first_letter = code[0].upper()

    if first_letter in ICD10_CHAPTERS:
        return ICD10_CHAPTERS[first_letter][2]  # return chapter name
    return "Unknown"


In [4]:

df = df.dropna(subset=["Dialogue"]).copy()

df["Dialogue"].apply(type).value_counts()


df["ICD_chapter"] = df["ICD10"].apply(map_icd10_to_chapter)
df["ICD_chapter"].value_counts()


ICD_chapter
Factors influencing health status                      1394
Diseases of musculoskeletal system                     1155
Symptoms, signs, abnormal findings                     1110
Neoplasms                                               900
Diseases of the circulatory system                      715
Diseases of the digestive system                        635
Diseases of the genitourinary system                    570
Injury and poisoning                                    535
Diseases of the respiratory system                      530
Mental, behavioral and neurodevelopmental disorders     500
Diseases of eye/ear/adnexa/mastoid                      435
Endocrine, nutritional and metabolic diseases           405
Diseases of the skin and subcutaneous tissue            390
Diseases of the nervous system                          355
Pregnancy, childbirth and puerperium                    324
Certain infectious and parasitic diseases               225
Perinatal conditions        

In [5]:
X = df["Dialogue"]

le = LabelEncoder()
y = le.fit_transform(df["ICD_chapter"].astype(str))
class_names = list(le.classes_)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)



In [6]:
vectorizer = TfidfVectorizer(
    ngram_range=(1,2),
    min_df=2,           # bump to 2+ for larger corpora; keep 1 for small samples
    max_df=0.9,
    strip_accents="unicode",
    lowercase=True
)

In [7]:
xgb = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    objective='multi:softprob' if len(np.unique(y)) > 2 else 'binary:logistic',
    random_state=42,
    n_jobs=-1,
    tree_method="hist"  # 'gpu_hist' if you have GPU
)

In [8]:
pipe = Pipeline([
    ("tfidf", vectorizer),
    ("xgb", xgb)
])

pipe.fit(X_train, y_train)

0,1,2
,"steps  steps: list of tuples List of (name of step, estimator) tuples that are to be chained in sequential order. To be compatible with the scikit-learn API, all steps must define `fit`. All non-last steps must also define `transform`. See :ref:`Combining Estimators ` for more details.","[('tfidf', ...), ('xgb', ...)]"
,"transform_input  transform_input: list of str, default=None The names of the :term:`metadata` parameters that should be transformed by the pipeline before passing it to the step consuming it. This enables transforming some input arguments to ``fit`` (other than ``X``) to be transformed by the steps of the pipeline up to the step which requires them. Requirement is defined via :ref:`metadata routing `. For instance, this can be used to pass a validation set through the pipeline. You can only set this if metadata routing is enabled, which you can enable using ``sklearn.set_config(enable_metadata_routing=True)``. .. versionadded:: 1.6",
,"memory  memory: str or object with the joblib.Memory interface, default=None Used to cache the fitted transformers of the pipeline. The last step will never be cached, even if it is a transformer. By default, no caching is performed. If a string is given, it is the path to the caching directory. Enabling caching triggers a clone of the transformers before fitting. Therefore, the transformer instance given to the pipeline cannot be inspected directly. Use the attribute ``named_steps`` or ``steps`` to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. See :ref:`sphx_glr_auto_examples_neighbors_plot_caching_nearest_neighbors.py` for an example on how to enable caching.",
,"verbose  verbose: bool, default=False If True, the time elapsed while fitting each step will be printed as it is completed.",False

0,1,2
,"input  input: {'filename', 'file', 'content'}, default='content' - If `'filename'`, the sequence passed as an argument to fit is  expected to be a list of filenames that need reading to fetch  the raw content to analyze. - If `'file'`, the sequence items must have a 'read' method (file-like  object) that is called to fetch the bytes in memory. - If `'content'`, the input is expected to be a sequence of items that  can be of type string or byte.",'content'
,"encoding  encoding: str, default='utf-8' If bytes or files are given to analyze, this encoding is used to decode.",'utf-8'
,"decode_error  decode_error: {'strict', 'ignore', 'replace'}, default='strict' Instruction on what to do if a byte sequence is given to analyze that contains characters not of the given `encoding`. By default, it is 'strict', meaning that a UnicodeDecodeError will be raised. Other values are 'ignore' and 'replace'.",'strict'
,"strip_accents  strip_accents: {'ascii', 'unicode'} or callable, default=None Remove accents and perform other character normalization during the preprocessing step. 'ascii' is a fast method that only works on characters that have a direct ASCII mapping. 'unicode' is a slightly slower method that works on any characters. None (default) means no character normalization is performed. Both 'ascii' and 'unicode' use NFKD normalization from :func:`unicodedata.normalize`.",'unicode'
,"lowercase  lowercase: bool, default=True Convert all characters to lowercase before tokenizing.",True
,"preprocessor  preprocessor: callable, default=None Override the preprocessing (string transformation) stage while preserving the tokenizing and n-grams generation steps. Only applies if ``analyzer`` is not callable.",
,"tokenizer  tokenizer: callable, default=None Override the string tokenization step while preserving the preprocessing and n-grams generation steps. Only applies if ``analyzer == 'word'``.",
,"analyzer  analyzer: {'word', 'char', 'char_wb'} or callable, default='word' Whether the feature should be made of word or character n-grams. Option 'char_wb' creates character n-grams only from text inside word boundaries; n-grams at the edges of words are padded with space. If a callable is passed it is used to extract the sequence of features out of the raw, unprocessed input. .. versionchanged:: 0.21  Since v0.21, if ``input`` is ``'filename'`` or ``'file'``, the data  is first read from the file and then passed to the given callable  analyzer.",'word'
,"stop_words  stop_words: {'english'}, list, default=None If a string, it is passed to _check_stop_list and the appropriate stop list is returned. 'english' is currently the only supported string value. There are several known issues with 'english' and you should consider an alternative (see :ref:`stop_words`). If a list, that list is assumed to contain stop words, all of which will be removed from the resulting tokens. Only applies if ``analyzer == 'word'``. If None, no stop words will be used. In this case, setting `max_df` to a higher value, such as in the range (0.7, 1.0), can automatically detect and filter stop words based on intra corpus document frequency of terms.",
,"token_pattern  token_pattern: str, default=r""(?u)\\b\\w\\w+\\b"" Regular expression denoting what constitutes a ""token"", only used if ``analyzer == 'word'``. The default regexp selects tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always treated as a token separator). If there is a capturing group in token_pattern then the captured group content, not the entire match, becomes the token. At most one capturing group is permitted.",'(?u)\\b\\w\\w+\\b'

0,1,2
,"objective  objective: typing.Union[str, xgboost.sklearn._SklObjWProto, typing.Callable[[typing.Any, typing.Any], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType] Specify the learning task and the corresponding learning objective or a custom objective function to be used. For custom objective, see :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more information, along with the end note for function signatures.",'multi:softprob'
,"base_score  base_score: typing.Union[float, typing.List[float], NoneType] The initial prediction score of all instances, global bias.",
,booster,
,"callbacks  callbacks: typing.Optional[typing.List[xgboost.callback.TrainingCallback]] List of callback functions that are applied at end of each iteration. It is possible to use predefined callbacks by using :ref:`Callback API `. .. note::  States in callback are not preserved during training, which means callback  objects can not be reused for multiple training sessions without  reinitialization or deepcopy. .. code-block:: python  for params in parameters_grid:  # be sure to (re)initialize the callbacks before each run  callbacks = [xgb.callback.LearningRateScheduler(custom_rates)]  reg = xgboost.XGBRegressor(**params, callbacks=callbacks)  reg.fit(X, y)",
,colsample_bylevel  colsample_bylevel: typing.Optional[float] Subsample ratio of columns for each level.,
,colsample_bynode  colsample_bynode: typing.Optional[float] Subsample ratio of columns for each split.,
,colsample_bytree  colsample_bytree: typing.Optional[float] Subsample ratio of columns when constructing each tree.,0.9
,"device  device: typing.Optional[str] .. versionadded:: 2.0.0 Device ordinal, available options are `cpu`, `cuda`, and `gpu`.",
,"early_stopping_rounds  early_stopping_rounds: typing.Optional[int] .. versionadded:: 1.6.0 - Activates early stopping. Validation metric needs to improve at least once in  every **early_stopping_rounds** round(s) to continue training. Requires at  least one item in **eval_set** in :py:meth:`fit`. - If early stopping occurs, the model will have two additional attributes:  :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the  :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal  number of trees during inference. If users want to access the full model  (including trees built after early stopping), they can specify the  `iteration_range` in these inference methods. In addition, other utilities  like model plotting can also use the entire model. - If you prefer to discard the trees after `best_iteration`, consider using the  callback function :py:class:`xgboost.callback.EarlyStopping`. - If there's more than one item in **eval_set**, the last entry will be used for  early stopping. If there's more than one metric in **eval_metric**, the last  metric will be used for early stopping.",
,enable_categorical  enable_categorical: bool See the same parameter of :py:class:`DMatrix` for details.,False


In [9]:
import joblib

joblib.dump(pipe, "tfidf_xgb_pipeline.joblib")


['tfidf_xgb_pipeline.joblib']

In [None]:

import joblib

pipe = joblib.load("tfidf_xgb_pipeline.joblib")


In [14]:

from sklearn.metrics import classification_report, f1_score

y_pred = pipe.predict(X_test)

print(classification_report(
    y_test, y_pred, target_names=class_names, zero_division=0
))

print("Macro F1:", f"{f1_score(y_test, y_pred, average='macro', zero_division=0):.3f}")


                                                     precision    recall  f1-score   support

          Certain infectious and parasitic diseases       0.71      0.36      0.48        56
                           Congenital malformations       1.00      0.33      0.50         6
                 Diseases of eye/ear/adnexa/mastoid       0.94      0.93      0.93       109
                 Diseases of musculoskeletal system       0.77      0.88      0.82       289
                 Diseases of the circulatory system       0.76      0.83      0.79       179
                   Diseases of the digestive system       0.71      0.80      0.75       159
               Diseases of the genitourinary system       0.71      0.80      0.75       143
                     Diseases of the nervous system       0.60      0.43      0.50        89
                 Diseases of the respiratory system       0.69      0.77      0.73       132
       Diseases of the skin and subcutaneous tissue       0.85      0

In [12]:
tfidf_fitted = pipe.named_steps["tfidf"]
xgb_fitted   = pipe.named_steps["xgb"]

X_train_tfidf = tfidf_fitted.transform(X_train)
X_test_tfidf  = tfidf_fitted.transform(X_test)

In [18]:

import numpy as np
import shap

# 1) Pull fitted components
tfidf = pipe.named_steps["tfidf"]
xgb   = pipe.named_steps["xgb"]

X_train_tfidf = tfidf.transform(X_train)
X_test_tfidf  = tfidf.transform(X_test)
feature_names = tfidf.get_feature_names_out()
n_features_vec = len(feature_names)

# 2) Build sparse-friendly explainer (no background, avoid dense conversions)
explainer = shap.TreeExplainer(xgb, feature_perturbation="tree_path_dependent")

# 3) Helper: normalize SHAP output to (batch, n_features_model) of |SHAP| aggregated over classes
def normalize_shap_abs(sv):
    """
    sv can be:
      - list of (batch, n_features+1) arrays (one per class)
      - 2-D array (batch, n_features+1)
      - 3-D array (batch, n_features+1, n_classes)
    Returns:
      abs_sv_2d: (batch, n_features_model)  [bias dropped, classes aggregated by sum of abs]
      n_features_model: int
    """
    if isinstance(sv, list):
        # Multi-class: list of arrays (batch, n_features+1)
        # Drop bias and aggregate abs across classes
        parts = []
        for svc in sv:
            # svc: (batch, n_features+1)
            parts.append(np.abs(svc[:, :-1]))  # drop bias
        abs_sv_2d = np.sum(parts, axis=0)      # (batch, n_features)
        n_features_model = abs_sv_2d.shape[1]
        return abs_sv_2d, n_features_model

    sv = np.asarray(sv)
    if sv.ndim == 2:
        # Binary/regression: (batch, n_features+1)
        abs_sv_2d = np.abs(sv[:, :-1])         # drop bias
        n_features_model = abs_sv_2d.shape[1]
        return abs_sv_2d, n_features_model

    if sv.ndim == 3:
        # Multi-class (single array): (batch, n_features+1, n_classes)
        abs_no_bias = np.abs(sv[:, :-1, :])    # (batch, n_features, n_classes)
        abs_sv_2d = abs_no_bias.sum(axis=2)    # aggregate classes -> (batch, n_features)
        n_features_model = abs_sv_2d.shape[1]
        return abs_sv_2d, n_features_model

    raise ValueError(f"Unexpected SHAP output shape: {sv.shape}")

# 4) Batch iterator
def iter_batches(X, batch_size=2):
    n = X.shape[0]
    for start in range(0, n, batch_size):
        end = min(n, start + batch_size)
        yield X[start:end], (start, end)

# 5) Streamed global |SHAP| mean (memory-safe)
global_abs_sum = None
n_rows_accum = 0
rows_limit_for_global = min(200, X_test_tfidf.shape[0])  # adjust as needed (more rows = smoother)

processed = 0
for Xb, (s, e) in iter_batches(X_test_tfidf[:rows_limit_for_global], batch_size=2):
    # SAFE: small batch, sparse input
    sv = explainer.shap_values(Xb, check_additivity=False, approximate=True)

    abs_sv_2d, n_features_model = normalize_shap_abs(sv)

    # Initialize accumulator on first batch, using model-derived feature count
    if global_abs_sum is None:
        global_abs_sum = np.zeros(n_features_model, dtype=np.float64)

        # Align feature names with model feature count if there is a mismatch
        if n_features_model != n_features_vec:
            print(f"[WARN] TF-IDF features = {n_features_vec}, model SHAP features = {n_features_model}. "
                  f"Aligning to min(...) to avoid shape issues.")
        # Compute alignment range
        align = min(n_features_model, n_features_vec)
        feature_slice = slice(0, align)

    # Accumulate with alignment (drop any trailing extra columns if mismatched)
    batch_sum = abs_sv_2d.sum(axis=0)  # (n_features_model,)
    global_abs_sum[feature_slice] += batch_sum[feature_slice]
    n_rows_accum += abs_sv_2d.shape[0]
    processed += abs_sv_2d.shape[0]
    print(processed)

print(f"Global SHAP processed rows: {processed}")




[WARN] TF-IDF features = 146937, model SHAP features = 146936. Aligning to min(...) to avoid shape issues.
2
4
6
8
10
12
14
16
18
20
22
24
26
28
30
32
34
36
38
40
42
44
46
48
50
52
54
56
58
60
62
64
66
68
70
72
74
76
78
80
82
84
86
88
90
92
94
96
98
100
102
104
106
108
110
112
114
116
118
120
122
124
126
128
130
132
134
136
138
140
142
144
146
148
150
152
154
156
158
160
162
164
166
168
170
172
174
176
178
180
182
184
186
188
190
192
194
196
198
200
Global SHAP processed rows: 200


In [20]:

# 6) Compute mean |SHAP| and get top-N global tokens
if n_rows_accum > 0:
    mean_abs = global_abs_sum / n_rows_accum
    # If we aligned, trim feature_names too
    if len(mean_abs) != len(feature_names):
        align = min(len(mean_abs), len(feature_names))
        mean_abs = mean_abs[:align]
        feature_names = feature_names[:align]

    top_n = 50
    top_idx = np.argsort(mean_abs)[-top_n:][::-1]
    print("\nTop global tokens by mean |SHAP|:")
    for tok, score in zip(feature_names[top_idx], mean_abs[top_idx]):
        print(f"{tok:30s} {score:.6f}")



Top global tokens by mean |SHAP|:
the pain                       2.197726
pain                           2.070043
infection                      1.742091
fetal                          1.332738
baby                           1.229038
months                         1.168629
skin                           1.090589
pregnancy                      1.020281
eye                            0.909348
abdomen                        0.905463
pressure                       0.877501
levels                         0.857622
echocardiogram                 0.777601
nasal                          0.776819
covid                          0.745267
vision                         0.740574
ear                            0.739019
how are                        0.738897
heart                          0.702486
pelvic                         0.667174
neoplasm                       0.605659
days                           0.587768
chest                          0.576048
murmur                         0.551081
apply

In [6]:
from expailens.runner import publish_run
import joblib

pipe = joblib.load("tfidf_xgb_pipeline.joblib")

publish_run(
    model=pipe,
    X_test=X_test,                          # raw texts if pipeline contains TF-IDF
    y_test=y_test,                          # optional
    raw_text=X_test,                        # so the dashboard can show the note
    class_names=getattr(pipe.named_steps["xgb"], "classes_", None),
    run_dir="runs/2026-02-18_chapters_xgb",
    config={"batch_size": 2,                # tiny batches to avoid OOM
            "rows_limit_global": 200,       # compute global on first 200 rows
            "rows_limit_local": 200}        # store local top-k for first 200 rows
)


TypeError: keys must be str, int, float, bool or None, not numpy.int64