# XGBoost with Tabular and Text info

## Overview

- Uses full MedSynth dataset, with

## Imports and basic dataset setup/processing

In [1]:
import joblib
import pandas as pd
import numpy as np
import helpers
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, f1_score
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier

df_ms = helpers.process_medsynth()

✅ File already exists: data/MedSynth_huggingface_final.csv


## Extract the vital signs information from the Note column


In [2]:
df = helpers.extract_vitalsigns_tocols(df_ms)
df


Unnamed: 0,Note,Dialogue,ICD10,ICD10_desc,ICD_chapter,BP_systolic,BP_diastolic,Heart_Rate,Respiratory_Rate,temp,temp_unit,Oxygen_Saturation,Oxygen_Device,temp_c,temp_f
0,**1. Subjective:**\n\n **Chief Complaint (CC...,[doctor]: Hello! It’s good to see you today. H...,M25562,PAIN IN LEFT KNEE,Diseases of musculoskeletal system,128.0,82.0,72.0,16.0,98.6,f,,,37.000000,98.6
1,**1. Subjective:**\n\n - **Chief Complaint (...,"[doctor] Hi there, how are you today?\n\n[pati...",M25562,PAIN IN LEFT KNEE,Diseases of musculoskeletal system,,,,,,,,,,
2,**1. Subjective:**\n\n**Chief Complaint (CC):*...,"[doctor] Good morning, how are you doing today...",M25562,PAIN IN LEFT KNEE,Diseases of musculoskeletal system,140.0,85.0,,16.0,98.6,f,,,37.000000,98.6
3,**1. Subjective:**\n\n**Chief Complaint (CC):*...,[doctor] Good morning! How are you feeling tod...,M25562,PAIN IN LEFT KNEE,Diseases of musculoskeletal system,130.0,80.0,72.0,16.0,98.6,f,,,37.000000,98.6
4,#####\n**1. Subjective:**\n\n**Chief Complaint...,"[doctor]: Hello Mr. Doe, how are you doing tod...",M25562,PAIN IN LEFT KNEE,Diseases of musculoskeletal system,130.0,80.0,72.0,16.0,98.6,f,,,37.000000,98.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10235,#####\n**1. Subjective:**\n \n**Chief Compla...,[doctor]: Good morning. How are you doing toda...,B3781,CANDIDAL ESOPHAGITIS,Certain infectious and parasitic diseases,120.0,80.0,82.0,18.0,98.6,f,,,37.000000,98.6
10236,### Gastroenterologist Medical Note\n\n#### 1....,"**Doctor:** Hi there, how are you doing today?...",B3781,CANDIDAL ESOPHAGITIS,Certain infectious and parasitic diseases,130.0,85.0,72.0,,98.6,f,,,37.000000,98.6
10237,**1. Subjective:**\n\n**Chief Complaint (CC):*...,"[doctor]: Hi Mr. Harris, how are you doing tod...",B3781,CANDIDAL ESOPHAGITIS,Certain infectious and parasitic diseases,122.0,78.0,88.0,18.0,98.6,f,98.0,room air,37.000000,98.6
10238,#####\n**1. Subjective:**\n**Chief Complaint (...,"[doctor]: Good morning, Ms. Lee. How are you d...",B3781,CANDIDAL ESOPHAGITIS,Certain infectious and parasitic diseases,120.0,80.0,72.0,,98.6,f,,,37.000000,98.6


## Fix NaNs

In [3]:

for col in ["BP_systolic", "BP_diastolic", "Heart_Rate",
            "Respiratory_Rate", "temp_c", "Oxygen_Saturation"]:
    df[col + "_missing"] = df[col].isna().astype(int)

# categorical imputation (simple)
df["Oxygen_Device"] = df["Oxygen_Device"].fillna("Unknown")

# encode categorical
df = pd.get_dummies(df, columns=["Oxygen_Device"], dummy_na=False)


In [4]:
df

Unnamed: 0,Note,Dialogue,ICD10,ICD10_desc,ICD_chapter,BP_systolic,BP_diastolic,Heart_Rate,Respiratory_Rate,temp,...,Oxygen_Device_room air,Oxygen_Device_room air (self-reported),"Oxygen_Device_room air, 95% on 2 L/min supplemental oxygen","Oxygen_Device_room air, Temp 101.5°F","Oxygen_Device_room air, Temp 36.8°C, RR 18 breaths/min","Oxygen_Device_room air, Temp 98.6°F","Oxygen_Device_room air, Temp: 98.7°F","Oxygen_Device_room air, Temperature: 98.6°F","Oxygen_Device_room air, improved to 92% on mechanical ventilation","Oxygen_Device_room air, temperature 98.6°F (37°C)"
0,**1. Subjective:**\n\n **Chief Complaint (CC...,[doctor]: Hello! It’s good to see you today. H...,M25562,PAIN IN LEFT KNEE,Diseases of musculoskeletal system,128.0,82.0,72.0,16.0,98.6,...,False,False,False,False,False,False,False,False,False,False
1,**1. Subjective:**\n\n - **Chief Complaint (...,"[doctor] Hi there, how are you today?\n\n[pati...",M25562,PAIN IN LEFT KNEE,Diseases of musculoskeletal system,,,,,,...,False,False,False,False,False,False,False,False,False,False
2,**1. Subjective:**\n\n**Chief Complaint (CC):*...,"[doctor] Good morning, how are you doing today...",M25562,PAIN IN LEFT KNEE,Diseases of musculoskeletal system,140.0,85.0,,16.0,98.6,...,False,False,False,False,False,False,False,False,False,False
3,**1. Subjective:**\n\n**Chief Complaint (CC):*...,[doctor] Good morning! How are you feeling tod...,M25562,PAIN IN LEFT KNEE,Diseases of musculoskeletal system,130.0,80.0,72.0,16.0,98.6,...,False,False,False,False,False,False,False,False,False,False
4,#####\n**1. Subjective:**\n\n**Chief Complaint...,"[doctor]: Hello Mr. Doe, how are you doing tod...",M25562,PAIN IN LEFT KNEE,Diseases of musculoskeletal system,130.0,80.0,72.0,16.0,98.6,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10235,#####\n**1. Subjective:**\n \n**Chief Compla...,[doctor]: Good morning. How are you doing toda...,B3781,CANDIDAL ESOPHAGITIS,Certain infectious and parasitic diseases,120.0,80.0,82.0,18.0,98.6,...,False,False,False,False,False,False,False,False,False,False
10236,### Gastroenterologist Medical Note\n\n#### 1....,"**Doctor:** Hi there, how are you doing today?...",B3781,CANDIDAL ESOPHAGITIS,Certain infectious and parasitic diseases,130.0,85.0,72.0,,98.6,...,False,False,False,False,False,False,False,False,False,False
10237,**1. Subjective:**\n\n**Chief Complaint (CC):*...,"[doctor]: Hi Mr. Harris, how are you doing tod...",B3781,CANDIDAL ESOPHAGITIS,Certain infectious and parasitic diseases,122.0,78.0,88.0,18.0,98.6,...,True,False,False,False,False,False,False,False,False,False
10238,#####\n**1. Subjective:**\n**Chief Complaint (...,"[doctor]: Good morning, Ms. Lee. How are you d...",B3781,CANDIDAL ESOPHAGITIS,Certain infectious and parasitic diseases,120.0,80.0,72.0,,98.6,...,False,False,False,False,False,False,False,False,False,False


In [5]:
X = df.drop(["Note","ICD10","ICD10_desc","ICD_chapter", "temp", "temp_unit"], axis=1)

print(X.shape)


(10238, 32)


In [6]:


le = LabelEncoder()
y = le.fit_transform(df["ICD_chapter"].astype(str))
class_names = list(le.classes_)
print(y.shape)


(10238,)


In [8]:

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)


print("X_train:", X_train.shape)
print("y_train:", y_train.shape)
print("X_test:", X_test.shape)
print("y_test:", y_test.shape)

print("Columns in X:", X.columns)

# Debug: Check if 'Dialogue' is in columns
if "Dialogue" not in X.columns:
    raise ValueError("'Dialogue' column not found in X!")

other_cols = [c for c in X.columns if c != "Dialogue"]
print(X[other_cols].dtypes)

preprocess = ColumnTransformer(
    transformers=[
        ("dialogue_tfidf",
         TfidfVectorizer(
             ngram_range=(1, 2),
             min_df=2,
             max_df=0.9,
             strip_accents="unicode",
             lowercase=True,
             max_features=20000
         ),
         "Dialogue"),
        ("other", "passthrough", other_cols),
    ],
    remainder="drop",
    sparse_threshold=0.3,  # keeps output sparse (efficient with TF-IDF)
)


X_transformed = preprocess.fit_transform(X_train)
print("Transformed shape:", X_transformed.shape)

# Debug: Check for NaN in 'Dialogue'
print("NaN in Dialogue:", X_train["Dialogue"].isna().sum())

X_train: (7678, 32)
y_train: (7678,)
X_test: (2560, 32)
y_test: (2560,)
Columns in X: Index(['Dialogue', 'BP_systolic', 'BP_diastolic', 'Heart_Rate',
       'Respiratory_Rate', 'Oxygen_Saturation', 'temp_c', 'temp_f',
       'BP_systolic_missing', 'BP_diastolic_missing', 'Heart_Rate_missing',
       'Respiratory_Rate_missing', 'temp_c_missing',
       'Oxygen_Saturation_missing',
       'Oxygen_Device_2 L/min of supplemental oxygen',
       'Oxygen_Device_2 L/min oxygen via nasal cannula',
       'Oxygen_Device_2 liters of oxygen via nasal cannula',
       'Oxygen_Device_2L O2 via nasal cannula', 'Oxygen_Device_Unknown',
       'Oxygen_Device_mechanical ventilation',
       'Oxygen_Device_mechanical ventilation with FiO2 35%',
       'Oxygen_Device_nasal cannula', 'Oxygen_Device_room air',
       'Oxygen_Device_room air (self-reported)',
       'Oxygen_Device_room air, 95% on 2 L/min supplemental oxygen',
       'Oxygen_Device_room air, Temp 101.5°F',
       'Oxygen_Device_room air, Te

In [9]:




# 5) Model
objective = "multi:softprob" if len(np.unique(y)) > 2 else "binary:logistic"
xgb = XGBClassifier(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    objective=objective,
    random_state=42,
    n_jobs=-1,
    tree_method="hist",  # 'gpu_hist' if you have GPU
)

# 6) Pipeline
pipe = Pipeline(steps=[
    ("prep", preprocess),
    ("clf", xgb),
])


In [None]:


pipe.fit(X_train, y_train)

joblib.dump(pipe, "./models/ex2_xgboost.joblib")