# XGBoost with Tabular and Text info

In [17]:
import re
import pandas as pd
import numpy as np

In [22]:


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import os
import sys
sys.path.insert(0, os.path.abspath('..'))

from helpers import ICD10_CHAPTERS

df = pd.read_csv("MedSynth_huggingface_final.csv")
def map_icd10_to_chapter(code):
    """Return official ICD-10 chapter name from any ICD-10 code."""
    if pd.isna(code):
        return None
    code = str(code).strip()
    first_letter = code[0].upper()

    if first_letter in ICD10_CHAPTERS:
        return ICD10_CHAPTERS[first_letter][2]  # return chapter name
    return "Unknown"

df = df.dropna(subset=["Dialogue"]).copy()

df["Dialogue"].apply(type).value_counts()


df["ICD_chapter"] = df["ICD10"].apply(map_icd10_to_chapter)
df["ICD_chapter"].value_counts()

df = df.rename(columns={" Note": "Note"})

Extract the vital signs information from the Note column


In [23]:

bp = df["Note"].str.extract(
    r'Blood\s*Pressure\s*[:\-]?\s*(?P<bp_sys>\d{2,3})\s*[/\-]\s*(?P<bp_dia>\d{2,3})\s*(?:mm\s*Hg|mmHg)?',
    flags=re.I
)

hr = df["Note"].str.extract(
    r'(?:Heart\s*Rate|HR)\s*[:\-]?\s*(?P<hr>\d{1,3})\s*(?:bpm|/min)?',
    flags=re.I
)

rr = df["Note"].str.extract(
    r'(?:Respiratory\s*Rate|RR)\s*[:\-]?\s*(?P<rr>\d{1,3})\s*(?:breaths?/min|rpm|/min)?',
    flags=re.I
)

temp = df["Note"].str.extract(
    r'Temperature\s*[:\-]?\s*(?P<temp>\d{2,3}(?:\.\d+)?)\s*°?\s*(?P<temp_unit>[FC]|(?:Fahrenheit|Celsius))',
    flags=re.I
)

spo2 = df["Note"].str.extract(
    r'(?:Oxygen\s*Saturation|SpO2)\s*[:\-]?\s*(?P<spo2>\d{2,3})\s*%(\s*(?:on|via)\s*(?P<o2_device>[^;\n]+))?',
    flags=re.I
)

# --- Combine and clean types ---
vitals = pd.concat([bp, hr, rr, temp, spo2], axis=1)

# Convert to numeric
for col in ["bp_sys", "bp_dia", "hr", "rr", "temp", "spo2"]:
    if col in vitals:
        vitals[col] = pd.to_numeric(vitals[col], errors="coerce")

# Normalize temperature units; create both °C and °F
def _norm_unit(u):
    if u is None or (isinstance(u, float) and np.isnan(u)):
        return np.nan
    u = str(u).strip().lower()
    return "f" if u.startswith("f") else ("c" if u.startswith("c") else np.nan)

vitals["temp_unit"] = vitals["temp_unit"].map(_norm_unit)

vitals["temp_c"] = np.where(
    vitals["temp_unit"].str.lower().eq("f"),
    (vitals["temp"] - 32) * 5 / 9,
    vitals["temp"]
)
vitals["temp_f"] = np.where(
    vitals["temp_unit"].str.lower().eq("c"),
    vitals["temp"] * 9 / 5 + 32,
    vitals["temp"]
)

# Clean oxygen device text (e.g., "room air", "nasal cannula 2 L/min")
if "o2_device" in vitals:
    vitals["o2_device"] = vitals["o2_device"].str.strip().str.rstrip(".")

# Join back to your DataFrame
df = df.join(vitals)

# Optional: rename columns to your preferred schema
df = df.rename(columns={
    "bp_sys": "BP_systolic",
    "bp_dia": "BP_diastolic",
    "hr": "Heart_Rate",
    "rr": "Respiratory_Rate",
    "spo2": "Oxygen_Saturation",
    "o2_device": "Oxygen_Device"
})


In [24]:
df

Unnamed: 0,Note,Dialogue,ICD10,ICD10_desc,ICD_chapter,BP_systolic,BP_diastolic,Heart_Rate,Respiratory_Rate,temp,temp_unit,Oxygen_Saturation,1,Oxygen_Device,temp_c,temp_f
0,**1. Subjective:**\n\n **Chief Complaint (CC...,[doctor]: Hello! It’s good to see you today. H...,M25562,PAIN IN LEFT KNEE,Diseases of musculoskeletal system,128.0,82.0,72.0,16.0,98.6,f,,,,37.000000,98.6
1,**1. Subjective:**\n\n - **Chief Complaint (...,"[doctor] Hi there, how are you today?\n\n[pati...",M25562,PAIN IN LEFT KNEE,Diseases of musculoskeletal system,,,,,,,,,,,
2,**1. Subjective:**\n\n**Chief Complaint (CC):*...,"[doctor] Good morning, how are you doing today...",M25562,PAIN IN LEFT KNEE,Diseases of musculoskeletal system,140.0,85.0,,16.0,98.6,f,,,,37.000000,98.6
3,**1. Subjective:**\n\n**Chief Complaint (CC):*...,[doctor] Good morning! How are you feeling tod...,M25562,PAIN IN LEFT KNEE,Diseases of musculoskeletal system,130.0,80.0,72.0,16.0,98.6,f,,,,37.000000,98.6
4,#####\n**1. Subjective:**\n\n**Chief Complaint...,"[doctor]: Hello Mr. Doe, how are you doing tod...",M25562,PAIN IN LEFT KNEE,Diseases of musculoskeletal system,130.0,80.0,72.0,16.0,98.6,f,,,,37.000000,98.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10235,#####\n**1. Subjective:**\n \n**Chief Compla...,[doctor]: Good morning. How are you doing toda...,B3781,CANDIDAL ESOPHAGITIS,Certain infectious and parasitic diseases,120.0,80.0,82.0,18.0,98.6,f,,,,37.000000,98.6
10236,### Gastroenterologist Medical Note\n\n#### 1....,"**Doctor:** Hi there, how are you doing today?...",B3781,CANDIDAL ESOPHAGITIS,Certain infectious and parasitic diseases,130.0,85.0,72.0,,98.6,f,,,,37.000000,98.6
10237,**1. Subjective:**\n\n**Chief Complaint (CC):*...,"[doctor]: Hi Mr. Harris, how are you doing tod...",B3781,CANDIDAL ESOPHAGITIS,Certain infectious and parasitic diseases,122.0,78.0,88.0,18.0,98.6,f,98.0,on room air,room air,37.000000,98.6
10238,#####\n**1. Subjective:**\n**Chief Complaint (...,"[doctor]: Good morning, Ms. Lee. How are you d...",B3781,CANDIDAL ESOPHAGITIS,Certain infectious and parasitic diseases,120.0,80.0,72.0,,98.6,f,,,,37.000000,98.6


In [25]:
df.to_csv("medsynth_tab_text.csv", index=False)