In [15]:
import json
import pandas as pd
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer

In [16]:
patient_path = "mimic_ed_patient_encounter_stay.json"
triage_path  = "mimic_ed_triage_bundle.json"

In [17]:
# Load JSON files
with open(patient_path, "r") as f:
    patient_data = json.load(f)

with open(triage_path, "r") as f:
    triage_data = json.load(f)

print(patient_data.keys())
print(triage_data.keys())

dict_keys(['resourceType', 'type', 'entry'])
dict_keys(['resourceType', 'type', 'entry'])


In [18]:
# Step 1: Extract labels (disposition) from Encounter resources

enc_rows = []
for e in patient_data["entry"]:
    r = e["resource"]
    if r["resourceType"] != "Encounter":
        continue

    # Encounter ID (e.g., enc-37887480)
    enc_id = r["id"]

    # Extract patient ID (pat-xxxxx)
    subj_ref = r.get("subject", {}).get("reference")
    patient_id = subj_ref.split("/")[-1] if subj_ref else None

    # Extract encounter start and end time
    period = r.get("period", {})
    start = period.get("start")
    end = period.get("end")
    t1 = datetime.fromisoformat(start)
    t2 = datetime.fromisoformat(end)
    los = t2 - t1
    los = los.total_seconds() # Calculate los (unit-second)

    # Extract discharge disposition (ADMITTED / HOME / etc.)
    hosp = r.get("hospitalization", {})
    disp = (hosp.get("dischargeDisposition") or {}).get("text")

    enc_rows.append({
        "enc_id": enc_id,
        "patient_id": patient_id,
        "start": start,
        "end": end,
        "los": los,
        "disposition": disp
    })

enc_df = pd.DataFrame(enc_rows)
print("Number of Encounters:", len(enc_df))
print(enc_df.head())
print(enc_df["disposition"].value_counts())

enc_clean = enc_df.loc[enc_df['disposition'].isin(['ADMITTED','HOME'])]
print("Number of Encounters Admitted/Home:", len(enc_clean))

Number of Encounters: 222
         enc_id    patient_id                start                  end  \
0  enc-37887480  pat-10014729  2125-03-19T12:36:00  2125-03-19T16:59:47   
1  enc-34176810  pat-10018328  2154-02-05T17:09:00  2154-02-05T22:54:00   
2  enc-32103106  pat-10018328  2154-08-03T15:31:00  2154-08-03T22:29:00   
3  enc-38797992  pat-10020640  2153-02-12T21:59:00  2153-02-13T01:38:00   
4  enc-33473053  pat-10015272  2137-06-12T16:54:00  2137-06-12T18:37:22   

       los disposition  
0  15827.0    ADMITTED  
1  20700.0    ADMITTED  
2  25080.0        HOME  
3  13140.0    ADMITTED  
4   6202.0    ADMITTED  
disposition
ADMITTED                       150
HOME                            60
TRANSFER                         5
OTHER                            2
LEFT AGAINST MEDICAL ADVICE      2
LEFT WITHOUT BEING SEEN          2
ELOPED                           1
Name: count, dtype: int64
Number of Encounters Admitted/Home: 210


In [19]:
# Step 2: Extract vital signs and triage features from Observation data

obs_rows = []
for e in triage_data["entry"]:
    r = e["resource"]
    if r["resourceType"] != "Observation":
        continue

    # Encounter ID linked to Observation
    enc_ref = r.get("encounter", {}).get("reference")
    enc_id = enc_ref.split("/")[-1] if enc_ref else None

    # LOINC code for the vital sign / triage variable
    code_list = r.get("code", {}).get("coding", [])
    code = code_list[0]["code"] if code_list else None
    code_readable = code_list[0]["display"] if code_list else None # Readable code

    # Numeric value of the observation
    val = (r.get("valueQuantity") or {}).get("value")

    obs_rows.append({
        "enc_id": enc_id,
        "code": code,
        "code_r": code_readable,
        "value": val
    })

obs_df = pd.DataFrame(obs_rows)
print(obs_df.head())
print("LOINC code counts:")
print(obs_df["code"].value_counts())

         enc_id       code       code_r  value
0  enc-34558830  LA30139-1    ED Acuity    2.0
1  enc-34820898  LA30139-1    ED Acuity    2.0
2  enc-35758326     8310-5  Temperature   98.8
3  enc-35758326     8867-4   Heart rate   72.0
4  enc-35758326     9279-1    Resp rate   18.0
LOINC code counts:
code
LA30139-1    207
9279-1       199
8462-4       199
8480-6       199
59408-5      198
8867-4       198
8310-5       196
72514-3      189
Name: count, dtype: int64


In [20]:
# Mapping from LOINC code to variable name
code_map = {
    "8310-5": "temperature",
    "8867-4": "heartrate",
    "9279-1": "resprate",
    "59408-5": "o2sat",
    "8480-6": "sbp",
    "8462-4": "dbp",
    "72514-3": "pain",
    "LA30139-1": "acuity"
}
obs_df["var"] = obs_df["code"].map(code_map)
obs_clean = obs_df.dropna(subset=["var"]).copy()

obs_pivot = (
    obs_clean
    .pivot_table(
        index="enc_id",
        columns="var",
        values="value",
        aggfunc="first"  # take first value if multiple entries exist
    )
    .reset_index()
)

# Remove entries with Na in any predictors
obs_pivot_clean = obs_pivot.dropna()

print(obs_df.head())
print("Encounters with at least one triage vital sign:", len(obs_pivot))
print("Encounters with all triage vital signs:", len(obs_pivot_clean))

         enc_id       code       code_r  value          var
0  enc-34558830  LA30139-1    ED Acuity    2.0       acuity
1  enc-34820898  LA30139-1    ED Acuity    2.0       acuity
2  enc-35758326     8310-5  Temperature   98.8  temperature
3  enc-35758326     8867-4   Heart rate   72.0    heartrate
4  enc-35758326     9279-1    Resp rate   18.0     resprate
Encounters with at least one triage vital sign: 207
Encounters with all triage vital signs: 183


In [57]:
# Step 3: Extract chief complaint from Condition resources

cond_rows = []
for e in triage_data["entry"]:
    r = e["resource"]
    if r["resourceType"] != "Condition":
        continue

    enc_ref = r.get("encounter", {}).get("reference")
    enc_id = enc_ref.split("/")[-1] if enc_ref else None

    text = (r.get("code") or {}).get("text")

    cond_rows.append({
        "enc_id": enc_id,
        "chiefcomplaint": text
    })

cond_df = pd.DataFrame(cond_rows)

# Feature extraction
cond_df["chiefcomplaint_clean"] = (
    cond_df["chiefcomplaint"]
        .str.lower()
        .str.replace(",", " ")
        .str.replace("!", " ")
        .str.replace("?", " ")
        .str.strip()
)
cond_df["chiefcomplaint_clean"] = cond_df["chiefcomplaint_clean"].str.split().str.join(" ")

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(cond_df["chiefcomplaint_clean"])
X = pd.DataFrame(X.toarray())
X.columns = 'cc_' + vectorizer.get_feature_names_out()
cols_keep = (X > 0).mean() >= .05 # Ensuring generalizable complaints
X_clean = X.loc[:, cols_keep]
X_clean.loc[:, 'enc_id'] = cond_df['enc_id'].values

print(X_clean.head())

   cc_abd  cc_chest  cc_dyspnea  cc_fall  cc_pain  cc_transfer  cc_weakness  \
0       0         0           0        0        0            0            0   
1       0         0           0        0        0            0            0   
2       0         0           0        0        0            0            0   
3       0         0           0        0        0            0            0   
4       0         0           0        0        0            0            0   

         enc_id  
0  enc-33211001  
1  enc-30701739  
2  enc-30115213  
3  enc-38926302  
4  enc-36678718  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_clean.loc[:, 'enc_id'] = cond_df['enc_id'].values


In [58]:
df = enc_clean.merge(
    obs_pivot_clean,
    left_on="enc_id",
    right_on="enc_id",
    how="inner"
)

df = df.merge(
    X_clean,
    left_on="enc_id",
    right_on="enc_id",
    how="inner"
)

In [59]:
df

Unnamed: 0,enc_id,patient_id,start,end,los,disposition,acuity,dbp,heartrate,o2sat,...,resprate,sbp,temperature,cc_abd,cc_chest,cc_dyspnea,cc_fall,cc_pain,cc_transfer,cc_weakness
0,enc-34176810,pat-10018328,2154-02-05T17:09:00,2154-02-05T22:54:00,20700.0,ADMITTED,2.0,65.0,74.0,96.0,...,20.0,133.0,97.7,0,0,0,0,0,1,0
1,enc-32103106,pat-10018328,2154-08-03T15:31:00,2154-08-03T22:29:00,25080.0,HOME,2.0,75.0,74.0,100.0,...,18.0,142.0,96.2,0,0,0,0,0,0,0
2,enc-33473053,pat-10015272,2137-06-12T16:54:00,2137-06-12T18:37:22,6202.0,ADMITTED,2.0,56.0,118.0,96.0,...,18.0,100.0,97.5,0,0,0,0,0,0,0
3,enc-35758326,pat-10016810,2185-06-15T23:08:00,2185-06-16T02:16:00,11280.0,ADMITTED,2.0,48.0,72.0,90.0,...,18.0,98.0,98.8,1,0,0,0,1,0,0
4,enc-32259573,pat-10006580,2137-09-29T21:06:00,2137-09-29T22:54:00,6480.0,HOME,3.0,90.0,89.0,96.0,...,18.0,131.0,98.1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168,enc-39898790,pat-10023117,2174-06-07T16:24:00,2174-06-08T01:02:00,31080.0,ADMITTED,2.0,58.0,79.0,99.0,...,20.0,100.0,97.7,0,0,0,0,0,0,1
169,enc-31283645,pat-10023117,2171-11-07T17:40:00,2171-11-07T22:50:00,18600.0,ADMITTED,3.0,77.0,109.0,100.0,...,22.0,148.0,98.0,0,0,0,0,0,0,0
170,enc-31628990,pat-10009049,2174-05-26T04:20:00,2174-05-26T09:18:00,17880.0,ADMITTED,3.0,68.0,87.0,96.0,...,18.0,126.0,99.0,0,1,0,0,1,0,0
171,enc-32405286,pat-10004457,2141-08-12T12:08:00,2141-08-12T17:20:00,18720.0,ADMITTED,2.0,72.0,103.0,98.0,...,16.0,138.0,97.6,0,0,0,0,0,0,0


In [60]:
df.to_csv('mimic_ed_clean_combined.csv',index=False)