In [2]:
import pandas as pd

# Load the training split from parquet; downstream cells read its hadm_id,
# discharge_summary and true_icd_codes columns.
icd_with_summary = pd.read_parquet('../data/processed/structured_dataset_with_discharge_summaries.train.parquet')

# Preview the first rows to confirm which columns were loaded.
icd_with_summary.head()

Unnamed: 0,subject_id_x,hadm_id,admittime,dischtime,admission_type,discharge_location,gender,dod,age_at_admit,length_of_stay_days,...,num_procedures_total,meds_discharge_like,medication_count,subject_id_y,policy,discharge_summary,diagnosis_codes,model,true_icd_codes,missing_codes
0,17504528,20171885,2137-06-17 20:40:00,2137-06-20 16:41:00,DIRECT EMER.,HOME,F,2142-07-24,56,2.834028,...,1.0,"[5% Dextrose, Albuterol 0.083% Neb Soln, Aspir...",24.0,17504528,partial,History of Present Illness: 56-year-old female...,"[I10, Z8572, Z5181, I5032, I2510, K219, I071, ...",gpt-5-mini,"[I5032, E118, I10, E785, I2510, I071, D649, I2...","[I480, D473, Z8673, Z7901, E785, E118, Z954, F..."
1,14273001,20371042,2187-10-19 19:00:00,2187-10-21 18:13:00,OBSERVATION ADMIT,HOME,F,2188-03-27,73,1.967361,...,,"[Acetaminophen, Aspirin, Atorvastatin, Bisacod...",19.0,14273001,all,History of Present Illness: 73-year-old female...,"[I10, Z87891, C3431, I5031, Y929, T380X5A, K21...",gpt-5-mini,"[I5031, I82501, C7931, C3431, E039, Z7901, I10...",[]
2,11357031,27612249,2139-01-17 21:04:00,2139-01-22 18:00:00,OBSERVATION ADMIT,HOME HEALTH CARE,M,2144-10-28,58,4.872222,...,,"[0.9% Sodium Chloride (Mini Bag Plus), Amoxici...",30.0,11357031,primary_only,History of present illness: 58-year-old man ad...,[I5033],gpt-5-mini,"[I5033, J9692, J9691, E870, E872, E662, J441, ...","[E662, E1165, Z6833, R000, J9692, E785, I272, ..."
3,13673554,25741865,2176-02-25 19:39:00,2176-03-02 17:30:00,OBSERVATION ADMIT,HOME HEALTH CARE,M,2180-11-21,71,5.910417,...,1.0,"[Acetaminophen, Aspirin, Atorvastatin, Cepacol...",17.0,13673554,partial,History of Present Illness: 71-year-old male w...,"[N184, I5033, E1022, J918, Z9641, I739, I2510,...",gpt-5-mini,"[I5033, C9110, J918, N184, N179, I129, I2510, ...",[]
4,19017808,20589756,2183-01-05 21:59:00,2183-01-08 18:35:00,OBSERVATION ADMIT,HOME HEALTH CARE,F,2184-11-16,77,2.858333,...,,"[0.9% Sodium Chloride (Mini Bag Plus), Acetami...",25.0,19017808,partial,Brief history of present illness: 77-year-old ...,"[W19XXXA, K7460, B964, Y929, I5032, E119, N288...",gpt-5-mini,"[I5032, I8510, N179, I272, K7460, N390, I4891,...","[S5001XA, K7581, I10, I4891, N390, I878, N179,..."


In [7]:
# (rows, columns) of the training frame.
icd_with_summary.shape

(1046, 27)

In [2]:
'''
embed the summaries

the id in the chroma collection is hadm_id
store all the true icd codes so that we can see what is associated with the summary
metadata structure
{
  'icd_codes': 'abc123,def456',  # comma-separated string of codes
  'discharge_summary': str
}
'''

from pydantic import BaseModel, Field
import uuid

class DischargeSummaryEmbedding(BaseModel):
  """Metadata record kept alongside one embedded discharge summary.

  Both fields are required strings; the ICD codes are carried as a single
  comma-separated string rather than a list.
  """

  icd_codes: str = Field(
    ...,
    description='A comma separated string of the true icd codes assigned to the summary',
  )
  discharge_summary: str = Field(
    ...,
    description='The text of the discharge summary',
  )

In [3]:

# Build the three parallel lists handed to Chroma: the document text, its
# metadata dict and its id, one entry per row of icd_with_summary.
docs = []
metadatas = []
ids = []
for hadm_id, summary_text, true_codes in zip(
  icd_with_summary['hadm_id'],
  icd_with_summary['discharge_summary'],
  icd_with_summary['true_icd_codes'],
):
  # Run the metadata through the pydantic model before dumping it to a plain
  # dict; the codes are flattened into one comma-separated string.
  record = DischargeSummaryEmbedding(
    icd_codes=",".join(true_codes),
    discharge_summary=summary_text,
  )
  docs.append(summary_text)
  metadatas.append(record.model_dump())
  # hadm_id doubles as the collection id. It is deliberately not bound to a
  # variable called `id`, which would shadow the builtin for the rest of the
  # kernel session.
  ids.append(str(hadm_id))

# Each id must identify exactly one summary; fail here with a clear message
# rather than later inside the vector store.
assert len(set(ids)) == len(ids), "duplicate hadm_id values in icd_with_summary"

In [14]:
# Spot-check one metadata record (position 7) built by the loop above.
metadatas[7]

{'icd_codes': 'I5023,I429,N179,Z6843,E669,I480,D638,L409,G8929,M25569,Z9114,Z23,N189,I2510,I340,Z95810,Z86718,Z7982,Z4502,T501X6A',
 'discharge_summary': '**Patient Information:** 33-year-old male, admitted urgently with acute on chronic systolic (congestive) heart failure (ICD10: I5023). \n\n**History of Present Illness:** The patient presented with increased dyspnea on exertion, orthopnea, and lower extremity edema. He has a history of congestive heart failure and obesity (BMI 50.0-59.9), with a recent exacerbation likely triggered by non-compliance with medications and dietary restrictions.\n\n**Hospital Course:** During a 7.7-day hospital stay, the patient was stabilized with diuretics (Furosemide, Spironolactone) to manage fluid overload. Amiodarone was initiated for paroxysmal atrial fibrillation, and Lisinopril was continued to manage hypertension and heart failure. The patient underwent daily assessments for fluid balance and renal function, which showed improvement with decrea

In [4]:
from chromadb.utils import embedding_functions
from chromadb.config import Settings
import chromadb

# Embedding function backed by the sentence-transformers model named below;
# it is passed to the collection as its embedding_function.
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name = "lokeshch19/ModernPubMedBERT")

# Persistent (on-disk) Chroma client rooted at ../patient_summary_embeddings.
# NOTE(review): allow_reset=True presumably permits chroma_client.reset();
# no visible cell calls it — confirm the setting is still needed.
chroma_client = chromadb.PersistentClient(path="../patient_summary_embeddings", settings=Settings(allow_reset=True))

# Open the collection if it already exists, otherwise create it.
chroma_collection = chroma_client.get_or_create_collection("patient_summary_embeddings", embedding_function=sentence_transformer_ef)

In [None]:
# Display the collection handle to confirm it was opened.
chroma_collection

In [5]:
# Write every training summary into the collection in one call. ids,
# documents and metadatas are the parallel lists built earlier (ids are the
# stringified hadm_id values); upsert is used so re-running the cell does not
# fail on ids that are already stored — TODO confirm against the Chroma docs.
chroma_collection.upsert(
    ids=ids,
    documents=docs,
    metadatas=metadatas
)

In [3]:
# Load the validation split; its discharge summaries are used further down as
# query texts against the collection.


val_data = pd.read_parquet('../data/processed/structured_dataset_with_discharge_summaries.val.parquet')

In [4]:
# Preview the first rows of the validation frame.
val_data.head()

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,admission_type,discharge_location,gender,dod,age_at_admit,length_of_stay_days,...,procedures_icd10,num_procedures_total,meds_discharge_like,medication_count,policy,discharge_summary,diagnosis_codes,model,true_icd_codes,missing_codes
0,13917492,27765450,2114-01-16 01:58:00,2114-02-04 16:50:00,EW EMER.,HOME HEALTH CARE,M,2114-02-18,72,19.619444,...,,,"[5% Dextrose, Acetaminophen, Apixaban, Atorvas...",18.0,partial,History of Present Illness: 72-year-old man ad...,"[Z515, I428, Q6432, Z955, R570, I2510, I5023, ...",gpt-5-mini,"[I5023, N170, I21A1, R570, K7200, E43, I4821, ...","[I4821, I21A1, I255, E861, Z6822, K7200, E43, ..."
1,17302284,22205215,2164-02-03 16:48:00,2164-02-08 16:15:00,OBSERVATION ADMIT,HOME,M,2170-12-14,67,4.977083,...,,,"[5% Dextrose, Amiodarone, Aspirin, Carvedilol,...",15.0,all,History of Present Illness: 67-year-old man ad...,"[N189, E785, Z955, I4891, I2510, I255, Z951, I...",gpt-5-mini,"[I5023, I272, I4891, I255, I340, I2510, Z951, ...",[]
2,17240046,25129064,2178-11-19 15:21:00,2178-11-23 16:05:00,OBSERVATION ADMIT,HOME,M,2180-10-10,61,4.030556,...,"[047K3ZZ, 047M3ZZ, 0Y6T0Z3, 5A1D70Z]",4.0,"[Acetaminophen, Aspirin, Atorvastatin, Calcium...",35.0,primary_only,History of Present Illness: 61-year-old male w...,[E1152],gpt-5-mini,"[E1152, N186, I70261, I120, E1122, D631, G546,...","[G546, Z951, I120, K219, Z89612, Z992, D631, I..."
3,15346570,27869814,2164-11-05 17:11:00,2164-11-12 12:44:00,OBSERVATION ADMIT,HOME HEALTH CARE,M,NaT,75,6.814583,...,"[0Y6P0Z1, B41FYZZ]",2.0,"[Acetaminophen, Apixaban, Aspirin, Bisacodyl, ...",27.0,partial,History of Present Illness: 75-year-old man ad...,"[L97512, E11621, Z7902, M86171, I2510, E1152, ...",gpt-5-mini,"[E1169, E1152, M86171, I70261, Z7902, I10, I25...",[I10]
4,16273050,26152108,2151-06-11 17:44:00,2151-06-21 12:46:00,EW EMER.,HOME HEALTH CARE,F,NaT,33,9.793056,...,,,"[5% Dextrose, Acetaminophen, Bisacodyl, DULoxe...",24.0,partial,Brief history of present illness: 33-year-old ...,"[E1143, I10, Z794, E1165, K224]",gpt-5-mini,"[E1143, E1165, I10, E441, M797, K3184, Z794, Z...","[D509, E1140, E441, K219, M797, E669, Z6839, Z..."


In [9]:
# For the first five validation rows, print whether the first entry of
# diagnosis_codes is missing.
for row_codes in val_data['diagnosis_codes'].iloc[:5]:
  print(pd.isna(row_codes[0]))

False
False
False
False
False


In [5]:
# List every column of the validation frame.
val_data.columns

Index(['subject_id', 'hadm_id', 'admittime', 'dischtime', 'admission_type',
       'discharge_location', 'gender', 'dod', 'age_at_admit',
       'length_of_stay_days', 'icd10_codes', 'icd10_descriptions',
       'num_icd10_codes', 'primary_icd10', 'icd_version', 'primary_icd10_desc',
       'primary_pdgm_bucket_simple', 'procedures_icd10',
       'num_procedures_total', 'meds_discharge_like', 'medication_count',
       'policy', 'discharge_summary', 'diagnosis_codes', 'model',
       'true_icd_codes', 'missing_codes'],
      dtype='object')

In [8]:
# Row-wise check that icd10_codes and true_icd_codes hold the same codes in
# the same order; each cell is converted to a plain list via .tolist() before
# the comparison.
val_data['icd10_codes'].apply(lambda x: x.tolist()) == val_data['true_icd_codes'].apply(lambda x: x.tolist())

0      True
1      True
2      True
3      True
4      True
       ... 
218    True
219    True
220    True
221    True
222    True
Length: 223, dtype: bool

In [8]:
# Take the first five validation rows: their summaries become the query texts
# and their true ICD codes are kept for comparison against the retrieved hits.
summaries = val_data['discharge_summary'].head(5)
codes = val_data['true_icd_codes'].head(5)

# One query per summary. n_results is not passed, so the library default
# number of hits per query applies.
results = chroma_collection.query(query_texts=summaries.tolist())

In [None]:
# Metadata of the last hit returned for the first query text.
# NOTE(review): this cell has no execution count, so the output recorded
# beneath it may come from an earlier version of the expression.
results['metadatas'][0][-1]

[{'icd_codes': 'I130,I5023,I21A1,N179,N184,I050,Z952,I2510,Z951,E1121,J449,E785,G2581,K219,Z794,F419,F329,Z87891',
  'discharge_summary': 'History of present illness: 67-year-old man with a history of prosthetic mitral valve, prior CABG, ischemic heart disease, type 2 diabetes with nephropathy, COPD and known chronic kidney disease (stage 4) was admitted urgently for acute decompensated systolic heart failure with volume overload and concurrent acute kidney injury. Troponin elevation consistent with type 2 myocardial infarction was also noted during admission.\n\nHospital course: Over a 9.7-day hospitalization the patient received guided IV diuresis with careful electrolyte and renal monitoring, afterload reduction with hydralazine and isosorbide mononitrate for symptomatic ischemia and blood pressure control, optimization of glycemic control with insulin, and inhaled bronchodilator therapy for COPD exacerbation. Serial cardiac enzymes and telemetry were monitored; no ischemic interven

In [25]:
# Show which fields the query result dictionary contains.
results.keys()

dict_keys(['ids', 'embeddings', 'documents', 'uris', 'included', 'data', 'metadatas', 'distances'])

In [10]:
# Distances of the hits returned for the first query text.
results['distances'][0]

[0.11317634582519531,
 0.12990045547485352,
 0.13044601678848267,
 0.13913506269454956,
 0.13949400186538696,
 0.140264630317688,
 0.14460068941116333,
 0.15482497215270996,
 0.16387319564819336,
 0.16665416955947876]

In [11]:
# Print the text of the last hit returned for the first query text.
print(results['documents'][0][-1])

History of Present Illness: 64-year-old male admitted under observation for acute on chronic heart failure in the setting of hypertensive heart disease and chronic kidney disease (CKD stage 4). He presented with progressive dyspnea, lower extremity edema, and poorly controlled blood pressure. Significant comorbidities include atrial septal defect, type 2 diabetes mellitus with diabetic CKD, COPD/OSA, prior colorectal malignancy, hyperlipidemia, and anemia of CKD.

Hospital Course: Length of stay 7.2 days. Patient received diuretic optimization with symptomatic improvement in congestion, careful volume management given CKD stage 4, and blood pressure control with continuation/adjustment of carvedilol and addition/titration of hydralazine as indicated. Glycemic management was achieved with basal-bolus insulin while inpatient. Inhaled bronchodilator and inhaled corticosteroid/long-acting beta-agonist therapy were continued for COPD. Antiplatelet therapy (aspirin) was continued; prophylact

In [17]:
# Split the comma-separated icd_codes string stored on the last hit of the
# first query and sort the codes so they are easy to compare by eye.
sorted(results['metadatas'][0][-1]['icd_codes'].split(","))

['C182',
 'D122',
 'D123',
 'D509',
 'D631',
 'E039',
 'E1122',
 'E1165',
 'E559',
 'E785',
 'F329',
 'G3184',
 'G4733',
 'I130',
 'I2510',
 'I5033',
 'I69922',
 'J449',
 'K219',
 'M109',
 'M1990',
 'N184',
 'Q211',
 'Z20822',
 'Z794',
 'Z87891',
 'Z9111',
 'Z9181']

In [12]:
# Print the first query summary. summaries is a slice of a Series, so plain
# [0] would be a lookup by index label and only works while the validation
# frame's index starts at 0; .iloc[0] selects the first element by position.
print(summaries.iloc[0])

History of Present Illness: 72-year-old man admitted emergently with acute on chronic systolic heart failure (I50.23) complicated by cardiogenic shock and type 2 myocardial infarction. Admission course was notable for acute tubular necrosis (AKI, N17.0), severe protein-calorie malnutrition, hyponatremia, and hepatic dysfunction. Prior history includes permanent atrial fibrillation and chronic ischemic heart disease.

Hospital Course: Patient required hemodynamic support with dobutamine infusion for cardiogenic shock and continuous telemetry. Over a 19.6-day hospitalization he was diuresed judiciously, received careful electrolyte and volume management, and required temporary renal support measures while AKI improved but did not fully normalize. Nutrition was optimized with dietitian involvement and oral nutrition supplements for severe malnutrition. Hyponatremia and hepatic chemistries trended toward baseline with supportive care. Antithrombotic strategy was implemented given atrial fi

In [19]:
# Sorted true ICD codes of the first query row, for comparison with the codes
# of the retrieved hits. .iloc[0] picks the first element by position; plain
# [0] would be a lookup by index label and break if the index did not start at 0.
sorted(codes.iloc[0])

['E43',
 'E861',
 'E871',
 'E872',
 'F4310',
 'I21A1',
 'I2510',
 'I255',
 'I340',
 'I428',
 'I4821',
 'I5023',
 'K5900',
 'K7200',
 'N170',
 'Q6432',
 'R570',
 'Z20822',
 'Z515',
 'Z66',
 'Z6822',
 'Z7901',
 'Z955']