In [16]:
import pandas as pd

def merging(df1, df2):
    """
    Join the original notes with their summaries, one row per matching admission.

    Args:
        df1: DataFrame holding the original notes; joined on 'SUBJECT_ID' and
            'HADM_ID', with its 'TEXT' column ending up as 'TEXT' in the result.
        df2: DataFrame holding the summaries; its 'summarized TEXT' column ends
            up as 'SUMMARY' in the result.

    Returns:
        A new DataFrame produced by an inner join (the pandas default) on
        ['SUBJECT_ID', 'HADM_ID']. Neither input frame is modified.
    """
    join_keys = ['SUBJECT_ID', 'HADM_ID']

    # Give both text columns the same name so the merge suffixes tell them apart.
    summaries = df2.rename(columns={'summarized TEXT': 'TEXT'})
    joined = df1.merge(summaries, on=join_keys, suffixes=('_text', '_summary'))

    # Map the suffixed names back to the final column names.
    return joined.rename(columns={'TEXT_text': 'TEXT', 'TEXT_summary': 'SUMMARY'})

# Summarization models whose outputs are joined back onto the original notes.
summarizing_models = ['1_t5_small2', '3_bart_large_cnn','4_medical_summarization']

# The original notes do not depend on the model, so read them once instead of
# re-parsing the same CSV on every loop iteration. `merging` returns a new
# frame and does not modify df1, so sharing it across iterations is safe.
df1 = pd.read_csv('../../../Data/unstructured/text/ALL_first_last.csv')

for mdl in summarizing_models:
    # Summaries produced by the current model.
    df2 = pd.read_csv(f'../../../Data/unstructured/summarized/ALL_first_last_SUMMARY_ONLY/{mdl}.csv')

    # Keep only the identifiers plus the text/summary pair, then write one
    # combined CSV per model.
    merged_DF = merging(df1, df2)[['SUBJECT_ID', 'HADM_ID', 'TEXT', 'SUMMARY']]
    merged_DF.to_csv(f'../../../Data/unstructured/summarized/{mdl}.csv', index=False)
    print(merged_DF.shape)
# Displays the frame of the last model processed in the loop.
merged_DF

(42142, 4)
(42142, 4)
(42142, 4)


Unnamed: 0,SUBJECT_ID,HADM_ID,TEXT,SUMMARY
0,58526.0,100001.0,Nursing:\n.\nNursing/Other:\n.\nRadiology:\n[*...,eval for infiltrate __________________________...
1,54610.0,100003.0,Nursing:\n59 year old male with HCV-related ci...,background : nursing : 59 year old male with H...
2,9895.0,100006.0,Nursing:\n.\nNursing/Other:\nPMICU Nursing Pro...,a 48 y/o woman admitted to MICU from ED for re...
3,,100007.0,Nursing:\n.\nNursing/Other:\nadmit note\npt ad...,"pt admitted to sicu s/p exploratory lap, lysis..."
4,533.0,100009.0,Nursing:\n.\nNursing/Other:\n.\nRadiology:\n[*...,a 60 year old man with CHD REASON FOR THIS EXA...
...,...,...,...,...
42137,20785.0,199993.0,Nursing:\n.\nNursing/Other:\nCSRU Admission No...,background : pt safety is a key component of a...
42138,23761.0,199994.0,Nursing:\n.\nNursing/Other:\nNURSING ADM NOTE\...,"aDM - based adolescents ( a ) , a subset of ad..."
42139,19412.0,199995.0,Nursing:\n.\nNursing/Other:\nNEURO: PT ARRIVED...,a total of 61 - 80% of the population in the u...
42140,27200.0,199998.0,Nursing:\n.\nNursing/Other:\nResp Care\n\nPt r...,weaned off and pt awakened. ecchymosis to R hi...


In [17]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import os


def generate_embeddings_batch(texts, tokenizer, model, device):
    """
    Generate CLS token embeddings for a batch of texts using the supplied
    tokenizer and model (BioClinicalBERT in this notebook).

    Args:
        texts: list of strings to embed.
        tokenizer: callable invoked as ``tokenizer(texts, return_tensors="pt",
            padding=True, truncation=True, max_length=512)``; its result must
            support ``.to(device)`` and ``**`` unpacking.
        model: callable invoked with the tokenized inputs as keyword arguments;
            its result must expose ``last_hidden_state``.
        device: device the tokenized inputs are moved to before the forward pass.

    Returns:
        A list with one embedding per input text, each a list of floats taken
        from the first token position of ``last_hidden_state``. An empty
        ``texts`` yields an empty list.
    """
    # Nothing to embed: skip the tokenizer/model entirely rather than relying
    # on how they happen to treat an empty batch.
    if len(texts) == 0:
        return []

    # Inputs longer than 512 tokens are truncated; shorter ones are padded to
    # the longest sequence in the batch.
    inputs = tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    ).to(device)
    # Inference only, so gradient tracking is disabled.
    with torch.no_grad():
        outputs = model(**inputs)
        # Position 0 along the sequence axis; moved to CPU and converted to
        # plain Python lists so the result can be stored in a DataFrame.
        cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().tolist()
    return cls_embeddings

def _embed_column(texts, tokenizer, model, device, batch_size, desc):
    """
    Embed every entry of a pandas Series of strings, one batch at a time.

    Args:
        texts: pandas Series of strings to embed.
        tokenizer, model, device: forwarded to ``generate_embeddings_batch``.
        batch_size: number of rows sent through the model per call.
        desc: label shown on the tqdm progress bar.

    Returns:
        A list with one embedding per row of ``texts``, in row order.
    """
    embeddings = []
    for start in tqdm(range(0, len(texts), batch_size), desc=desc):
        # .iloc makes the positional slicing explicit, whatever the index labels are.
        batch = texts.iloc[start:start + batch_size].tolist()
        embeddings.extend(generate_embeddings_batch(batch, tokenizer, model, device))
    return embeddings


if __name__ == "__main__":
    # Load BioClinicalBERT model
    print("Loading BioClinicalBERT model...")
    tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
    model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    print("Model loaded.")

    # ===============================================================================
    # File paths (each value can be overridden through an environment variable)
    thePath     = os.getenv('thePath', '../../../Data/unstructured')
    smrz_model  = os.getenv('smrz_model', '1_t5_small2')
    input_file  = os.getenv('input_file',  'ALL_first_last')  # not used below in this cell
    # Row limit for quick trial runs; defaults to the previous hard-coded 10.
    # Set the variable to an empty string to process the whole file.
    max_rows    = os.getenv('max_rows', '10')

    input_path  = f'{thePath}/summarized/{smrz_model}.csv'
    output_path = f'{thePath}/emb/{smrz_model}.csv'

    # ===============================================================================
    # Load input data (identifiers are read as strings)
    print("Loading input data...")
    data = pd.read_csv(input_path, dtype={"HADM_ID": str, "SUBJECT_ID": str})
    if max_rows:
        data = data.head(int(max_rows))
    print(f"Dataset loaded. Total rows: {len(data)}")

    # ===============================================================================
    # Ensure text fields are non-null and converted to strings
    print("Preprocessing text columns...")
    data["TEXT"]    = data["TEXT"].fillna("").astype(str)
    data["SUMMARY"] = data["SUMMARY"].fillna("").astype(str)

    # ===============================================================================
    batch_size = 32  # Adjust based on GPU memory

    # ===============================================================================
    # Generate embeddings for 'text'
    print("Generating embeddings for 'text' column...")
    emb_text = _embed_column(
        data["TEXT"], tokenizer, model, device, batch_size, desc="Processing 'text'"
    )

    # ===============================================================================
    # Generate embeddings for 'summary'
    print("Generating embeddings for 'summary' column...")
    emb_summary = _embed_column(
        data["SUMMARY"], tokenizer, model, device, batch_size, desc="Processing 'summary'"
    )

    # ===============================================================================
    # Create new dataframe with embeddings (one row per input row)
    print("Creating final dataframe with embeddings...")
    result_df = pd.DataFrame({
        "HADM_ID": data["HADM_ID"],
        "SUBJECT_ID": data["SUBJECT_ID"],
        "EMB_TEXT": emb_text,
        "EMB_SUMMARY": emb_summary
    })

    # ===============================================================================
    # NOTE(review): saving is currently disabled; re-enable once the row limit
    # above has been lifted and the output directory exists.
    # # Save to CSV
    # print(f"Saving embeddings to {output_path}...")
    # result_df.to_csv(output_path, index=False)
    # print(f"Embeddings successfully saved to {output_path}.")

    # A bare `result_df` here would sit inside the `if` body, where the notebook
    # does not auto-display it; inspect `result_df` in its own cell instead.

  from .autonotebook import tqdm as notebook_tqdm


Loading BioClinicalBERT model...
Model loaded.
Loading input data...
Dataset loaded. Total rows: 10
Preprocessing text columns...
Generating embeddings for 'text' column...


Processing 'text': 100%|██████████| 1/1 [00:12<00:00, 12.85s/it]


Generating embeddings for 'summary' column...


Processing 'summary': 100%|██████████| 1/1 [00:01<00:00,  1.67s/it]

Creating final dataframe with embeddings...





In [None]:
# Display the input frame loaded by the embedding cell.
# NOTE(review): this cell has no execution count and its saved output shows
# different rows/columns than the embedding results shown further down —
# it looks stale; re-run before relying on it.
data

Unnamed: 0.1,Unnamed: 0,HADM_ID,SUBJECT_ID,TEXT,SUMMARY
0,0,100003.0,54610,"59 year old male with HCV-related cirrhosis, g...","59-year-old male with cirrhosis, grade I/II es..."
1,1,100018.0,58128,Impaired Physical Mobility\n Assessment:\n ...,patient able to lift & hold upper extremities ...
2,2,100020.0,9973,58yom withl progressive MS since [**2119**] on...,58yom withl progressive MS since [**2119**] on...
3,3,100063.0,69282,Pt is 46y/o male who was found lying outside a...,"while in the triage area, pt somehow fell out ..."
4,4,100068.0,9889,74 y/o CABGx4 on [**2192-1-13**]. EZ TUBE. Di...,pt on betablockade and known to have heart blo...
5,5,100072.0,50379,"36M, DMI, insulin pump, normally well controll...","36M, DMI, insulin pump, normally well controll..."
6,6,100116.0,55105,Pt s/p 30 foot fall from [**Doctor Last Name *...,"head CT shows left basalar skull fx, right SAH..."
7,7,100117.0,14297,"Pneumonia, bacterial, community acquired (CAP)...","CPT done w/ repositioning in bed, encouraged t..."
8,8,100126.0,75970,TITLE: HPI:\n Pt is a 51 yo M tx from [**Hos...,Pt is a 51 yo M tx from [**Hospital3 362**] s/...
9,9,100133.0,70485,"HPI:\n 45yo M w/EtOH cirrhosis, MELD 28, Chi...",denies change in health since previous admissi...


In [18]:
# Display the embeddings frame built by the embedding cell; it is shown here in
# its own cell because that cell produced no table output.
result_df

Unnamed: 0,HADM_ID,SUBJECT_ID,EMB_TEXT,EMB_SUMMARY
0,100001.0,58526.0,"[-0.06878875941038132, -0.0074307480826973915,...","[0.19394980370998383, 0.28114089369773865, 0.0..."
1,100003.0,54610.0,"[0.11134085804224014, 0.23517034947872162, -0....","[0.22276471555233002, 0.11342918127775192, 0.0..."
2,100006.0,9895.0,"[-0.18066149950027466, 0.18932631611824036, -0...","[0.04641161486506462, -0.1912076324224472, -0...."
3,100007.0,,"[-0.1941411793231964, 0.12399176508188248, -0....","[0.06885648518800735, 0.3670652210712433, -0.5..."
4,100009.0,533.0,"[0.10732075572013855, -0.08329775929450989, -0...","[0.06423552334308624, -0.018912216648459435, -..."
5,100010.0,55853.0,"[-0.17905670404434204, -0.04523336887359619, -...","[-0.008342467248439789, 0.19690968096256256, -..."
6,100011.0,87977.0,"[-0.078480064868927, -0.13545627892017365, -0....","[0.13416405022144318, 0.2285039722919464, 0.26..."
7,100012.0,60039.0,"[0.019372183829545975, -0.13762210309505463, 0...","[0.2605943977832794, 0.0402924083173275, 0.428..."
8,100016.0,68591.0,"[0.21100114285945892, -0.03668534383177757, 0....","[0.5256306529045105, -0.1084759533405304, -0.0..."
9,100018.0,58128.0,"[0.178505077958107, 0.14203724265098572, -0.29...","[0.28787025809288025, -0.07443398982286453, -0..."
