In [1]:
# Install all relevant packages
!pip install transformers
!pip install torch
!pip install pip install mendelai-brat-parser
!pip install smart-open
!pip install -U scikit-learn



In [2]:
# Import libraries
import transformers
import torch
import torch.nn as nn
import itertools

from transformers import BertTokenizer, BertForTokenClassification, BertModel
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import BertTokenizerFast,  BatchEncoding
from tokenizers import Encoding
from transformers import AutoTokenizer, AutoModel


from brat_parser import get_entities_relations_attributes_groups
import zipfile
import os

from dataclasses import dataclass
from torch.utils.data import Dataset
from transformers import PreTrainedTokenizerFast
from dataclasses import dataclass
from typing import List
from torch.utils.data.dataloader import DataLoader
from transformers import BertForTokenClassification, AdamW
from torch.nn import functional as F
from sklearn.metrics import f1_score
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read in clinical notes file

clinical_notes = pd.read_csv("../Data/MIMIC_resources/NOTEEVENTS.csv")


  clinical_notes = pd.read_csv("../Data/MIMIC_resources/NOTEEVENTS.csv")


In [4]:
# Text column is the section of interest
# Creating a subsampled dataframe as an example with the first 1000 notes

subsampled_notes_df = clinical_notes.head(10)
print(subsampled_notes_df.columns)

clin_notes = subsampled_notes_df['TEXT'].tolist()

Index(['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'CHARTDATE', 'CHARTTIME',
       'STORETIME', 'CATEGORY', 'DESCRIPTION', 'CGID', 'ISERROR', 'TEXT'],
      dtype='object')


In [5]:
clin_notes[0]

'Admission Date:  [**2151-7-16**]       Discharge Date:  [**2151-8-4**]\n\n\nService:\nADDENDUM:\n\nRADIOLOGIC STUDIES:  Radiologic studies also included a chest\nCT, which confirmed cavitary lesions in the left lung apex\nconsistent with infectious process/tuberculosis.  This also\nmoderate-sized left pleural effusion.\n\nHEAD CT:  Head CT showed no intracranial hemorrhage or mass\neffect, but old infarction consistent with past medical\nhistory.\n\nABDOMINAL CT:  Abdominal CT showed lesions of\nT10 and sacrum most likely secondary to osteoporosis. These can\nbe followed by repeat imaging as an outpatient.\n\n\n\n                            [**First Name8 (NamePattern2) **] [**First Name4 (NamePattern1) 1775**] [**Last Name (NamePattern1) **], M.D.  [**MD Number(1) 1776**]\n\nDictated By:[**Hospital 1807**]\nMEDQUIST36\n\nD:  [**2151-8-5**]  12:11\nT:  [**2151-8-5**]  12:21\nJOB#:  [**Job Number 1808**]\n'

In [6]:
from transformers import AutoTokenizer, AutoModel
import torch

# Ensure CUDA is available, otherwise use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("CUDA version:", torch.version.cuda)
    print("GPU details:", torch.cuda.get_device_name(0))

# *************************************************************************

tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

# *************************************************************************

# Move the model to the selected device (GPU or CPU)
model.to(device)

# *************************************************************************

embeddings = []
counter = 0


# *************************************************************************

with torch.no_grad():
    for note in clin_notes:
        counter += 1
        # Tokenize the note, ensuring to specify padding, truncation, and max length
        inputs = tokenizer(note, return_tensors="pt", padding=True, truncation=True, max_length=512)
        
        # Move the inputs to the same device as the model
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        # Generate embeddings
        outputs = model(**inputs)
        
        # Get the last layer's hidden states
        hidden_states = outputs.last_hidden_state
        
        # Get the embedding for the [CLS] token (first token)
        cls_embedding = hidden_states[:, 0, :]  # Select the first token ([CLS])
        
        # Move the embeddings back to CPU for further processing or storage
        cls_embedding_cpu = cls_embedding.squeeze().to('cpu').tolist()
        
        embeddings.append(cls_embedding_cpu)



PyTorch version: 2.2.1+cu121
CUDA available: False


In [7]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

embeddings = []
counter = 0
with torch.no_grad():
    for note in clin_notes:
        counter += 1
        #print("Going through note:", counter)
        inputs = tokenizer(note, return_tensors="pt", padding=True, truncation=True, max_length=512)
        outputs = model(**inputs)
        # Get the last layer's hidden states
        hidden_states = outputs.last_hidden_state
        # Get the embedding for the [CLS] token (first token)
        cls_embedding = hidden_states[:, 0, :]  # Select the first token ([CLS])
        #print(cls_embedding.squeeze().tolist())
        #print(len(cls_embedding.squeeze().tolist()))
        embeddings.append(cls_embedding.squeeze().tolist())



# Create a DataFrame containing the clinical notes and their embeddings
data = {"Clinical Note": subsampled_notes_df['ROW_ID'].tolist(), "Embedding": embeddings}


embeddings_df = pd.DataFrame(data)




In [8]:
print(embeddings_df.head())

   Clinical Note                                          Embedding
0            174  [0.4195939600467682, 0.12878939509391785, -0.1...
1            175  [-0.12788911163806915, -0.009227218106389046, ...
2            176  [-0.06606652587652206, 0.21083964407444, -0.42...
3            177  [0.0261161457747221, 0.2762588560581207, -0.41...
4            178  [0.216208353638649, -0.03747297823429108, -0.6...


In [9]:
random_record = embeddings_df.sample(n=1)  # This returns a DataFrame with one randomly selected row
random_record['Embedding'].tolist()

[[0.0261161457747221,
  0.2762588560581207,
  -0.4113757312297821,
  0.3052580654621124,
  -0.13184183835983276,
  -0.12275976687669754,
  0.026429856196045876,
  0.1561032086610794,
  0.5078171491622925,
  -0.344385027885437,
  -0.03216106817126274,
  0.40418559312820435,
  -0.196963369846344,
  0.007182513363659382,
  0.08429527282714844,
  -0.02628474123775959,
  -0.07020526379346848,
  0.38187283277511597,
  0.17313796281814575,
  -0.2423737347126007,
  -0.25155818462371826,
  -0.04098164290189743,
  -0.22683309018611908,
  0.045699432492256165,
  -0.18964053690433502,
  -0.3320840299129486,
  0.16241316497325897,
  0.8446181416511536,
  0.1864463835954666,
  0.3377116322517395,
  -0.278943806886673,
  0.38990727066993713,
  -0.4589537978172302,
  -0.08904831111431122,
  -0.312784880399704,
  0.11458893120288849,
  0.0161126721650362,
  0.21143455803394318,
  0.022208066657185555,
  0.19419518113136292,
  -0.25883203744888306,
  -0.019731085747480392,
  0.7549178600311279,
  0.2259