## LLM on MIMIC notes

In [9]:
from transformers import AutoTokenizer, AutoModel

# Load the pre-trained model and tokenizer

tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

# Sample text
text = "Your specific text goes here."

# Tokenize the text
inputs = tokenizer(text, return_tensors="pt")

# Pass the tokens through the model to obtain embeddings

outputs = model(**inputs)

# The embeddings are present in the output dictionary
# You can access the embeddings using outputs.last_hidden_state or outputs.pooler_output
# For example, if you want the embeddings from the last layer:

embeddings = outputs.last_hidden_state

# Now, 'embeddings' contains the word embeddings for the input text
tokenized_input = tokenizer.tokenize(text)
print("Tokenized input:", tokenized_input)

# Tokenize the text
tokenized_text = tokenizer.tokenize(text)

# Find the token corresponding to "Your"
target_token = "your"
target_token_index = tokenized_text.index(target_token)

print("Token corresponding to 'Your':", tokenized_text[target_token_index])

Tokenized input: ['your', 'specific', 'text', 'goes', 'here', '.']
Token corresponding to 'Your': your


In [2]:
embeddings.shape, 

# 1: batch size
# 8: sequence length (number of tokens)
# 768: each token has a 768 vector!

torch.Size([1, 8, 768])

In [3]:
embeddings


tensor([[[ 0.5283, -0.2056, -0.3272,  ..., -0.0900,  0.4362, -0.1856],
         [ 0.2664, -0.1394,  0.0100,  ...,  0.2744,  0.6947,  0.1007],
         [ 0.3983,  0.2867,  0.1368,  ...,  0.4208,  0.2904, -0.1188],
         ...,
         [ 0.5439, -0.1960, -0.1821,  ...,  0.0773,  0.7764, -0.7254],
         [ 0.1831, -0.3898,  0.0782,  ...,  0.2574,  0.1892, -0.3407],
         [ 1.0484,  0.7031, -0.3386,  ..., -0.0218,  1.0521, -0.0525]]],
       grad_fn=<NativeLayerNormBackward0>)

In [10]:
# Create a dictionary to store word embeddings
word_embeddings = {}

# Iterate over tokens and their corresponding embeddings
for token, embedding in zip(tokenized_input, embeddings[0]):
    # Convert tensor to list for better handling
    embedding = embedding.tolist()
    # Store the embedding in the dictionary
    word_embeddings[token] = embedding

{'your': [0.528301477432251, -0.20562493801116943, -0.3272034227848053, -0.026769138872623444, 0.019964981824159622, 0.2810690402984619, 0.5092201232910156, -0.10947103798389435, 0.3674233555793762, 0.011772990226745605, -0.2508663535118103, 0.02759428508579731, -0.3169240653514862, -0.2867757976055145, -0.364435076713562, 0.389488160610199, -0.0492461621761322, -0.20744669437408447, -0.5321357846260071, -0.12328062206506729, -0.05113980546593666, 0.27067506313323975, 0.3962523341178894, -0.3049863874912262, 0.26655855774879456, -0.4048808813095093, 0.1573331654071808, 0.4712720215320587, 0.054230302572250366, 0.4550130367279053, 0.21051621437072754, 0.10328268259763718, 0.26480549573898315, -0.08462903648614883, -0.34896010160446167, 0.360117644071579, -0.14722365140914917, 0.6770817041397095, -0.5419507026672363, 0.01930011808872223, -0.10918217152357101, -0.07052287459373474, 1.0475046634674072, -0.37579330801963806, 0.03522072359919548, -0.7381231784820557, 0.4011487364768982, 0.32

In [16]:
# Get the mean pooling of token embeddings across all tokens in the document
import torch

document_embedding = torch.mean(embeddings, dim=1)

# Now document_embedding is the document-level embedding for the input text
print(document_embedding.shape)

torch.Size([1, 768])
