In [1]:
# Install all relevant packages
!pip install transformers
!pip install torch
!pip install pip install mendelai-brat-parser
!pip install smart-open
!pip install -U scikit-learn

Collecting transformers
  Downloading transformers-4.38.1-py3-none-any.whl.metadata (131 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.1/131.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers)
  Downloading huggingface_hub-0.21.1-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2023.12.25-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m722.6 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting tokenizers<0.19,>=0.14 (from transformers)
  Downloading tokenizers-0.15.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Downloading transformers-4

In [1]:
# Import libraries
import transformers
import torch
import torch.nn as nn
import itertools

from transformers import BertTokenizer, BertForTokenClassification, BertModel
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import BertTokenizerFast,  BatchEncoding
from tokenizers import Encoding
from transformers import AutoTokenizer, AutoModel


from brat_parser import get_entities_relations_attributes_groups
import zipfile
import os

from dataclasses import dataclass
from torch.utils.data import Dataset
from transformers import PreTrainedTokenizerFast
from dataclasses import dataclass
from typing import List
from torch.utils.data.dataloader import DataLoader
from transformers import BertForTokenClassification, AdamW
from torch.nn import functional as F
from sklearn.metrics import f1_score
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
import pandas as pd

ModuleNotFoundError: No module named 'transformers'

In [5]:
# Load the clinical notes CSV into a DataFrame and preview its first rows
import pandas as pd

clinical_notes = pd.read_csv(
    "../Data/MIMIC_resources/MODIFIED_NOTEEVENTS_2083180.csv"
)
clinical_notes.head(n=5)

In [None]:
# The TEXT column is the section of interest.
# Create a small subsampled DataFrame as an example: only the first
# N_SUBSAMPLED_NOTES rows are kept. Raise the value to process more notes.
N_SUBSAMPLED_NOTES = 10

subsampled_notes_df = clinical_notes.head(N_SUBSAMPLED_NOTES)
print(subsampled_notes_df.columns)

# Plain Python list of note texts, in the same row order as subsampled_notes_df
clin_notes = subsampled_notes_df['TEXT'].tolist()

Index(['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'CHARTDATE', 'CHARTTIME',
       'STORETIME', 'CATEGORY', 'DESCRIPTION', 'CGID', 'ISERROR', 'TEXT'],
      dtype='object')


In [None]:
# Display the raw text of the first subsampled note as a sanity check
clin_notes[0]

'Admission Date:  [**2151-7-16**]       Discharge Date:  [**2151-8-4**]\n\n\nService:\nADDENDUM:\n\nRADIOLOGIC STUDIES:  Radiologic studies also included a chest\nCT, which confirmed cavitary lesions in the left lung apex\nconsistent with infectious process/tuberculosis.  This also\nmoderate-sized left pleural effusion.\n\nHEAD CT:  Head CT showed no intracranial hemorrhage or mass\neffect, but old infarction consistent with past medical\nhistory.\n\nABDOMINAL CT:  Abdominal CT showed lesions of\nT10 and sacrum most likely secondary to osteoporosis. These can\nbe followed by repeat imaging as an outpatient.\n\n\n\n                            [**First Name8 (NamePattern2) **] [**First Name4 (NamePattern1) 1775**] [**Last Name (NamePattern1) **], M.D.  [**MD Number(1) 1776**]\n\nDictated By:[**Hospital 1807**]\nMEDQUIST36\n\nD:  [**2151-8-5**]  12:11\nT:  [**2151-8-5**]  12:21\nJOB#:  [**Job Number 1808**]\n'

In [None]:
# Embed every note in `clin_notes` with Bio_ClinicalBERT, keeping the hidden state
# of the first ([CLS]) token as the note-level embedding.
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model     =     AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
# Inference only: make evaluation mode (dropout disabled) explicit
model.eval()

embeddings = []
with torch.no_grad():  # no gradients are needed to extract embeddings
    for note in clin_notes:
        # One note per forward pass; anything beyond 512 tokens is truncated
        inputs = tokenizer(note, return_tensors="pt", padding=True, truncation=True, max_length=512)
        outputs = model(**inputs)
        # Take the last layer's hidden state at token position 0 ([CLS])
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        # Drop only the leading batch axis (size 1 here) and store a plain list of floats
        embeddings.append(cls_embedding.squeeze(0).tolist())

# Create a DataFrame pairing each note with its embedding.
# Note: the "Clinical Note" column holds the note's ROW_ID, not the note text.
data = {"Clinical Note": subsampled_notes_df['ROW_ID'].tolist(), "Embedding": embeddings}

embeddings_df = pd.DataFrame(data)

In [None]:
# Preview the first rows of the ROW_ID / embedding table as plain text
print(embeddings_df.head())

   Clinical Note                                          Embedding
0            174  [0.41959381103515625, 0.12878957390785217, -0....
1            175  [-0.1278892457485199, -0.009227210655808449, -...
2            176  [-0.06606630980968475, 0.21083956956863403, -0...
3            177  [0.026115955784916878, 0.27625882625579834, -0...
4            178  [0.21620851755142212, -0.03747304901480675, -0...


In [None]:
# Pick one row at random; sample() hands back a one-row DataFrame,
# so the list below wraps a single embedding vector.
random_record = embeddings_df.sample(1)
random_record["Embedding"].tolist()

[[-0.10072151571512222,
  0.23745352029800415,
  -0.27874448895454407,
  0.032923657447099686,
  0.03463217616081238,
  0.015540647320449352,
  0.1555711030960083,
  0.046086329966783524,
  0.5651464462280273,
  -0.2864580750465393,
  -0.2561330199241638,
  0.2596397399902344,
  -0.3969949781894684,
  -0.007077665533870459,
  -0.07601746171712875,
  0.037903718650341034,
  0.011287973262369633,
  0.3749655783176422,
  0.007569611072540283,
  -0.12798888981342316,
  -0.27459952235221863,
  0.08309374749660492,
  -0.3416343033313751,
  -0.14199139177799225,
  -0.2138790637254715,
  -0.047468867152929306,
  0.05355123430490494,
  0.7388127446174622,
  0.3217769265174866,
  0.31490808725357056,
  -0.06379248946905136,
  0.4187043309211731,
  -0.48886170983314514,
  -0.02133927308022976,
  -0.3510068655014038,
  0.26868000626564026,
  0.41199201345443726,
  0.24516041576862335,
  0.18829375505447388,
  0.3625430464744568,
  -0.3099880516529083,
  0.14292167127132416,
  0.5265316367149353,
 