In [1]:
import pandas as pd
import re
import zipfile
from zipfile import ZipFile
import numpy as np

In [2]:
#with ZipFile('mimic-iv-note-deidentified-free-text-clinical-notes-2.2.zip', 'r') as f:
    #f.extractall()

In [3]:
#reading MIMIC IV note dataset
# The path points into the directory produced by extracting the MIMIC-IV-Note zip.
discharge_notes_path = 'mimic-iv-note-deidentified-free-text-clinical-notes-2.2/note/discharge.csv.gz'
df = pd.read_csv(discharge_notes_path)
#df.drop(df.index[100:], inplace=True)

In [4]:
# Show the first note's raw text to inspect the layout of a report.
first_note_text = df['text'][0]
print(first_note_text)

 
Name:  ___                     Unit No:   ___
 
Admission Date:  ___              Discharge Date:   ___
 
Date of Birth:  ___             Sex:   F
 
Service: MEDICINE
 
Allergies: 
No Known Allergies / Adverse Drug Reactions
 
Attending: ___
 
Chief Complaint:
Worsening ABD distension and pain 
 
Major Surgical or Invasive Procedure:
Paracentesis

 
History of Present Illness:
___ HCV cirrhosis c/b ascites, hiv on ART, h/o IVDU, COPD, 
bioplar, PTSD, presented from OSH ED with worsening abd 
distension over past week.  
Pt reports self-discontinuing lasix and spirnolactone ___ weeks 
ago, because she feels like "they don't do anything" and that 
she "doesn't want to put more chemicals in her." She does not 
follow Na-restricted diets. In the past week, she notes that she 
has been having worsening abd distension and discomfort. She 
denies ___ edema, or SOB, or orthopnea. She denies f/c/n/v, d/c, 
dysuria. She had food poisoning a week ago from eating stale 
cake (n/v 20 min after fo

In [5]:
#sentence tokenization: each note is cut at every period, newline, or semicolon
# One list of fragments per note, in the same order as the rows of df.
allSentences = [re.split(r'\.|\n|;', note_text) for note_text in df['text']]

In [6]:
# Attach the per-note fragment lists as a new column.
df['sentence_tokens'] = allSentences

In [7]:
# Inspect the fragments produced for the note labelled 2.
df.loc[2, 'sentence_tokens']

[' ',
 'Name:  ___                     Unit No:   ___',
 ' ',
 'Admission Date:  ___              Discharge Date:   ___',
 ' ',
 'Date of Birth:  ___             Sex:   F',
 ' ',
 'Service: MEDICINE',
 ' ',
 'Allergies: ',
 'Percocet / Vicodin',
 ' ',
 'Attending: ___',
 ' ',
 'Chief Complaint:',
 'altered mental status',
 ' ',
 'Major Surgical or Invasive Procedure:',
 'none',
 ' ',
 'History of Present Illness:',
 'Mrs',
 ' ___ is a ___ female with HIV on HAART, COPD, HCV ',
 'cirrhosis complicated by ascites and hepatic encephalopathy who ',
 'initially presented to the ED yesterday with hypotension after a ',
 'paracentesis',
 '  ',
 'The patient has had accelerated decompensation of her cirrhosis ',
 'recently with worsening ascites, and she is maintained on twice ',
 'weekly paracentesis',
 ' She was at her regular session yesterday ',
 'when she had hypotension to SBP ___ and felt lightheadedness',
 ' ',
 "Per the patient, that's when her memory started to get fuzzy",
 ' ',
 'Sh

In [8]:
#manual keyword definitions
#common synonyms found on NCBO Bioportal Class Mappings
# Each entry is a regular-expression pattern. Abbreviations (MI, AMI, HF, IHD,
# ACS, CVA) are wrapped in \b word boundaries so they only match as whole words.
# NOTE(review): the "relevant sentence extraction" cell further down assigns a
# new, shorter list to `keywords` before any cell reads this one, so in a
# top-to-bottom run these expanded patterns are never applied. Confirm which
# list is meant to drive the extraction.
keywords = [r"(myocardial|cardiac) infarct",
            r"(myocardial|coronary|cardiac) necrosis",
            r"heart (attack|infarct|rupture)",
            r"(attack|attacks|attacking) (heart|coronary)",
            r"coronary attack",
            r"(coronary|coronary artery) (occlusion|rupture)",
            r"\bMI\b",
            r"\bAMI\b",
            r"(infarction|infarction,|infarction;|infarct|infarctions,|infarcts|infarctions|infarct,|infarcts,) (myocardial|heart)",
            r"(infarction|infarct|rupture) of (heart|the heart|myocardium|the myocardium)",
            r"(cardiac|heart|myocardial) failure",
            r"failure heart",
            r"myocarditis",
            r"pericarditis",
            r"\bHF\b",
            r"ischemic heart disease",
            r"myocardial ischemia",
            r"\bIHD\b",
            r"coronary syndrome",
            r"\bACS\b",
            r"stroke",
            r"cerebrovascular accident",
            r"cerebral infarction",
            r"\bCVA\b"]

In [9]:
#5-Point MACE Definition
# One canonical term per MACE component; later cells rely on this order.
main_terms = [
    'myocardial infarction',
    'coronary syndrome',
    'ischemic heart disease',
    'heart failure',
    'stroke',
]

In [10]:
# One row per MACE term; the vectors column is added by a later cell.
main_terms_df = pd.DataFrame(main_terms, columns=['main_terms'])
main_terms_df.head()

Unnamed: 0,main_terms
0,myocardial infarction
1,coronary syndrome
2,ischemic heart disease
3,heart failure
4,stroke


In [11]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from numpy.linalg import norm

In [12]:
def bert_text_preparation(text, tokenizer):
    """Turn a raw string into the inputs a BERT-style model expects.

    The text is wrapped in the ``[CLS]`` / ``[SEP]`` markers, split into
    tokens by ``tokenizer``, and the tokens are mapped to their ids. A
    segment id of 1 is produced for every token.

    Args:
        text (str): Text to be converted.
        tokenizer (obj): Object providing ``tokenize`` and
            ``convert_tokens_to_ids``.

    Returns:
        list: The tokens, including the two markers.
        obj: Torch tensor of shape [1, n_tokens] holding the token ids.
        obj: Torch tensor of shape [1, n_tokens] holding the segment ids.
    """
    wordpieces = tokenizer.tokenize("[CLS] " + text + " [SEP]")
    wordpiece_ids = tokenizer.convert_tokens_to_ids(wordpieces)

    # Wrapping each list in an outer list adds the leading batch axis.
    id_batch = torch.tensor([wordpiece_ids])
    segment_batch = torch.tensor([[1 for _ in wordpiece_ids]])

    return wordpieces, id_batch, segment_batch

In [13]:
def get_bert_embeddings(tokens_tensor, segments_tensors, model, pooling="last"):
    """Run the embedding model on one text and return its embedding.

    Args:
        tokens_tensor (obj): Torch tensor of token ids with a leading
            batch axis of size 1, as returned by bert_text_preparation.
        segments_tensors (obj): Torch tensor of the same shape, passed
            to the model as its second positional argument.
            NOTE(review): whether the model reads that argument as
            segment ids or as an attention mask depends on the model
            class -- confirm for the checkpoint in use.
        model (obj): Callable whose result exposes the final layer
            under the key 'last_hidden_state'.
        pooling (str): How the per-token vectors are reduced.
            "last" (default) returns the vector at the final token
            position, which is what this function has always returned;
            when the input comes from bert_text_preparation that
            position is presumably the "[SEP]" marker.
            "cls" returns the vector at the first position.
            "mean" averages over all token positions.
            "none" returns every token vector without reduction.

    Returns:
        list: For "last", "cls" and "mean", a flat list of
            n_embedding_dimensions floats. For "none", a list of
            n_tokens such lists.

    Raises:
        ValueError: If pooling is not one of the names above.
    """
    if pooling not in ("last", "cls", "mean", "none"):
        raise ValueError(
            "pooling must be one of 'last', 'cls', 'mean', 'none'; got %r" % (pooling,)
        )

    # Gradient calculation is disabled: this is inference only.
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)

    # Remove only the batch axis. A bare squeeze() would also collapse a
    # token axis of length 1 and make the indexing below pick a scalar.
    token_states = outputs['last_hidden_state'].squeeze(0)

    if pooling == "none":
        return token_states.tolist()
    if pooling == "mean":
        return token_states.mean(dim=0).tolist()
    if pooling == "cls":
        return token_states[0].tolist()
    # "last": index -1 on the final-layer states selects the last token,
    # not the last layer.
    return token_states[-1].tolist()

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
# stop_words='english' makes fit_transform drop English stop words automatically.
vectorizer = CountVectorizer(stop_words='english')

In [None]:
#concatenate the text of every MIMIC IV report into one corpus string
# (nothing is de-duplicated here; the vectorizer builds the vocabulary later).
# Every fragment is prefixed with a single space. str.join assembles the result
# in one pass instead of re-copying a growing string on every `+=`.
words_collected = "".join(
    " " + sentence
    for sentences in df['sentence_tokens']
    for sentence in sentences
)
# The vectorizer expects an iterable of documents; the corpus is its only document.
all_words = [words_collected]

In [None]:
#DataFrame of all unique words
# `model` holds whatever fit_transform returns for the single corpus document;
# the fitted vectorizer's feature names are the distinct words it kept.
model = vectorizer.fit_transform(all_words)
vocabulary = vectorizer.get_feature_names_out()
all_words_df = pd.DataFrame({'words': vocabulary})
#print(len(all_words_df['words']))
all_words_df.tail()

In [None]:
#stop words the vectorizer was configured with (stop_words = 'english'), printed for reference
stop_words = vectorizer.get_stop_words()
print(stop_words)

In [None]:
from collections import Counter
import nltk
# Fetch the tagger data; presumably what `nltk.pos_tag` in the word-filtering
# cell below relies on -- confirm against the installed nltk version.
nltk.download('averaged_perceptron_tagger')

In [None]:
# `all_words_df['words']` lists every vocabulary entry exactly once, so counting
# that column would give each word a count of 1 and "most common" would just be
# the first 10000 entries. The real corpus frequencies are in the document-term
# matrix returned by `vectorizer.fit_transform`, whose columns follow the order
# of the feature names.
# NOTE: `model` must still be that matrix here. A later cell rebinds the name to
# the BERT model, so run this cell before that one.
term_counts = np.asarray(model.sum(axis=0)).ravel().tolist()
c = Counter(dict(zip(all_words_df['words'], term_counts)))
# List of (word, count) pairs, highest count first.
top_10000_words = c.most_common(10000)

In [None]:
#creating final list of unique & most common words
# Keep a word only if nltk tags it "NN" in isolation and it is longer than
# four characters; the counts are dropped.
final = [
    word
    for word, _count in top_10000_words
    if nltk.pos_tag([word])[0][1] == "NN" and len(word) > 4
]

In [None]:
# Display the filtered word list built in the previous cell.
final

In [None]:
# Tokenizer and weights are loaded from the same checkpoint name.
# Note that `model` is rebound here; it held the vectorizer output before.
clinical_bert_name = "medicalai/ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(clinical_bert_name)
model = AutoModel.from_pretrained(clinical_bert_name)

In [None]:
#storing embeddings for every unique word
# Maps each word in `final` to whatever get_bert_embeddings returns for it.
dct_final = {}
for word in final:
    word_tokens, word_ids, word_segments = bert_text_preparation(word, tokenizer)
    dct_final[word] = get_bert_embeddings(word_ids, word_segments, model)

In [None]:
#storing the vectors with their associated words in the DataFrame
# Look the embeddings up in the same order as `final` so rows stay aligned.
final_vectors = [dct_final[word] for word in final]

final_df = pd.DataFrame(final, columns = ['words'])
final_df['vectors'] = final_vectors
final_df.tail()

In [None]:
#creating vectors for each main MACE term
# Same preparation and embedding steps as for the vocabulary words.
dct_main = {}
for term in main_terms:
    term_tokens, term_ids, term_segments = bert_text_preparation(term, tokenizer)
    dct_main[term] = get_bert_embeddings(term_ids, term_segments, model)

In [None]:
# Embeddings in the order of `main_terms`, stored next to the terms.
main_vectors = [dct_main[term] for term in main_terms]

main_terms_df['vectors'] = main_vectors
main_terms_df.head()

In [None]:
#generating similarity between every unique term and every main MACE term

# similarities[j] collects, word by word, the cosine similarity to the j-th
# main term (five main terms, hence five lists).
similarities = [[] for _ in range(5)]

keys_final = list(dct_final.keys())
keys_main = list(dct_main.keys())
for word_key in keys_final:
    a = dct_final[word_key]
    for term_position, term_key in enumerate(keys_main):
        b = dct_main[term_key]
        cosine = np.dot(a,b)/(norm(a)*norm(b))
        similarities[term_position].append(cosine)

In [None]:
# One similarity column per main term, in the order the lists were filled.
similarity_columns = [
    'myocardial_infarct_similarity',
    'coronary_syndrome_similarity',
    'ischemic_heart_disease_similarity',
    'heart_failure_similarity',
    'stroke_similarity',
]
for column_name, column_values in zip(similarity_columns, similarities):
    final_df[column_name] = column_values

In [None]:
# Preview the first five rows with the new similarity columns.
final_df.head(5)

In [None]:
#Finding the most similar words to the main terms

N = 5
for term_position in range(5):
    scores = similarities[term_position]
    # Row positions ordered by ascending similarity; the tail holds the N best.
    ranked_rows = sorted(range(len(scores)), key = scores.__getitem__)
    print("Terms similar to " + main_terms[term_position] + ":")
    for index in ranked_rows[-N:]:
        print(final_df['words'][index])
    print()


In [None]:
#relevant sentence extraction
# NOTE(review): this rebinds `keywords`, replacing the longer pattern list
# defined in the "manual keyword definitions" cell -- confirm that is intended.
keywords = ["myocardial infarct","heart attack","heart failure","cardiac failure","ischemic heart disease",
            "coronary syndrome", "stroke","cerebrovascular accident","cerebral infarction",r"\bMI\b",r"\bAMI\b",
            r"\bHF\b",r"\bIHD\b",r"\bACS\b",r"\bCVA\b"]
# For each report keep the fragments in which at least one pattern matches,
# ignoring case; reports with no match get an empty list.
relevant_sentences = [
    [
        sentence
        for sentence in sentences
        if any(re.search(keyword, sentence, re.I) is not None for keyword in keywords)
    ]
    for sentences in df['sentence_tokens']
]

In [None]:
# Store the per-report keyword matches alongside the notes.
df['relevant_sentences'] = relevant_sentences

In [None]:
# Spot-check the matches of the reports labelled 0 through 99.
for row in range(100):
    report_matches = df['relevant_sentences'][row]
    print(report_matches)

In [None]:
#!pip install pyConTextNLP

In [None]:
import pyConTextNLP.pyConText as pyConText
import pyConTextNLP.itemData as itemData
import networkx as nx

In [None]:
#itemData definitions
# Both item files are addressed by file:// URLs on the local storage mount.
modifiers_url = "file:///mnt/storage/MACE_extraction/pyConText_itemData/MACE_modifiers.txt"
targets_url = "file:///mnt/storage/MACE_extraction/pyConText_itemData/MACE_targets.txt"
modifiers = itemData.get_items(modifiers_url)
targets = itemData.get_items(targets_url)

In [None]:
def markup_sentence(s, modifiers, targets, prune_inactive=True):
    """Build a pyConText markup for a single sentence.

    Args:
        s: Sentence text handed to ``setRawText``.
        modifiers: Items marked on the text with ``mode="modifier"``.
        targets: Items marked on the text with ``mode="target"``.
        prune_inactive: When true, ``dropInactiveModifiers`` is called
            on the markup before it is returned.

    Returns:
        The ``pyConText.ConTextMarkup`` object after all steps below have
        run. Callers in this notebook read its ``edges()``.
    """
    markup = pyConText.ConTextMarkup()
    markup.setRawText(s)
    markup.cleanText()
    # Mark both item collections on the cleaned text.
    markup.markItems(modifiers, mode="modifier")
    markup.markItems(targets, mode="target")
    # NOTE(review): the steps below are order-dependent pyConText calls;
    # their exact effects are defined by the library -- confirm against the
    # pyConTextNLP documentation before reordering.
    markup.pruneMarks()
    markup.dropMarks('Exclusion')
    markup.applyModifiers()
    markup.pruneSelfModifyingRelationships()
    if prune_inactive:
        markup.dropInactiveModifiers()
    return markup

In [None]:
#removing negation
# A sentence is dropped when any edge of its markup starts at a node whose
# string form names a negated-existence category. Sentences without edges are
# kept.
list_of_values = []
for sentences_per_report in df['relevant_sentences']:
    list_to_add = []
    for sent in sentences_per_report:
        # Build the markup once per sentence; it is the expensive step.
        edges = markup_sentence(sent.lower(), modifiers, targets).edges()
        is_negated = any(
            'definite_negated_existence' in str(edge[0])
            or 'probable_negated_existence' in str(edge[0])
            for edge in edges
        )
        if not is_negated:
            list_to_add.append(sent)
    list_of_values.append(list_to_add)

In [None]:
# Store the non-negated sentences, then print them under the full match list
# for the reports labelled 0 through 99.
df['only_positive'] = list_of_values
for row in range(100):
    for column in ('relevant_sentences', 'only_positive'):
        print(df[column][row])
    print()

In [None]:
#MACE Classification
# Every non-negated sentence goes into exactly one of four per-report buckets.
# The first edge whose source node names a known category decides the bucket
# (checked per edge in the order historical, future, indication/ambivalent).
# A sentence with no edges, or with no edge naming one of those categories,
# counts as definite existence.
historical = []
definite_existence = []
uncertain_existence = []
risk = []

for sentences_per_report in df['only_positive']:
    historical_to_add = []
    definite_existence_to_add = []
    uncertain_existence_to_add = []
    risk_to_add = []

    for sent in sentences_per_report:
        # Build the markup once per sentence; it is the expensive step.
        edges = markup_sentence(sent.lower(), modifiers, targets).edges()
        # Default bucket, replaced as soon as an edge names a category.
        destination = definite_existence_to_add
        for edge in edges:
            modifier_label = str(edge[0])
            if 'historical' in modifier_label:
                destination = historical_to_add
                break
            if 'future' in modifier_label:
                destination = risk_to_add
                break
            if 'indication' in modifier_label or 'ambivalent_existence' in modifier_label:
                destination = uncertain_existence_to_add
                break
        destination.append(sent)
    historical.append(historical_to_add)
    definite_existence.append(definite_existence_to_add)
    uncertain_existence.append(uncertain_existence_to_add)
    risk.append(risk_to_add)

In [None]:
# One column per bucket; dict order fixes the order the columns are added in.
mace_columns = {
    'MACE_historical': historical,
    'MACE_current_existence': definite_existence,
    'MACE_uncertain_existence': uncertain_existence,
    'MACE_risk': risk,
}
for column_name, column_values in mace_columns.items():
    df[column_name] = column_values

In [None]:
# Per-report summary for the reports labelled 0 through 99: each line is a
# fixed label followed by the contents of the matching column.
report_fields = [
    ("all sentences extracted: ", 'relevant_sentences'),
    ("all positive sentences extracted: ", 'only_positive'),
    ("historical: ", 'MACE_historical'),
    ("current existence: ", 'MACE_current_existence'),
    ("uncertain existence: ", 'MACE_uncertain_existence'),
    ("future risk: ", 'MACE_risk'),
]
for row in range(100):
    for label, column in report_fields:
        print(label + str(df[column][row]))
    print()

In [None]:
#ternary classification for validation
# 1  -> the report has at least one current-existence sentence
# -1 -> none of those, but at least one uncertain-existence sentence
# 0  -> neither
t_classification = [
    1 if len(current_hits) > 0 else (-1 if len(uncertain_hits) > 0 else 0)
    for current_hits, uncertain_hits in zip(
        df['MACE_current_existence'], df['MACE_uncertain_existence']
    )
]
df['validation'] = t_classification

In [None]:
# Preview the first five rows with all derived columns.
df.head(5)

In [None]:
#number of MACE sentences extracted before
#13988 additional sentences extracted after manual vocabulary expansion