In [7]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

import re
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity

from matplotlib import pyplot as plt


import json
import pinecone

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\BILAL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import sys
sys. version

'3.10.9 | packaged by Anaconda, Inc. | (main, Mar  1 2023, 18:18:15) [MSC v.1916 64 bit (AMD64)]'

In [3]:
from transformers import AutoTokenizer, AutoModel
import torch

# Mean Pooling - Take attention mask into account for correct averaging
def meanpooling(output, mask):
    embeddings = output[0] # First element of model_output contains all token embeddings
    mask = mask.unsqueeze(-1).expand(embeddings.size()).float()
    return torch.sum(embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)


In [4]:
# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained("neuml/pubmedbert-base-embeddings")
model = AutoModel.from_pretrained("neuml/pubmedbert-base-embeddings")

def create_description_embeddings(description):


    # Tokenize sentences
    inputs = tokenizer(description, padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings
    with torch.no_grad():
        output = model(**inputs)

    # Perform pooling. In this case, mean pooling.
    embeddings = meanpooling(output, inputs['attention_mask'])
    
    return embeddings

In [5]:
medical_stopwords = stopwords.words("english")


medical_stopwords.extend(['speaking', 'none', 'time', 'flush'])

In [6]:
def process_clinical_note(clinical_note):
    # Define the sections to remove
    sections_to_remove = [
        "Name:",
        "Unit No:",
        "Admission Date:",
        "Discharge Date:",
        "Date of Birth:",
        "Sex:",
        "Service:",
        "Allergies:",
        "Attending:",
        "Past Medical History:",
        "Social History:",
        "Family History:",
        "Vitals:",
        "Pertinent Results:",
        "Medications on Admission:",
        "Discharge Medications:",
        "Discharge Disposition:",
        "Discharge Condition:",
        "Discharge Instructions:",
        "Followup Instructions:"
    ]

    # Split the clinical note into lines
    lines = clinical_note.split('\n')

    # Initialize the processed note
    processed_note = []

    # Flag to exclude lines within unwanted sections
    exclude_section = False

    # Iterate through the lines and filter unwanted sections
    for line in lines:
        if any(section in line for section in sections_to_remove):
            exclude_section = True
        elif line.strip() == "":
            # Empty lines separate sections, so reset the flag
            exclude_section = False

        if not exclude_section:
            processed_note.append(line)

    # Join the lines to create the final note
    final_note = '\n '.join(processed_note)
    
    sections_to_remove = [
        r'chief complaint',
        r'history of present illness',
        r'Major Surgical or Invasive Procedure',
        r'physical exam',
        r'brief hospital course',
        r'Discharge',
        
        r'completed by',
    ]
    
    for pattern in sections_to_remove:
        final_note = re.sub(pattern, '', final_note, flags=re.IGNORECASE)

    # Define patterns to identify negations
    negation_patterns = [
        r'no\s+\w+',
        r'not\s+\w+',
        r'did\s+not\s+have\s+\w+'
    ]
    
    # Filter out sentences with negations
    sentences = [sentence for sentence in final_note.split('\n') if not any(re.search(pattern, sentence, re.IGNORECASE) for pattern in negation_patterns)]

    # Remove keys and special characters
    cleaned_note = re.sub(r'\w+:', '', '\n'.join(sentences), flags=re.IGNORECASE)  # Remove keys (case-insensitive)
    cleaned_note = re.sub(r'[^a-zA-Z\s]', '', cleaned_note)  # Remove special characters
    # Tokenize the note into sentences based on '\n'
    sentences = [sentence.strip() for sentence in cleaned_note.split('\n') if sentence.strip()]

    # Remove stop words and empty sentences
    sentences = [
        ' '.join(word for word in sentence.split() if word.lower() not in medical_stopwords)
        for sentence in sentences
    ]
    sentences = [item for item in sentences if item != '']

    return sentences

In [8]:
## Storing API keys
pinecone_api_key = "24c6f012-a062-4e67-b4cd-b9ab9b555cd0"
pinecone_environment = "gcp-starter"


In [9]:
# First we import pinecone. If you have not installed it, install it first by running pip install pinecone
pinecone.init(pinecone_api_key, environment=pinecone_environment)
pinecone_index_name = "msproject"
# connect to the newly created index
index = pinecone.Index(pinecone_index_name)

In [10]:
sentences = ['Left ankle fracture',
 'ORIF left ankle',
 'yr old man fall stairs holding lumbar',
 'construction',
 'job resulting immediate L ankle pain ED ankle fracture',
 'surgery performed',
 'AOXO',
 'splint intact positive CSM',
 'Taken operating room underwent surgical fixation ankle',
 'well controlled Cleared go home without services',
 'left ankle fracture sp ORIF']

In [11]:
notes = create_description_embeddings(sentences)
notes_embeddings = [embedding.tolist() for embedding in notes]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [12]:
sentences_list = []
icd_10_code_list = []
description_list = []
score_list = []

for i in range(0, len(notes_embeddings)):
    similar = index.query(notes_embeddings[i], top_k=1, include_metadata=True)
    
    sentences_list.append(sentences[i])
    
    if similar['matches']:
        match = similar['matches'][0]
        icd_10_code_list.append(match['metadata']['icd_10_code'])
        description_list.append(match['metadata']['description'])
        score_list.append(match['score'])
    else:
        icd_10_code_list.append(None)
        description_list.append(None)
        score_list.append(None)

# Create a DataFrame
df_output = pd.DataFrame({
    'Sentences': sentences_list,
    'ICD_10_Code': icd_10_code_list,
    'Description': description_list,
    'Score': score_list
})



In [14]:
df_output

Unnamed: 0,Sentences,ICD_10_Code,Description,Score
0,Left ankle fracture,S82.,"Fracture of lower leg, including ankle",0.752511
1,ORIF left ankle,S97.,Crushing injury of ankle and foot,0.672783
2,yr old man fall stairs holding lumbar,W06.,Fall from bed,0.624991
3,construction,W13.,"Fall from, out of or through building or struc...",0.520577
4,job resulting immediate L ankle pain ED ankle ...,S97.,Crushing injury of ankle and foot,0.627952
5,surgery performed,Y83.,Surgical operation and other surgical procedur...,0.624054
6,AOXO,L01.,Impetigo,0.361797
7,splint intact positive CSM,S47.,Crushing injury of shoulder and upper arm,0.372391
8,Taken operating room underwent surgical fixati...,Z40.,Encounter for prophylactic surgery,0.53066
9,well controlled Cleared go home without services,Z53.,Persons encountering health services for speci...,0.420105


In [191]:
# Example codes (need to import dataset to use this, dont run it. Just for reference!)

final_notes_with_icd_codes[final_notes_with_icd_codes.hadm_id == 20053836]

Unnamed: 0,hadm_id,text,text_length,icd_code
25,20053836,\nName: ___ Unit No: ___\n ...,2491,I63.
26,20053836,\nName: ___ Unit No: ___\n ...,2491,N18.
27,20053836,\nName: ___ Unit No: ___\n ...,2491,J96.
28,20053836,\nName: ___ Unit No: ___\n ...,2491,J18.
29,20053836,\nName: ___ Unit No: ___\n ...,2491,B48.
30,20053836,\nName: ___ Unit No: ___\n ...,2491,E11.
31,20053836,\nName: ___ Unit No: ___\n ...,2491,D68.
32,20053836,\nName: ___ Unit No: ___\n ...,2491,E11.
33,20053836,\nName: ___ Unit No: ___\n ...,2491,I12.
34,20053836,\nName: ___ Unit No: ___\n ...,2491,T85.
