In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "emilyalsentzer/Bio_ClinicalBERT"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)


  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Dataset Loading

In [2]:
import os

# Directory where .txt files are stored
directory = 'F:/Aswin/01 epita/semester 3/Action Learning/Project testing/Dataset/MIMIC - III Dataset/training_20180910/training_20180910'

# Initialize a list to store text from each file
clinical_notes = []

# Iterate through each file in the directory
for filename in os.listdir(directory):
    if filename.endswith(".txt"):
        file_path = os.path.join(directory, filename)
        with open(file_path, 'r') as file:
            text = file.read()
            clinical_notes.append(text)


In [3]:
print(clinical_notes[0])

Admission Date:  [**2115-2-22**]              Discharge Date:   [**2115-3-19**]

Date of Birth:  [**2078-8-9**]             Sex:   M

Service: MEDICINE

Allergies:
Vicodin

Attending:[**First Name3 (LF) 4891**]
Chief Complaint:
Post-cardiac arrest, asthma exacerbation

Major Surgical or Invasive Procedure:
Intubation
Removal of chest tubes placed at an outside hospital
R CVL placement


History of Present Illness:
Mr. [**Known lastname 3234**] is a 36 year old gentleman with a PMH signifciant
with dilated cardiomyopathy s/p AICD, asthma, and HTN admitted
to an OSH with dyspnea now admitted to the MICU after PEA arrest
x2. The patient initially presented to LGH ED with hypoxemic
respiratory distress. While at the OSH, he received CTX,
azithromycin, SC epinephrine, and solumedrol. While at the OSH,
he became confused and subsequently had an episode of PEA arrest
and was intubated. He received epinephrine, atropine, magnesium,
and bicarb. In addition, he had bilateral needle thoracostomie

In [5]:
import re
import os

from spacy import displacy
from spacy.matcher import Matcher
import spacy


# Initialize spaCy and load English tokenizer and NER model
nlp = spacy.load("en_core_web_sm")

# Function to extract dates from text using regex
def extract_dates(text):
    dates = []
    # Define regex pattern for dates (you may need to adjust this based on your date formats)
    date_pattern = r'\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+(?:\d{1,2},\s+)?(?:\d{4})'
    matches = re.findall(date_pattern, text)
    for match in matches:
        dates.append(match)
    return dates

# Function to extract demographic information (age and sex)
def extract_demographics(text):

    sex = None
    # Example regex patterns for age and sex (you may need to adapt based on your data format)

    sex_pattern = r'\b(?:M|F)\b'
    sex_matches = re.findall(sex_pattern, text)

    if sex_matches:
        sex = sex_matches[0]
    return  sex

# Function to extract chief complaints
def extract_chief_complaints(text):
    chief_complaints = []
    # Example regex pattern for chief complaints (you may need to customize)
    chief_complaint_pattern = r'Chief Complaint:(.*?)(?=History of Present Illness:|$)'
    matches = re.findall(chief_complaint_pattern, text, re.DOTALL)
    for match in matches:
        chief_complaints.append(match.strip())
    return chief_complaints

# Function to extract medications
def extract_medications(text):
    medications = []
    # Example regex pattern for medications (you may need to customize)
    medication_pattern = r'Medications:(.*?)(?=Discharge Medications:|$)'
    matches = re.findall(medication_pattern, text, re.DOTALL)
    for match in matches:
        medications.append(match.strip())
    return medications

# Function to extract diagnostic results
def extract_diagnostic_results(text):
    diagnostic_results = []
    # Example regex pattern for diagnostic results (you may need to customize)
    diagnostic_pattern = r'Diagnostic Results:(.*?)(?=Discharge Instructions:|$)'
    matches = re.findall(diagnostic_pattern, text, re.DOTALL)
    for match in matches:
        diagnostic_results.append(match.strip())
    return diagnostic_results

# Main function to process clinical notes and extract features
def process_clinical_notes(clinical_notes_dir):
    extracted_data = []
    for filename in os.listdir(clinical_notes_dir):
        if filename.endswith('.txt'):
            with open(os.path.join(clinical_notes_dir, filename), 'r', encoding='utf-8') as file:
                text = file.read()
                # Extract features
                dates = extract_dates(text)
                sex = extract_demographics(text)
                chief_complaints = extract_chief_complaints(text)
                medications = extract_medications(text)
                diagnostic_results = extract_diagnostic_results(text)
                # Collect extracted data
                extracted_data.append({
                    'Filename': filename,
                    'Dates': dates,
                    'Sex': sex,
                    'Chief Complaints': chief_complaints,
                    'Medications': medications,
                    'Diagnostic Results': diagnostic_results
                })
    return extracted_data

# Example usage:
if __name__ == '__main__':
    clinical_notes_dir = 'F:/Aswin/01 epita/semester 3/Action Learning/Project testing/Dataset/MIMIC - III Dataset/training_20180910/training_20180910'
    extracted_data = process_clinical_notes(clinical_notes_dir)
    for data in extracted_data:
        print(f"Filename: {data['Filename']}")
        print(f"Dates: {data['Dates']}")
        print(f"Sex: {data['Sex']}")
        print(f"Chief Complaints: {data['Chief Complaints']}")
        print(f"Medications: {data['Medications']}")
        print(f"Diagnostic Results: {data['Diagnostic Results']}")
        print("\n")


Filename: 100035.txt
Dates: []
Age: 2, Sex: M
Chief Complaints: ['Post-cardiac arrest, asthma exacerbation\n\nMajor Surgical or Invasive Procedure:\nIntubation\nRemoval of chest tubes placed at an outside hospital\nR CVL placement']
Medications: ['1. bisacodyl 5 mg Tablet, Delayed Release (E.C.) Sig: Two (2)\nTablet, Delayed Release (E.C.) PO DAILY (Daily) as needed for\nConstipation.\n2. senna 8.6 mg Tablet Sig: One (1) Tablet PO BID (2 times a\nday) as needed for Constipation.\n3. acetaminophen 325 mg Tablet Sig: Two (2) Tablet PO Q6H (every\n6 hours) as needed for pain/fever.\n4. carvedilol 12.5 mg Tablet Sig: Two (2) Tablet PO BID (2 times\na day).\n5. docusate sodium 100 mg Capsule Sig: One (1) Capsule PO BID (2\ntimes a day).\n6. furosemide 40 mg Tablet Sig: Two (2) Tablet PO BID (2 times a\nday).\n7. lisinopril 10 mg Tablet Sig: Two (2) Tablet PO DAILY (Daily).\n\n8. olanzapine 5 mg Tablet, Rapid Dissolve Sig: [**11-25**] Tablet, Rapid\nDissolves PO QHS (once a day (at bedtime))