# Clinical NER Data Preprocessing

This notebook processes the MACCROBAT2018 dataset and converts it from JSON format to BIO format for NER training.

In [None]:
import os
import re
import json
import shutil
import nltk
from nltk.corpus import stopwords

# Download the NLTK "stopwords" and "punkt" packages.
for nltk_package in ('stopwords', 'punkt'):
    nltk.download(nltk_package)

# Directory layout: raw inputs and processed outputs both live under ../data.
data_dir = "../data"
raw_data_dir = os.path.join(data_dir, "raw")
processed_data_dir = os.path.join(data_dir, "processed")

# Create every directory up front so later cells can read/write without checks.
for required_dir in (
    raw_data_dir,
    processed_data_dir,
    os.path.join(processed_data_dir, "bio_data"),
):
    os.makedirs(required_dir, exist_ok=True)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/abhinavdholi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/abhinavdholi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Entity Type Mapping

Define the mapping between entity types and the three-letter acronyms used as BIO tag suffixes.

In [2]:
# Maps each annotation label to the three-letter acronym used as the suffix of
# its BIO tag (e.g. 'Age' -> 'B-AGE' / 'I-AGE').
#
# Every acronym must be unique: two labels sharing an acronym would be
# indistinguishable in the BIO files and one of them would be silently dropped
# from the inverse mapping below. 'Distance' and 'Quantitative_concept' use
# 'DST' and 'QNC' so that they no longer collide with 'Disease_disorder'
# ('DIS') and 'Qualitative_concept' ('QUC').
entity_to_acronyms = {
    'Activity': 'ACT',
    'Administration': 'ADM',
    'Age': 'AGE',
    'Area': 'ARA',
    'Biological_attribute': 'BAT',
    'Biological_structure': 'BST',
    'Clinical_event': 'CLE',
    'Color': 'COL',
    'Coreference': 'COR',
    'Date': 'DAT',
    'Detailed_description': 'DET',
    'Diagnostic_procedure': 'DIA',
    'Disease_disorder': 'DIS',
    'Distance': 'DST',
    'Dosage': 'DOS',
    'Duration': 'DUR',
    'Family_history': 'FAM',
    'Frequency': 'FRE',
    'Height': 'HEI',
    'History': 'HIS',
    'Lab_value': 'LAB',
    'Mass': 'MAS',
    'Medication': 'MED',
    'Nonbiological_location': 'NBL',
    'Occupation': 'OCC',
    'Other_entity': 'OTH',
    'Other_event': 'OTE',
    'Outcome': 'OUT',
    'Personal_background': 'PER',
    'Qualitative_concept': 'QUC',
    'Quantitative_concept': 'QNC',
    'Severity': 'SEV',
    'Sex': 'SEX',
    'Shape': 'SHA',
    'Sign_symptom': 'SIG',
    'Subject': 'SUB',
    'Texture': 'TEX',
    'Therapeutic_procedure': 'THP',
    'Time': 'TIM',
    'Volume': 'VOL',
    'Weight': 'WEI'
}

# Inverse lookup: acronym -> entity label.
acronyms_to_entities = {v: k for k, v in entity_to_acronyms.items()}

# Guard against a future edit re-introducing a duplicate acronym.
assert len(acronyms_to_entities) == len(entity_to_acronyms), \
    "entity acronyms must be unique"

In [None]:
def parse_ann_file(ann_file_path):
    """
    Reads a BRAT-format annotation file and collects its entity records.

    Only lines that begin with 'T' and have at least three tab-separated
    fields are used. The second field is split on whitespace: its first item
    is the label, its second item the start offset and its last item the end
    offset (so a span written as "10 15;20 25" yields start=10, end=25).

    Returns a list of dicts with the keys 'label', 'start' and 'end'.
    """
    entities = []
    with open(ann_file_path, 'r', encoding='utf-8') as ann_handle:
        for raw_line in ann_handle:
            # skip blank lines and every record that is not an entity ("T...")
            if not raw_line.strip() or not raw_line.startswith('T'):
                continue

            fields = raw_line.strip().split('\t')
            if len(fields) < 3:
                continue

            span_spec = fields[1].split()
            entities.append({
                'label': span_spec[0],
                'start': int(span_spec[1]),
                'end': int(span_spec[-1]),
            })
    return entities

def load_maccrobat_data(raw_data_dir):
    """
    Loads MACCROBAT dataset from .txt and .ann files.

    Every '*.txt' file in `raw_data_dir` is paired with the '.ann' file that
    has the same base name. Text files without a matching annotation file are
    reported with a warning and skipped.

    Args:
        raw_data_dir: directory containing the .txt / .ann file pairs.

    Returns:
        dict mapping document id (file name without the '.txt' extension) to
        {'text': <file content>, 'annotations': <result of parse_ann_file>}.
    """
    data = {}

    # sorted() makes the document order reproducible; os.listdir order is arbitrary
    txt_files = sorted(f for f in os.listdir(raw_data_dir) if f.endswith('.txt'))

    for txt_file in txt_files:
        # splitext strips only the final extension, so an id that itself
        # contains dots (e.g. "case.v2.txt") is not truncated to "case"
        doc_id = os.path.splitext(txt_file)[0]
        ann_path = os.path.join(raw_data_dir, f"{doc_id}.ann")

        # checks if corresponding .ann file exists
        if not os.path.exists(ann_path):
            print(f"Warning: No annotation file found for {txt_file}")
            continue

        # reads text file
        with open(os.path.join(raw_data_dir, txt_file), 'r', encoding='utf-8') as f:
            text = f.read()

        # adds to dataset
        data[doc_id] = {
            'text': text,
            'annotations': parse_ann_file(ann_path)
        }

    return data

# Parse the raw BRAT corpus; on any failure fall back to an empty dataset so
# the remaining cells (which guard on `if data:`) can still run.
try:
    maccrobat_dir = os.path.join(raw_data_dir, "MACCROBAT2018")
    data = load_maccrobat_data(maccrobat_dir)
except Exception as e:
    print(f"Error loading data: {e}")
    data = {}
else:
    print(f"Data loaded successfully with {len(data)} documents.")


Data loaded successfully with 200 documents.
Saved parsed data to ../data/raw/annotated_data.json


## Load Data

Load the JSON data file containing the annotated clinical notes.

In [None]:
# Loads the JSON data
# If you don't have the file, you need to place it in the raw_data_dir
# NOTE: this rebinds `data`, replacing whatever the BRAT loading cell produced.
json_file_path = os.path.join(raw_data_dir, "annotated_data.json")

try:
    # JSON text is UTF-8; do not depend on the platform's default encoding
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    print(f"Data loaded successfully with {len(data)} documents.")
except FileNotFoundError:
    print(f"Error: The file {json_file_path} was not found.")
    print("Please ensure you have the MACCROBAT2018 dataset in the raw data directory.")
    data = {}

Data loaded successfully with 200 documents.


## Text Processing Functions

Define functions for processing text and creating BIO tags.

In [None]:
def remove_trailing_punctuation(token):
    """
    Strips punctuation from the end of a token.

    Characters are dropped from the right for as long as the last one is
    neither a word character, whitespace, nor an apostrophe. Punctuation at
    the start or in the middle of the token is left alone. A token made up
    only of such characters becomes the empty string.
    """
    if not token:
        return token

    keep = len(token)
    # walk backwards until a character that must be kept is found
    while keep > 0 and re.match(r'[^\w\s\']', token[keep - 1]):
        keep -= 1

    return token[:keep]

In [None]:
def split_text(text):
    """
    Tokenizes text line by line.

    A token is a maximal run of characters that are not whitespace and not
    one of the hyphen/dash characters excluded by the pattern below; each
    token then has its trailing punctuation removed.

    Returns a 3-tuple:
        tokens           -- the trimmed tokens, in document order
        start_end_ranges -- (start, end) character offsets into `text` for
                            each token, where end = start + len(trimmed token)
        sentence_breaks  -- the running token count recorded after every
                            '\\n'-separated line
    """
    token_pattern = re.compile(r'[^\s\u200a\-\u2010-\u2015\u2212\uff0d]+')

    tokens, start_end_ranges, sentence_breaks = [], [], []

    # offset of the current line's first character within the whole text
    line_offset = 0

    for line in text.split('\n'):
        for found in token_pattern.finditer(line):
            trimmed = remove_trailing_punctuation(found.group(0))
            begin = line_offset + found.start()
            tokens.append(trimmed)
            start_end_ranges.append((begin, begin + len(trimmed)))

        sentence_breaks.append(len(tokens))

        # +1 accounts for the '\n' consumed by split()
        line_offset += len(line) + 1

    return tokens, start_end_ranges, sentence_breaks

In [None]:
# Filled on first use by _english_stop_words().
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Returns the NLTK English stop-word set, loading the corpus only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def tag_token(tokens, tags, token_pos, entity):
    """
    Assign BIO tag to a token based on its position and entity type.

    The tag suffix is looked up in `entity_to_acronyms`. If the previous
    token already carries a B-/I- tag with the same suffix, the token
    continues that entity ('I-<tag>'). Otherwise it starts a new entity
    ('B-<tag>') unless it is an English stop word, in which case its tag is
    left unchanged.

    Args:
        tokens: list of token strings.
        tags: list of BIO tags parallel to `tokens`; modified in place.
        token_pos: index of the token to tag.
        entity: entity label; must be a key of `entity_to_acronyms`.

    Returns:
        The same `tags` list.

    Raises:
        KeyError: if `entity` is not in `entity_to_acronyms`.
    """
    tag = entity_to_acronyms[entity]

    # exact comparison instead of a substring test, so one acronym can never
    # be mistaken for part of another
    previous_tag = tags[token_pos - 1] if token_pos > 0 else None

    if previous_tag in (f'B-{tag}', f'I-{tag}'):
        tags[token_pos] = f'I-{tag}'
    elif tokens[token_pos].lower() not in _english_stop_words():
        tags[token_pos] = f'B-{tag}'

    return tags

In [None]:
def write_bio_files(output_file_path, tokens, tags, sentence_breaks):
    """
    Write tokens and tags to a BIO format file.

    Each non-empty token is written as "<token>\\t<tag>" on its own line.
    A blank line is written before the first non-empty token at or after
    every index listed in `sentence_breaks`, so a break is not lost when the
    token sitting exactly on the break index is empty and therefore skipped.

    Args:
        output_file_path: destination file; overwritten if it exists.
        tokens: list of token strings (empty/whitespace-only ones are skipped).
        tags: list of BIO tags parallel to `tokens`.
        sentence_breaks: token indices before which a sentence starts.
    """
    # set membership keeps the per-token check O(1)
    break_positions = set(sentence_breaks)
    pending_break = False

    # explicit UTF-8: tokens may hold any character the source text contained
    with open(output_file_path, 'w', encoding='utf-8') as f:
        for i, raw_token in enumerate(tokens):
            if i in break_positions:
                pending_break = True

            if not raw_token.strip():
                # skipped token: keep the break pending for the next written one
                continue

            if pending_break:
                f.write("\n")
                pending_break = False
            f.write(f"{raw_token}\t{tags[i]}\n")

## Convert Annotations to BIO Format

In [None]:
def _annotation_matches_token(ann_word, token):
    """True if the annotated text occurs in the token, literally or after removing non-word characters from both."""
    if ann_word in token:
        return True
    return re.sub(r'\W+', '', ann_word) in re.sub(r'\W+', '', token)


def convert_ann_to_bio(data, output_dir, filtered_entities=None):
    """
    Convert annotations from a dictionary of text files to a BIO-tagged sequence.

    For every document the text is tokenized with `split_text`, each
    annotation is aligned to a single token, that token is tagged with
    `tag_token`, and the result is written to '<output_dir>/<file_id>.bio'.

    Alignment scans forward (never backward) to the first token whose start
    offset is not before the annotation start. The annotation is matched
    against that token and, failing that, against the token just before it
    (which covers annotations that begin inside a token). Annotations that
    match neither are printed and skipped.

    Args:
        data: dict mapping file id to {'text': str, 'annotations': [...]},
            each annotation being a dict with 'label', 'start' and 'end'.
        output_dir: directory for the .bio files. WARNING: it is deleted
            with all its contents and recreated on every call.
        filtered_entities: optional collection of labels to keep; when empty
            or None, every annotation is used.
    """
    if os.path.exists(output_dir):
        # deletes the contents of the directory
        shutil.rmtree(output_dir)
    # recreates the directory
    os.makedirs(output_dir)

    for file_id in data:
        text = data[file_id]['text']
        annotations = data[file_id]['annotations']

        # tokenizing
        tokens, token2text, sentence_breaks = split_text(text)

        # initializes the tags
        tags = ['O'] * len(tokens)

        token_pos = 0

        for annotation in annotations:
            if not tokens:
                # nothing to align against
                break

            label = annotation['label']
            start = annotation['start']
            end = annotation['end']

            if filtered_entities and label not in filtered_entities:
                continue

            ann_word = text[start:end]

            # finds the first token that does not start before the annotation
            while token_pos < len(tokens) and token2text[token_pos][0] < start:
                token_pos += 1

            # Either neighbour may be missing: the scan can run past the last
            # token, and at position 0 there is no previous token (index -1
            # would silently wrap around to the last token).
            current = tokens[token_pos] if token_pos < len(tokens) else None
            previous = tokens[token_pos - 1] if token_pos > 0 else None

            if current is not None and _annotation_matches_token(ann_word, current):
                tag_token(tokens, tags, token_pos, label)
            elif previous is not None and _annotation_matches_token(ann_word, previous):
                tag_token(tokens, tags, token_pos - 1, label)
            else:
                # unaligned annotation: report it and move on
                print(current, previous, ann_word, label)

        # writes to bio file
        write_bio_files(os.path.join(output_dir, f"{file_id}.bio"), tokens, tags, sentence_breaks)
    print("Conversion complete")

In [None]:
# quick look at the tokenizer output for the first 200 characters of one document
if data:
    sample_doc_id = next(iter(data))
    sample_doc = data[sample_doc_id]
    sample_text = sample_doc['text'][:200]
    tokens, token2text, sentence_breaks = split_text(sample_text)
    for description, preview in (
        ("Sample tokens", tokens[:10]),
        ("Sample token positions", token2text[:10]),
        ("Sample annotations", sample_doc['annotations'][:3]),
    ):
        print(f"{description}: {preview}")

Sample tokens: ['Our', '24', 'year', 'old', 'non', 'smoking', 'male', 'patient', 'presented', 'with']
Sample token positions: [(0, 3), (4, 6), (7, 11), (12, 15), (16, 19), (20, 27), (28, 32), (33, 40), (41, 50), (51, 55)]
Sample annotations: [{'label': 'Age', 'start': 4, 'end': 6}, {'label': 'Age', 'start': 7, 'end': 11}, {'label': 'Age', 'start': 12, 'end': 15}]


In [None]:
# Run the conversion over the whole corpus (skipped when no data was loaded).
# The target directory is wiped and recreated by convert_ann_to_bio.
bio_output_dir = os.path.join(processed_data_dir, "bio_data")

if data:
    convert_ann_to_bio(data, output_dir=bio_output_dir)
    print(f"BIO files saved to {bio_output_dir}")

Conversion complete
BIO files saved to ../data/processed/bio_data


## Check Converted Data

In [None]:
# checks a sample BIO file
import glob
bio_output_dir = os.path.join(processed_data_dir, "bio_data")
# sorted() so the same file is previewed on every run; glob order is arbitrary
bio_files = sorted(glob.glob(os.path.join(bio_output_dir, "*.bio")))
if bio_files:
    # errors="replace": this is only a preview, so an undecodable byte should
    # show up as a replacement character rather than abort the cell
    with open(bio_files[0], 'r', encoding='utf-8', errors='replace') as f:
        sample_content = f.read(500)  # read first 500 characters
    print(f"Sample BIO file content:\n{sample_content}")
    print(f"Total BIO files created: {len(bio_files)}")

Sample BIO file content:
A	O
36	B-AGE
yr	I-AGE
old	I-AGE
previously	B-HIS
healthy	I-HIS
Sri	B-PER
Lankan	I-PER
male	B-SEX
who	O
takes	B-OCC
care	I-OCC
of	I-OCC
a	I-OCC
horse	I-OCC
presented	B-CLE
to	O
the	O
medical	B-NBL
casualty	I-NBL
ward	I-NBL
with	O
fever	B-SIG
arthralgia	I-SIG
and	O
myalgia	B-SIG
for	O
one	B-DUR
day	I-DUR

He	O
complained	O
of	O
mild	B-SEV
dysuria	B-SIG
but	O
had	O
normal	B-LAB
urine	B-DIA
output	I-DIA

He	O
did	O
not	O
have	O
chest	B-BST
pain	B-SIG
or	O
shortness	B-SIG
of	I-SIG
breath	I-SIG

Furth
Total BIO files created: 200


## Save Entity Mappings

Save entity mappings for later use in training and inference.

In [None]:
# Persist both lookup directions so training and inference code can reload them.
mapping_file = os.path.join(processed_data_dir, "entity_mappings.json")

entity_mappings = {
    "entity_to_acronyms": entity_to_acronyms,
    "acronyms_to_entities": acronyms_to_entities,
}
with open(mapping_file, 'w') as f:
    json.dump(entity_mappings, f)

print(f"Entity mappings saved to {mapping_file}")

Entity mappings saved to ../data/processed/entity_mappings.json


## Summary

In this notebook, we've:
1. Set up the necessary directories
2. Loaded the MACCROBAT2018 dataset
3. Defined functions for processing text and creating BIO tags
4. Converted the JSON annotations to BIO format
5. Saved entity mappings for use in training and inference

The processed data is now ready for model training.