In [None]:
!pip install datasets

In [4]:
import utils
from utils import parse_xmi, replace_ner_tags_with_integers, assign_ner_tags, format_dataset, tokenize_and_align_labels, tokenizer
from lxml import etree
from datasets import Dataset, load_from_disk, concatenate_datasets

In [12]:
# Location of the single XMI document processed by this notebook.
file_path = '/content/drive/MyDrive/Colab Notebooks/underwriteme-data/data/Lashaun800_Runte676_0c4a29fe-bc92-ee21-4fec-b88c8474f9c2.xmi'

# Parse the file with lxml and keep a handle on the document root.
tree = etree.parse(file_path)
root = tree.getroot()

# Build the prefix -> URI map from the root element itself; entries whose
# prefix is falsy (e.g. an un-prefixed default namespace) are left out.
namespaces = {
    prefix: uri
    for prefix, uri in root.nsmap.items()
    if prefix
}

In [21]:
# Pull the raw document text out of the `sofaString` attribute of the
# first cas:Sofa element, then preview its first 900 characters.
sofa_node = root.find('.//cas:Sofa', namespaces)
sofa_string = sofa_node.get('sofaString')
print(sofa_string[:900])

Medical Attendants Report
__________________
_______________________________
_________
Patient Details
Name:                    LashaunRunte
Sex:                     F
NHS Number:              7016321016
Date of Birth:           03-Nov-2015
Address:                 839HaleyPromenadeApt25
                         EastBrookfield
                         WorcesterCounty
                         Massachusetts
                         01515
GP registration date:    03-Nov-2015
Date records commencing: 03-Nov-2015
__________________
_______________________________
_________
__________________
_______________________________
_________
Problems (Active)
	Date	Description	Value / Units	Range	Associated Text
	03-Nov-2015	Chestpain
	03-Nov-2015	Chest-Pain
	03-Nov-2015	Chest Pain
	03-Nov-2015	Essential hypertension
	03-Nov-2015	ECG: sinus rhythm
	03-Nov-2015	ECG SR
	03-Nov-2015	ECG sinus rhythm
	03-


In [8]:
# Extract entity types from a document.
#
# DISABLED: everything below is wrapped in a triple-quoted string, so none of
# it runs; the cell simply evaluates to that string.
#
# If re-enabled, the code would, for every sentence, scan all
# custom:TextMiningAnnotation elements, keep those whose [begin, end] span
# lies inside the sentence, collect their `EntityType` attribute, and print
# the de-duplicated list.
#
# NOTE(review): the code reads `sentences`, `sofa_string`, `root` and
# `namespaces`. `sentences` is not assigned by any earlier cell in this
# notebook; the only visible assignment is the later `parse_xmi(file_path)`
# call. Confirm that it yields objects exposing .get('begin') / .get('end')
# before turning this back on.
'''entity_list = []
for sentence in sentences:
    sentence_text = sofa_string[int(sentence.get('begin')):int(sentence.get('end'))]
    #print(f"Sentence: {sentence_text}")

    # Find annotations within this sentence
    for annotation in root.findall('.//custom:TextMiningAnnotation', namespaces):
        ann_begin = int(annotation.get('begin'))
        ann_end = int(annotation.get('end'))
        #annotation_text = sofa_string[int(annotation.get('begin')):int(annotation.get('end'))]
        #print(f"Annotation: {annotation_text}")
        #print((annotation.get('EntityType')))

        # Check if the annotation is within the current sentence
        if ann_begin >= int(sentence.get('begin')) and ann_end <= int(sentence.get('end')):
            entity_text = sofa_string[ann_begin:ann_end]
            entity_type = annotation.get('EntityType')
            entity_list.append(entity_type)
            #print(f"  Entity: {entity_text}, Type: {entity_type}")

unique_entity_list = list(set(entity_list))
print(unique_entity_list)'''

['HealthcareProvider',
 'Date',
 'Drug',
 'Sign',
 'Symptom',
 'Investigation',
 'LabResult',
 'Behaviour',
 'LabTest',
 'Treatment',
 'Unit',
 'Condition']

In [4]:
# Hard coded entity list to save processing time
#
# DISABLED: the assignment below sits inside a triple-quoted string, so this
# cell does NOT bind `unique_entity_list`; it only evaluates to the string.
# The twelve entity type names are kept here for reference.
'''unique_entity_list = ['Investigation',
 'Treatment',
 'LabTest',
 'Condition',
 'Behaviour',
 'Date',
 'Unit',
 'Sign',
 'Symptom',
 'Drug',
 'LabResult',
 'HealthcareProvider']'''

In [27]:
# Step 1: read sentences, tokens and entities out of the XMI file.
sentences, tokens, entities = parse_xmi(file_path)

# Step 2: let the helper attach NER tags to the tokens using the entities.
tokens = assign_ner_tags(tokens, entities)

# Step 3: assemble the per-sentence records and run every record through
# replace_ner_tags_with_integers (presumably string tag -> integer id; see utils).
dataset = list(
    map(replace_ner_tags_with_integers, format_dataset(sentences, tokens))
)

In [29]:
# Inspect how one already word-split example is turned into model inputs.
example_text = dataset[165]
print(example_text)

# is_split_into_words=True tells the tokenizer the input is a list of words
# rather than one raw string.
tokenized_input = tokenizer(example_text["tokens"], is_split_into_words=True)
print(tokenized_input)

# Keep the sub-word strings under their own name: `tokens` already holds the
# token list produced by the XMI parsing cell and must not be overwritten
# with a different kind of object.
subword_tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(subword_tokens)

# For each sub-word position, the index of the source word it came from
# (None for special tokens such as [CLS] / [SEP]).
word_ids = tokenized_input.word_ids()
print(word_ids)

{'id': '5832', 'tokens': ['08-Dec-2015', ' - ', 'Dr. Debra128 Predovic534', ' , ', 'WEST BROOKFIELD FAMILY PRACTICE', ' @ ', '46 NORTH MAIN ST'], 'ner_tags': [9, 0, 0, 0, 0, 0, 0]}
{'input_ids': [101, 4775, 118, 13063, 118, 1410, 118, 1987, 119, 3177, 6766, 11964, 1604, 11689, 2572, 15901, 24239, 1527, 117, 160, 9919, 1942, 26660, 2346, 22027, 17675, 21678, 2137, 6820, 14038, 2162, 3663, 11629, 8101, 21669, 10954, 137, 3993, 24819, 10460, 3048, 9960, 11607, 23676, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['[CLS]', '08', '-', 'Dec', '-', '2015', '-', 'Dr', '.', 'De', '##bra', '##12', '##8', 'Pre', '##do', '##vic', '##53', '##4', ',', 'W', '##ES', '##T', 'BR', '##O', '##OK', '##FI', '##EL', '##D', 'FA', '##MI', '##L', '##Y', 'PR', '##AC', '##TI', '##CE', '@', '46', 'NO', '##RT', '##H', 'MA', '##IN', 'ST', '[SEP]']
[None, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2

In [30]:
# Sub-word mismatch: the tokenizer emits more input ids than the example has
# word-level labels, so the two sequences cannot be paired one-to-one yet.
n_word_labels = len(example_text['ner_tags'])
n_input_ids = len(tokenized_input["input_ids"])
n_word_labels, n_input_ids

(7, 45)

In [31]:
# Run the alignment helper on one example. Each field is wrapped in an outer
# list so the single record looks like a batch of size one.
single_example = {
    field: [dataset[165][field]]
    for field in ('tokens', 'ner_tags')
}
q = tokenize_and_align_labels(single_example, tokenizer)
print(q)

# The result now carries a 'labels' key; compare its length with input_ids.
aligned_labels = q['labels'][0]
aligned_input_ids = q["input_ids"][0]
len(aligned_labels), len(aligned_input_ids)

{'input_ids': [[101, 4775, 118, 13063, 118, 1410, 118, 1987, 119, 3177, 6766, 11964, 1604, 11689, 2572, 15901, 24239, 1527, 117, 160, 9919, 1942, 26660, 2346, 22027, 17675, 21678, 2137, 6820, 14038, 2162, 3663, 11629, 8101, 21669, 10954, 137, 3993, 24819, 10460, 3048, 9960, 11607, 23676, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 9, 10, 10, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]]}


(45, 45)

In [32]:
# Show the aligned result for the single example: one line per sub-word,
# padded with underscores to 30 columns, followed by its label id.
example_subwords = tokenizer.convert_ids_to_tokens(q["input_ids"][0])
example_labels = q["labels"][0]
for token, label in zip(example_subwords, example_labels):
    print(f"{token:_<30} {label}")

[CLS]_________________________ -100
08____________________________ 9
-_____________________________ 10
Dec___________________________ 10
-_____________________________ 10
2015__________________________ 10
-_____________________________ 0
Dr____________________________ 0
._____________________________ 0
De____________________________ 0
##bra_________________________ 0
##12__________________________ 0
##8___________________________ 0
Pre___________________________ 0
##do__________________________ 0
##vic_________________________ 0
##53__________________________ 0
##4___________________________ 0
,_____________________________ 0
W_____________________________ 0
##ES__________________________ 0
##T___________________________ 0
BR____________________________ 0
##O___________________________ 0
##OK__________________________ 0
##FI__________________________ 0
##EL__________________________ 0
##D___________________________ 0
FA____________________________ 0
##MI__________________________ 0
##L

In [36]:
# Pivot the list of per-sentence records into one list per column, using the
# keys of the first record as the column names.
dataset_dict = {
    column: [record[column] for record in dataset]
    for column in dataset[0]
}

# Wrap the column-oriented dict in a Hugging Face Dataset.
hf_dataset = Dataset.from_dict(dataset_dict)

# Sanity check: look at the same example inspected earlier.
print(hf_dataset[165])

# Tokenize and align labels for every record, processing in batches.
tokenized_dataset = hf_dataset.map(tokenize_and_align_labels, batched=True)
print(tokenized_dataset[165], tokenized_dataset, sep="\n")

{'id': '5832', 'tokens': ['08-Dec-2015', ' - ', 'Dr. Debra128 Predovic534', ' , ', 'WEST BROOKFIELD FAMILY PRACTICE', ' @ ', '46 NORTH MAIN ST'], 'ner_tags': [9, 0, 0, 0, 0, 0, 0]}


Map:   0%|          | 0/6223 [00:00<?, ? examples/s]

{'id': '5832', 'tokens': ['08-Dec-2015', ' - ', 'Dr. Debra128 Predovic534', ' , ', 'WEST BROOKFIELD FAMILY PRACTICE', ' @ ', '46 NORTH MAIN ST'], 'ner_tags': [9, 0, 0, 0, 0, 0, 0], 'input_ids': [101, 4775, 118, 13063, 118, 1410, 118, 1987, 119, 3177, 6766, 11964, 1604, 11689, 2572, 15901, 24239, 1527, 117, 160, 9919, 1942, 26660, 2346, 22027, 17675, 21678, 2137, 6820, 14038, 2162, 3663, 11629, 8101, 21669, 10954, 137, 3993, 24819, 10460, 3048, 9960, 11607, 23676, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 9, 10, 10, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]}
Dataset({
    features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 6223
})


In [None]:
# Persist the tokenized dataset built from this one XMI file.
part4_dir = '/content/train_dataset_part4'
tokenized_dataset.save_to_disk(part4_dir)

Saving the dataset (0/1 shards):   0%|          | 0/6223 [00:00<?, ? examples/s]

In [None]:
# Read the saved part back from its directory on disk.
part4_source = '/content/train_dataset_part4'
reloaded_dataset4 = load_from_disk(part4_source)

# Concatenate datasets (Sentence ids should be unique!)
# Left disabled: the other parts are not loaded in this notebook.
#combined_train_dataset = concatenate_datasets([reloaded_dataset1, reloaded_dataset2, reloaded_dataset3, reloaded_dataset4])

# Save combined dataset to disk
#combined_train_dataset.save_to_disk('/content/combined_train_dataset')