In [None]:
import utils
from utils import parse_xmi, extract_entities

from lxml import etree
from transformers import DistilBertTokenizerFast
from transformers import AutoModelForTokenClassification
from transformers import pipeline
from itertools import chain
import json

In [3]:
# Parse the XMI document stored on Drive and keep a handle on its root element
tree = etree.parse(
    '/content/drive/MyDrive/Colab Notebooks/underwriteme-data/data/Lashaun800_Runte676_0c4a29fe-bc92-ee21-4fec-b88c8474f9c2.xmi'
)
root = tree.getroot()

# Prefix -> URI map built from the root element's own namespace declarations.
# Entries whose prefix is falsy (lxml keys a default namespace by None) are
# left out, so every remaining key is a usable prefix for find()/findall().
namespaces = dict(
    (prefix, uri)
    for prefix, uri in root.nsmap.items()
    if prefix
)

In [4]:
# Restore the token-classification model and its tokenizer from their Drive folders
model_fine_tuned = AutoModelForTokenClassification.from_pretrained(
    '/content/drive/MyDrive/Colab Notebooks/underwriteme-data/ner_model2'
)
ner_tokenizer = DistilBertTokenizerFast.from_pretrained(
    "/content/drive/MyDrive/Colab Notebooks/underwriteme-data/ner_tokenizer2"
)

# Bundle both into a "ner" pipeline; the extraction cell calls it as nlp(text)
nlp = pipeline(
    task="ner",
    model=model_fine_tuned,
    tokenizer=ner_tokenizer,
)

In [None]:
# Extract the document text from the CAS Sofa element.
# find() returns None when nothing matches, so fail with a clear message
# instead of an AttributeError on `.get`.
sofa = root.find('.//cas:Sofa', namespaces)
if sofa is None:
    raise ValueError("No cas:Sofa element found in the XMI document.")

sofa_string = sofa.get('sofaString')
if sofa_string is None:
    raise ValueError("The cas:Sofa element has no 'sofaString' attribute.")

# Sentence annotations; their 'begin'/'end' attributes are used below as
# character offsets into sofa_string.
sentences = root.findall('.//type2:Sentence', namespaces)

# Run the NER pipeline on each sentence's text and post-process the raw
# predictions with extract_entities, giving one list of entities per sentence.
# NOTE(review): extract_entities lives in utils; it is assumed to return an
# iterable of entity records per sentence — confirm against utils.
entities_lists = [
    extract_entities(
        nlp(sofa_string[int(sentence.get('begin')):int(sentence.get('end'))])
    )
    for sentence in sentences
]

# Flatten the per-sentence lists into a single list, preserving sentence order
entities = list(chain.from_iterable(entities_lists))


[{'entity': 'Date', 'word': '03 - Nov - 2015'}, {'entity': 'Symptom', 'word': 'Ch'}, {'entity': 'Behaviour', 'word': 'estpain'}, {'entity': 'Date', 'word': '03 - Nov - 2015'}, {'entity': 'Symptom', 'word': 'Chest - Pain'}, {'entity': 'Date', 'word': '03 - Nov - 2015'}, {'entity': 'Symptom', 'word': 'Chest Pain'}, {'entity': 'Date', 'word': '03 - Nov - 2015'}, {'entity': 'Condition', 'word': 'Essential hypertension'}, {'entity': 'Date', 'word': '03 - Nov - 2015'}, {'entity': 'Investigation', 'word': 'ECG : sinus rhythm'}, {'entity': 'Date', 'word': '03 - Nov - 2015'}, {'entity': 'Investigation', 'word': 'ECG SR'}, {'entity': 'Date', 'word': '03 - Nov - 2015'}, {'entity': 'Investigation', 'word': 'ECG sinus rhythm'}, {'entity': 'Date', 'word': '03 - Nov - 2015'}, {'entity': 'Condition', 'word': 'Pure hypercholesterolaemia'}, {'entity': 'Date', 'word': '03 - Nov - 2015'}, {'entity': 'Condition', 'word': 'Asthma'}, {'entity': 'Date', 'word': '03 - Nov - 2015'}, {'entity': 'Condition', 'wor

In [None]:
# Destination of the JSON export on Drive
filename = '/content/drive/MyDrive/Colab Notebooks/underwriteme-data/data/extracted_entities.json'

# Serialize the entities list into that file, pretty-printed with a 4-space indent;
# the context manager closes the handle once the dump finishes.
with open(filename, mode='w') as file:
    json.dump(obj=entities, fp=file, indent=4)

# Confirm where the data ended up
print(f"Entities successfully saved to {filename}.")

Entities successfully saved to /content/drive/MyDrive/Colab Notebooks/underwriteme-data/data/extracted_entities.json.
