In [None]:
import os

# Cap the native thread pools at one thread each. These are set before the
# numeric libraries below are imported, since the limits are typically read
# when those libraries load.
os.environ['MKL_NUM_THREADS'] = '1'  # was misspelled 'MKL_NUM_THREAD', which MKL does not read
os.environ['NUMEXPR_NUM_THREADS'] = '1'
os.environ['OMP_NUM_THREADS'] = '1'

from medcat.cat import CAT
from medcat.vocab import Vocab
from medcat.cdb import CDB
from tokenizers import ByteLevelBPETokenizer

import pandas as pd
import numpy as np
import json
from tqdm.notebook import tqdm

In [None]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)


# Paths and Config

In [None]:
# Folder that holds the input file and receives the annotation output.
data_dir = './data/'

# Input documents (read as CSV further down) and the two columns that are used.
data_path = os.path.join(data_dir, "<data_file>")
doc_id_column = "id"
doc_text_column = "description"

# Locations of the MedCAT vocabulary and concept database.
model_dir = '../medcat_models/'
vocab_path = os.path.join(model_dir, 'vocab/vocab.dat')
cdb_path = os.path.join(model_dir, 'cdb/<model_file>')

# Not used by any cell in this notebook.
filter_path = None

# Output folder for the annotations; exist_ok makes re-running this cell safe
# without a separate existence check.
ann_folder_path = os.path.join(data_dir, 'annotated_docs')
os.makedirs(ann_folder_path, exist_ok=True)

save_path_annotations_per_doc = os.path.join(ann_folder_path, "<output_filename>.json")


# Load CDB and Vocab

In [None]:
# Load the CDB and the vocabulary from the paths set in the config cell.
cdb = CDB.load(cdb_path)
vocab = Vocab.load(vocab_path)

In [None]:
# The config stored with the CDB should already be set, so the overrides below
# are switched off by default. Set the flag to True to apply them to
# `cdb.config` before the pipeline is created.
override_cdb_config = False

if override_cdb_config:
    # Configure some parameters
    cdb.config.ner['min_name_len'] = 2
    cdb.config.ner['upper_case_limit_len'] = 3
    cdb.config.general['spell_check'] = True
    cdb.config.linking['train_count_threshold'] = 10
    cdb.config.linking['similarity_threshold'] = 0.3
    cdb.config.linking['train'] = False
    cdb.config.linking['disamb_length_limit'] = 5
    cdb.config.general['full_unlink'] = True
    cdb.config.general['spacy_model'] = 'en_core_sci_lg'

# Create MedCAT pipeline

In [None]:
# Build the MedCAT pipeline from the loaded CDB, the config attached to it, and the vocab.
cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab)


# Annotate

In [None]:
# Print the CDB statistics before annotating.
cat.cdb.print_stats()

In [None]:
# Read only the id and text columns so unused columns of the input file are
# never loaded; the trailing selection keeps the column order (id, text).
doc_columns = [doc_id_column, doc_text_column]
df = pd.read_csv(data_path, usecols=doc_columns)[doc_columns]


In [None]:
%%time
batch_size = 1000
batch = []
cnt = 0
results = []
for id, row in df.iterrows():
    text = row[doc_text_column]
    # Skip text if under 10 characters
    if len(str(text)) > 10:
        batch.append((row[doc_id_column], text))
    else:
        batch.append((row[doc_id_column], []))
    
    if len(batch) > batch_size or id == len(df) - 1:
        # Update the number of processors depending on your machine.
        result = cat.multiprocessing(batch, nproc=2)
        results.extend(result)
        cnt += 1
        print("Done: {} - rows".format((cnt-1)* batch_size + len(batch)-1))
        
        # Reset the batch
        batch = []

In [None]:
# Double check nothing is missed: every row of `df` must be accounted for either
# by an entry in `results` or by an entry in `skipped_docs`.
# NOTE(review): `skipped_docs` has to be defined by an earlier cell before this
# check can run - confirm it is populated where short texts are skipped.
assert len(results)+len(skipped_docs) == len(df)

In [None]:
# Save the multiprocessing output (`results`) to file. The context manager
# closes the file, so the JSON is fully flushed to disk when the cell finishes.
with open(save_path_annotations_per_doc, "w") as out_file:
    json.dump(results, out_file)

### Inspect the model

In [None]:
# Run the pipeline on a sample sentence and print the entities it found.
text = "He was diagnosed with heart failure"
doc = cat(text)
print(doc.ents)

In [None]:
# Display Snomed codes
for ent in doc.ents:
    cui = ent._.cui
    # .get() prints None for a CUI without a preferred name instead of raising
    # KeyError, matching how the type ids are looked up in the next cell.
    print(ent, " - ", cui, " - ", cdb.cui2preferred_name.get(cui))

In [None]:
# List each detected entity next to the type ids registered for its CUI
# (None when the CUI has no entry).
for entity in doc.ents:
    type_ids = cdb.cui2type_ids.get(entity._.cui)
    print(entity, " - ", type_ids)

In [None]:
# Display: render the entities of the sample document inline in the notebook.
from spacy import displacy
displacy.render(doc, style='ent', jupyter=True)

# Alternative approach

In [None]:
## Single-process alternative: annotate the documents one at a time.

docs = {}
print(f"Len of df: {len(df)}")

for _, row in tqdm(df.iterrows(), total=df.shape[0]):
    doc_id = row[doc_id_column]
    text = str(row[doc_text_column])

    # Texts of 10 characters or fewer are not annotated and get an empty list.
    docs[doc_id] = cat.get_entities(text) if len(text) > 10 else []
        

In [None]:
# Print the CDB statistics again after annotating.
cat.cdb.print_stats()

In [None]:
# Save to file (docs maps each document id to its annotations).
# NOTE: this writes to the same path as the `results` dump earlier in the
# notebook, so whichever of the two cells runs last determines the file content.
with open(save_path_annotations_per_doc, "w") as out_file:
    json.dump(docs, out_file)
