In [None]:
import os

import numpy as np
import pandas as pd

from bert_deid.model import Transformer

### Load in model

In [None]:
# Settings for the trained model: where it is stored and which
# architecture name to hand to the Transformer wrapper.
model_path = '/data/models/bert-i2b2-2014'
model_type = 'bert'

# Build the de-identification model, passing device='cpu'.
deid_model = Transformer(model_type, model_path, device='cpu')

In [None]:

# Short example sentence to run through the model.
text = 'Dr. Somayah says I have had a pneumothorax since 2019-01-01.'

# apply() is called with repl='___' and its result is printed.
deidentified = deid_model.apply(text, repl='___')
print(deidentified)

In [None]:

# predict() returns the scores together with the length and offset that are
# used below to slice each predicted span out of the text.
preds, lengths, offsets = deid_model.predict(text)

# Print every span next to its highest-scoring label.
id_to_label = deid_model.label_set.id_to_label
for row, scores in enumerate(preds):
    begin = offsets[row]
    end = begin + lengths[row]

    # index of the largest score in this row
    best = scores.argmax()
    label = id_to_label[best]
    print(f'{text[begin:end]:15s} {label}')

### Load in text files

In [None]:
# Directory holding the plain-text reports (relative path, so it is resolved
# against the current working directory).
base_path = 'tests/fake-data/radiology-reports/txt'

# Keep regular files only: os.listdir() also returns subdirectories, and
# open() would raise on those. Sorted so the processing order is stable.
reports_list = sorted(
    name for name in os.listdir(base_path)
    if os.path.isfile(os.path.join(base_path, name))
)

# Map each filename to the full text of that report.
reports = {}
for f in reports_list:
    with open(os.path.join(base_path, f), 'r') as fp:
        reports[f] = fp.read()

### Run bert-deid

In [None]:
# Number of characters of surrounding text shown on each side of a PHI span.
CONTEXT_CHARS = 20

# Initialised here as in the original cell; nothing in this cell fills it.
anns_bert = {}

for f in reports_list:
    print(f'De-identifying {f}')

    text = reports[f]

    # ann with bert
    preds, lengths, offsets = deid_model.predict(text)

    # most likely label for every predicted span
    entities = [
        deid_model.label_set.id_to_label[x]
        for x in np.argmax(preds, axis=1)
    ]

    # Indices of spans whose label is not 'O'. The comparison has to be done
    # per element: `entities` is a plain list, so `entities != 'O'` is a
    # single bool and np.where() on it would always select index 0 only.
    idxPHI = [i for i, entity in enumerate(entities) if entity != 'O']

    print(f'\n{f} PHI annotations:')
    for i in idxPHI:
        phi_start = offsets[i]
        phi_stop = phi_start + lengths[i]

        # print some context with PHI, clamped to the bounds of the text
        start = max(phi_start - CONTEXT_CHARS, 0)
        stop = min(phi_stop + CONTEXT_CHARS, len(text))

        print(text[start:phi_start], end='')
        print(f'**{text[phi_start:phi_stop]}**', end='')
        print(text[phi_stop:stop])