In [1]:
import os

import pandas as pd

from bert_deid import model as bert_deid_model
from pydeid import annotation

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


### Load in model

In [2]:
# Load a trained model
bert_model = bert_deid_model.BertForDEID(
    model_dir="/db/git/bert-deid/models/i2b2_2014"
)
# bert_model.to('cpu')
# bert_model.eval()

Loading model and configuration from /db/git/bert-deid/models/i2b2_2014.


07/26/2019 13:27:37 - INFO - bert_deid.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt from cache at /home/alistairewj/.pytorch_pretrained_bert/5e8a2b4893d13790ed4150ca1906be5f7a03d6c4ddf62296c383f6db42814db2.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1


### Load in text files

In [4]:
base_path = 'tests/fake-data/radiology-reports/txt'
reports_list = os.listdir(base_path)
reports_list.sort()

reports = {}
for f in reports_list:
    with open(os.path.join(base_path, f), 'r') as fp:
        reports[f] = ''.join(fp.readlines())

# BERT deid

### Run bert-deid

In [5]:
anns_bert = {}
for f in reports_list:
    print(f'De-identifying {f}')
    
    text = reports[f]
    
    # ann with bert
    ann = bert_model.annotate(
        text, document_id=f)

    # merge intervals that are close together
    ann = annotation.merge_intervals(
        ann, dist=1, text=text)

    # post-fix to reduce false positives
    ann = bert_model.postfix(ann, text)
    
    # add to dict
    anns_bert[f] = ann
    
    print('First 5 rows of deid:')
    display(ann.head())
    

De-identifying 500.txt
First 5 rows of deid:


Unnamed: 0,document_id,annotation_id,annotator,start,stop,entity,entity_type,comment,confidence
23,500.txt,bert.1.85,bert-base-cased,528,530,70,AGE,,9.473902
21,500.txt,bert.0.53,bert-base-cased,339,344,-0500,CONTACT,,5.12474
49,500.txt,bert.7.57,bert-base-cased,2071,2076,-0500,CONTACT,,5.286543
55,500.txt,bert.9.22,bert-base-cased,2408,2422,(617) 555-1234,CONTACT,,6.621456
5,500.txt,bert.0.15,bert-base-cased,143,151,01/02/40,DATE,,10.2024


De-identifying 501.txt
First 5 rows of deid:


Unnamed: 0,document_id,annotation_id,annotator,start,stop,entity,entity_type,comment,confidence
21,501.txt,bert.2.10,bert-base-cased,530,532,96,AGE,,9.324322
22,501.txt,bert.3.50,bert-base-cased,801,803,96,AGE,,8.499738
27,501.txt,bert.5.31,bert-base-cased,1549,1563,(617) 555-1234,CONTACT,,7.454946
6,501.txt,bert.0.14,bert-base-cased,143,151,14/02/13,DATE,,10.328542
14,501.txt,bert.0.30,bert-base-cased,247,255,01/05/10,DATE,,10.332378


De-identifying 502.txt
First 5 rows of deid:


Unnamed: 0,document_id,annotation_id,annotator,start,stop,entity,entity_type,comment,confidence
24,502.txt,bert.1.83,bert-base-cased,509,511,65,AGE,,8.817659
22,502.txt,bert.0.50,bert-base-cased,340,341,-,CONTACT,,4.759385
32,502.txt,bert.4.49,bert-base-cased,1390,1404,(617) 555-1234,CONTACT,,7.381968
6,502.txt,bert.0.14,bert-base-cased,143,151,01/01/45,DATE,,10.105477
16,502.txt,bert.0.32,bert-base-cased,247,255,02/14/10,DATE,,9.821585


De-identifying 503.txt
First 5 rows of deid:


Unnamed: 0,document_id,annotation_id,annotator,start,stop,entity,entity_type,comment,confidence
20,503.txt,bert.1.75,bert-base-cased,500,502,65,AGE,,9.378055
27,503.txt,bert.4.37,bert-base-cased,1044,1046,65,AGE,,8.569704
18,503.txt,bert.0.48,bert-base-cased,340,345,-0503,CONTACT,,6.553461
51,503.txt,bert.7.52,bert-base-cased,2171,2185,(617) 555-1234,CONTACT,,5.158348
5,503.txt,bert.0.14,bert-base-cased,143,151,01/01/45,DATE,,10.535706


De-identifying 504.txt
First 5 rows of deid:


Unnamed: 0,document_id,annotation_id,annotator,start,stop,entity,entity_type,comment,confidence
19,504.txt,bert.1.74,bert-base-cased,489,491,74,AGE,,9.475336
21,504.txt,bert.3.11,bert-base-cased,779,781,74,AGE,,8.938679
16,504.txt,bert.0.49,bert-base-cased,337,345,000-0504,CONTACT,,4.797463
35,504.txt,bert.5.33,bert-base-cased,1743,1757,(617) 555-1234,CONTACT,,6.334353
6,504.txt,bert.0.15,bert-base-cased,143,151,06/26/36,DATE,,10.342719


De-identifying 505.txt
First 5 rows of deid:


Unnamed: 0,document_id,annotation_id,annotator,start,stop,entity,entity_type,comment,confidence
25,505.txt,bert.2.3,bert-base-cased,553,555,62,AGE,,9.540651
22,505.txt,bert.0.50,bert-base-cased,340,345,-0505,CONTACT,,6.708478
34,505.txt,bert.5.38,bert-base-cased,1729,1743,(617) 555-1234,CONTACT,,6.706555
6,505.txt,bert.0.14,bert-base-cased,143,151,09/15/47,DATE,,10.788244
16,505.txt,bert.0.32,bert-base-cased,247,255,07/19/10,DATE,,10.313417


De-identifying 506.txt
First 5 rows of deid:


Unnamed: 0,document_id,annotation_id,annotator,start,stop,entity,entity_type,comment,confidence
22,506.txt,bert.1.71,bert-base-cased,529,531,29,AGE,,8.654554
25,506.txt,bert.3.2,bert-base-cased,860,862,29,AGE,,8.835622
19,506.txt,bert.0.45,bert-base-cased,337,345,000-0506,CONTACT,,6.489229
40,506.txt,bert.4.36,bert-base-cased,1449,1463,(617) 632-7234,CONTACT,,5.441032
5,506.txt,bert.0.14,bert-base-cased,143,151,01/01/60,DATE,,9.933118


### (Optional) Annotate manual text

In [7]:
text = 'Patient is a 64 yo man with a serious condition.'

# ann with bert
df = bert_model.annotate(text, document_id='0')

# merge intervals that are close together
df = annotation.merge_intervals(df, dist=1, text=text)

# post-fix to reduce false positives
df = bert_model.postfix(df, text)

display(df)

Unnamed: 0,document_id,annotation_id,annotator,start,stop,entity,entity_type,comment,confidence
0,0,bert.0.3,bert-base-cased,13,15,64,AGE,,9.272903


# pydeid

In [8]:
from pydeid import annotator, metrics

modules = ['age', 'name', 'date',
           'location', 'telephone', 'initials',
           'rr',
           'crossreference']

# initialize deid module
mdl = annotator.Pattern(modules=modules)

In [9]:
anns_pydeid = {}
for f in reports_list:
    print(f'De-identifying {f}')
    text = reports[f]

    ann = mdl.annotate(text, document_id=f)
    
    anns_pydeid[f] = ann
    
    print('First 5 rows of deid:')
    display(ann.head())

De-identifying 500.txt
First 5 rows of deid:


Unnamed: 0,document_id,annotation_id,annotator,start,stop,entity,entity_type,comment,confidence
0,500.txt,,regex_name_pattern_a00,18,22,BETH,Name,,
1,500.txt,,regex_name_pattern_a00,23,29,ISRAEL,Name,,
2,500.txt,,regex_name_pattern_a00,110,118,MARGARET,Name,,
3,500.txt,,regex_name_pattern_a00,195,200,BAYES,Name,,
4,500.txt,,regex_name_pattern_a00,201,209,GIOVANNI,Name,,


De-identifying 501.txt
First 5 rows of deid:


Unnamed: 0,document_id,annotation_id,annotator,start,stop,entity,entity_type,comment,confidence
0,501.txt,,regex_name_pattern_a00,18,22,BETH,Name,,
1,501.txt,,regex_name_pattern_a00,23,29,ISRAEL,Name,,
2,501.txt,,regex_name_pattern_a00,108,114,AURORA,Name,,
3,501.txt,,regex_name_pattern_a00,220,225,TRAUM,Name,,
4,501.txt,,regex_name_pattern_a00,1394,1400,HESTER,Name,,


De-identifying 502.txt
First 5 rows of deid:


Unnamed: 0,document_id,annotation_id,annotator,start,stop,entity,entity_type,comment,confidence
0,502.txt,,regex_name_pattern_a00,18,22,BETH,Name,,
1,502.txt,,regex_name_pattern_a00,23,29,ISRAEL,Name,,
2,502.txt,,regex_name_pattern_a00,102,109,TODESCO,Name,,
3,502.txt,,regex_name_pattern_a00,110,116,JAMAAL,Name,,
4,502.txt,,regex_name_pattern_a00,185,190,FUNKE,Name,,


De-identifying 503.txt
First 5 rows of deid:


Unnamed: 0,document_id,annotation_id,annotator,start,stop,entity,entity_type,comment,confidence
0,503.txt,,regex_name_pattern_a0,1276,1284,November,Name,,
1,503.txt,,regex_name_pattern_a0,1300,1306,August,Name,,
2,503.txt,,regex_name_pattern_a00,18,22,BETH,Name,,
3,503.txt,,regex_name_pattern_a00,23,29,ISRAEL,Name,,
4,503.txt,,regex_name_pattern_a00,102,109,TODESCO,Name,,


De-identifying 504.txt
First 5 rows of deid:


Unnamed: 0,document_id,annotation_id,annotator,start,stop,entity,entity_type,comment,confidence
0,504.txt,,regex_name_pattern_a00,18,22,BETH,Name,,
1,504.txt,,regex_name_pattern_a00,23,29,ISRAEL,Name,,
2,504.txt,,regex_name_pattern_a00,846,850,June,Name,,
3,504.txt,,regex_name_pattern_a00,1556,1561,Franz,Name,,
4,504.txt,,regex_name_pattern_a00,1699,1707,FREDERIC,Name,,


De-identifying 505.txt
First 5 rows of deid:


Unnamed: 0,document_id,annotation_id,annotator,start,stop,entity,entity_type,comment,confidence
0,505.txt,,regex_name_pattern_a00,18,22,BETH,Name,,
1,505.txt,,regex_name_pattern_a00,23,29,ISRAEL,Name,,
2,505.txt,,regex_name_pattern_a00,109,115,FUMIKO,Name,,
3,505.txt,,regex_name_pattern_a00,1574,1578,ANNA,Name,,
4,505.txt,,regex_name_pattern_a1,102,108,KARTIA,Name,,


De-identifying 506.txt
First 5 rows of deid:


Unnamed: 0,document_id,annotation_id,annotator,start,stop,entity,entity_type,comment,confidence
0,506.txt,,regex_name_pattern_a00,18,22,BETH,Name,,
1,506.txt,,regex_name_pattern_a00,23,29,ISRAEL,Name,,
2,506.txt,,regex_name_pattern_a00,102,109,SHEPARD,Name,,
3,506.txt,,regex_name_pattern_a00,110,114,JOHN,Name,,
4,506.txt,,regex_name_pattern_a00,193,197,LINA,Name,,


# Concatenate bert/pydeid output and write out to file

In [11]:
for f in reports_list:
    ann = pd.concat([anns_bert[f], anns_pydeid[f]], ignore_index=True)
    
    # get output filename
    out_fn = f
    if out_fn[-4:] == '.txt':
        out_fn = out_fn[:-4]
    out_fn += '.gs'
    
    out_fn = os.path.join('tests', 'fake-data', 'radiology-reports', out_fn)
    print(f'Outputting to {out_fn}')
    ann.to_csv(out_fn, index=False)

Outputting to tests/radiology-reports/500.gs


FileNotFoundError: [Errno 2] No such file or directory: 'tests/radiology-reports/500.gs'