In [1]:
from sqlalchemy import create_engine
import pandas as pd
import numpy as np

with open('../creds.txt', 'r') as file:
    creds = file.read()

cn = create_engine(f"postgresql://bch:{creds}@compute-e-16-229:54320/eps", 
                     connect_args={'options': '-csearch_path={}'.format('bch')}).execution_options(autocommit=True)


In [2]:
# get patients to exclude - which don't match inclusion criteria

In [3]:
days_before = 365
days_after = 365*2
min_age = 2
min_year = 2000
req_freq = 0.0001
pd.set_option('display.max_colwidth', None)

In [4]:
pat_query = f"""
SELECT patient_num FROM coverage_time 
WHERE (days_before_eps > {days_before} or age_onset < {min_age}) AND (days_after_eps > {days_after})
AND eps_onset_date BETWEEN TO_DATE('01-01-{min_year}','DD-MM-YY') AND TO_DATE('01-01-2020','DD-MM-YY')
AND age_onset BETWEEN 0 AND 21
"""

exclude_pat_df = pd.read_sql(pat_query, cn)

In [5]:
# query = f"""SELECT * FROM information_schema.tables WHERE table_schema = 'bch';"""
# pd.read_sql(query, cn)


In [6]:
train_pat_query = f"""
SELECT patient_num 
FROM patient_dimension where patient_num NOT IN ({pat_query})"""
include_pat_df = pd.read_sql(train_pat_query, cn)
include_pat_df.shape

(66950, 1)

In [7]:
# then select notes from other patients
query = f"""SELECT * FROM CONCEPT_DIMENSION WHERE concept_cd = 'NOTE:3268562'"""
pd.read_sql(query, cn)

Unnamed: 0,concept_path,concept_cd,name_char,concept_blob,update_date,download_date,import_date,sourcesystem_cd,upload_id
0,\i2b2\Notes\ALL DOCUMENT SECTIONS\Path Report\Preliminary Report\NOTE:3268562,NOTE:3268562,Preliminary Report,,25-APR-16 05.13.52.000000000 PM,25-APR-16 05.13.52.000000000 PM,25-APR-16 05.13.52.000000000 PM,EVENT_CD_KEY:126293,


In [10]:
popular_note_query = f"""
SELECT n.concept_cd, count(1) 
FROM notes n 
JOIN ({pat_query}) p
ON n.patient_num = p.patient_num
GROUP BY n.concept_cd
"""

# popular_notes = pd.read_sql(popular_note_query, cn)
# popular_notes.to_csv('./resources/popular_notes.csv', index=False)
popular_notes = pd.read_csv('../resources/popular_notes.csv')

In [11]:
popular_notes.sort_values(by='count', ascending=False)

Unnamed: 0,concept_cd,count
2507,NOTE:3268562,332694
4481,NOTE:3268565,235226
3661,NOTE:67621147,231908
2667,NOTE:9710694,203205
4899,NOTE:125942859,141230
...,...,...
1903,NOTE:427779475,1
1900,NOTE:1026560991,1
4422,NOTE:560489215,1
4423,NOTE:122778229,1


In [15]:
from os.path import exists

top_rows = popular_notes.sort_values(by='count', ascending=False)

j = 0
for i, row in top_rows.iterrows():
    j+=1
    path_to_file = f"/n/data1/hms/dbmi/beaulieu-jones/lab/epilepsy-transformer/raw/{row['concept_cd']}.txt"
    
    file_exists = exists(path_to_file)
    # print(file_exists)
    if not file_exists:
        print(f"{path_to_file} doesn't exist")
    if j > 100:
        break

In [11]:
# pop_note_list = popular_notes[popular_notes['count']>10].sort_values(by='count', ascending=False)['concept_cd'].tolist()
pop_note_list = popular_notes[popular_notes['count']>10]['concept_cd'].tolist()
pop_note_list[:5]

['NOTE:904165203',
 'NOTE:138865080',
 'NOTE:3691360',
 'NOTE:3446957',
 'NOTE:359547497']

In [12]:
# query = f"""SELECT * FROM CONCEPT_DIMENSION WHERE CONCEPT_CD ='NOTE:15611138'"""
# pd.read_sql(query, cn)

In [14]:
import logging
logger = logging.getLogger("spacy")
logger.setLevel(logging.ERROR)

import warnings 
warnings.filterwarnings(action='ignore')

from tqdm.autonotebook import tqdm
import swifter
tqdm.pandas()
import time

note_limit = 1000000
OUTPUT_DIR = '/n/data1/hms/dbmi/beaulieu-jones/lab/epilepsy-transformer/' 

i=0
for concept_cd in pop_note_list:
    output_file_name = f"{OUTPUT_DIR}/raw/{concept_cd}.txt"
    i+=1
    
    file_exists = exists(output_file_name)
    if file_exists:
        print('file exists')
    else:
        print(i, concept_cd)
        
        note_query = f"""
        SELECT encounter_num, n.patient_num, n.concept_cd, observation_blob, note_id
        FROM notes n 
        JOIN ({train_pat_query}) p
        ON n.patient_num = p.patient_num
        WHERE n.concept_cd = '{concept_cd}' AND length(n.observation_blob) > 50
        LIMIT {note_limit}
        """
        # print(note_query)

        before = time.time()
        notes = pd.read_sql(note_query, cn)
        notes['text'] = notes['observation_blob']
        print(concept_cd, notes.shape)
        if notes.shape[0] > 10:
            notes.to_csv(output_file_name, index=False)
        after = time.time()
        print(f"{i} / {len(pop_note_list)} - {after-before}")

file exists
file exists
file exists
file exists
file exists
file exists
file exists
file exists
file exists
file exists
file exists
file exists
file exists
file exists
3777 NOTE:482497607
NOTE:482497607 (1, 6)
3777 / 3198 - 0.12610936164855957
file exists
file exists
file exists
file exists
file exists
file exists
file exists
file exists
file exists
file exists
file exists
file exists
file exists
3791 NOTE:867560219
NOTE:867560219 (0, 6)
3791 / 3198 - 0.005064964294433594
file exists
file exists
file exists
file exists
file exists
file exists
file exists
file exists
file exists
file exists
file exists
file exists
file exists
file exists
file exists
file exists
file exists
file exists
file exists
file exists
file exists
file exists
file exists
file exists
file exists
file exists
3818 NOTE:482497565
NOTE:482497565 (1, 6)
3818 / 3198 - 0.07549786567687988
file exists
file exists
file exists
file exists
file exists
file exists
file exists
3826 NOTE:3691326
NOTE:3691326 (0, 6)
3826 / 3198 -

In [74]:
import logging
logger = logging.getLogger("spacy")
logger.setLevel(logging.ERROR)

import warnings 
warnings.filterwarnings(action='ignore')

from tqdm.autonotebook import tqdm
import swifter
tqdm.pandas()
import time
note_limit = 1000

import os

i = 0
for concept_cd in pop_note_list:
    category = f'{concept_cd}'
    output_file_name = f"{OUTPUT_DIR}/raw/{category}.txt"
    i+=1
    
    file_exists = os.path.exists(output_file_name)
    if file_exists:
        print('file exists')
        continue
    
    note_query = f"""
SELECT encounter_num, n.patient_num, n.concept_cd, observation_blob, note_id, 
cd.name_char, cd.concept_path 
FROM notes n 
JOIN ({train_pat_query}) p
ON n.patient_num = p.patient_num
JOIN CONCEPT_DIMENSION cd ON 
cd.concept_cd = n.concept_cd
WHERE n.concept_cd = '{concept_cd}' AND length(n.observation_blob) > 50
LIMIT {note_limit}
"""
    
    before = time.time()
    notes = pd.read_sql(note_query, cn)
    notes['text'] = notes['observation_blob']
    notes.to_csv(output_file_name, index=False)
    after = time.time()
    print(f"{i} / {len(pop_note_list)} - {after-before}")

758 / 3198 - 0.5721535682678223
759 / 3198 - 0.10526323318481445
760 / 3198 - 0.07719922065734863
761 / 3198 - 0.6063940525054932
762 / 3198 - 0.10484790802001953
763 / 3198 - 0.009305238723754883
764 / 3198 - 0.16199851036071777
765 / 3198 - 0.09736323356628418
766 / 3198 - 0.1057744026184082
767 / 3198 - 1.8205039501190186
768 / 3198 - 0.11835551261901855
769 / 3198 - 0.06748318672180176
770 / 3198 - 0.006815671920776367
771 / 3198 - 0.09964585304260254
772 / 3198 - 0.00782155990600586
773 / 3198 - 0.016118764877319336
774 / 3198 - 0.08583450317382812
775 / 3198 - 0.07072806358337402
776 / 3198 - 0.07266807556152344
777 / 3198 - 0.008714675903320312
778 / 3198 - 0.0810546875
779 / 3198 - 0.0073168277740478516
780 / 3198 - 0.0691080093383789
781 / 3198 - 6.3795247077941895
782 / 3198 - 0.11394906044006348
783 / 3198 - 0.07102489471435547
784 / 3198 - 0.07099652290344238
785 / 3198 - 0.20271921157836914
786 / 3198 - 0.007400989532470703
787 / 3198 - 0.0068035125732421875
788 / 3198 - 0

In [47]:
!rm /n/data1/hms/dbmi/beaulieu-jones/lab/transformer_training_data/jobs/*.sh

In [56]:
from os.path import exists

j=0
for concept in pop_note_list:
    job_string = f"""#!/bin/bash
#SBATCH -t 0-12:00
#SBATCH -n 1
#SBATCH -p gpu_zak
#SBATCH	--account=zak_contrib_isk1
#SBATCH --mem=32G
#SBATCH -o /n/data1/hms/dbmi/beaulieu-jones/lab/transformer_training_data/jobs/output/{concept}_%j.out
#SBATCH -e /n/data1/hms/dbmi/beaulieu-jones/lab/transformer_training_data/jobs/err/{concept}_%j.err
module load conda2
source activate transformer
python3 -u /home/bkb12/notebooks/transformer/process_concept.py --concept_cd '{concept}' 
    """
    
    if not exists(f"/n/data1/hms/dbmi/beaulieu-jones/lab/transformer_training_data/preproc_sent/{concept}.txt"):
        j+=1
    #     with open(f"/n/data1/hms/dbmi/beaulieu-jones/lab/transformer_training_data/jobs/{concept}.sh", 'w') as f:
    #         f.write(job_string)
        
print(j)

0


In [11]:
# Input file format:
# (1) One sentence per line. These should ideally be actual sentences, not
# entire paragraphs or arbitrary spans of text. (Because we use the
# sentence boundaries for the "next sentence prediction" task).
# (2) Blank lines between documents. Document boundaries are needed so
# that the "next sentence prediction" task doesn't span between documents.

In [48]:
import psycopg2
import pandas as pd
import sys
import spacy
import re
import stanfordnlp
import time
import scispacy
from tqdm import tqdm
from heuristic_tokenize import sent_tokenize_rules 
from spacy.language import Language

OUTPUT_DIR = '/n/data1/hms/dbmi/beaulieu-jones/lab/transformer_training_data/' #this path will contain tokenized notes. This dir will be the input dir for create_pretrain_data.sh
nlp = spacy.load('en_core_sci_md', disable=['tagger','ner'])
nlp.add_pipe('sbd_component', first=True) 

#setting sentence boundaries
@Language.component('sbd_component')
def sbd_component(doc):
    for i, token in enumerate(doc[:-2]):
        # define sentence start if period + titlecase token
        if token.text == '.' and doc[i+1].is_title:
            doc[i+1].sent_start = True
        if token.text == '-' and doc[i+1].text != '-':
            doc[i+1].sent_start = True
    return doc

#convert de-identification text into one token
def fix_deid_tokens(text, processed_text):
    deid_regex  = r"\[\*\*.{0,15}.*?\*\*\]" 
    if text:
        indexes = [m.span() for m in re.finditer(deid_regex,text,flags=re.IGNORECASE)]
    else:
        indexes = []
    for start,end in indexes:
        processed_text.merge(start_idx=start,end_idx=end)
    return processed_text
    

def process_section(section, note, processed_sections):
    # perform spacy processing on section
    processed_section = nlp(section['sections'])
    processed_section = fix_deid_tokens(section['sections'], processed_section)
    processed_sections.append(processed_section)

def process_note_helper(note):
    # split note into sections
    note_sections = sent_tokenize_rules(note)
    processed_sections = []
    section_frame = pd.DataFrame({'sections':note_sections})
    section_frame.apply(process_section, args=(note,processed_sections,), axis=1)
    return(processed_sections)

def process_text(sent, note):
    sent_text = sent['sents'].text
    if len(sent_text) > 0 and sent_text.strip() != '\n':
        if '\n' in sent_text:
            sent_text = sent_text.replace('\n', ' ')
        note['text'] += sent_text + '\n'  

def get_sentences(processed_section, note):
    # get sentences from spacy processing
    sent_frame = pd.DataFrame({'sents': list(processed_section['sections'].sents)})
    sent_frame.apply(process_text, args=(note,), axis=1)

def process_note(note):
    try:
        note_text = note['text'] 
        note['text'] = ''
        processed_sections = process_note_helper(note_text)
        ps = {'sections': processed_sections}
        ps = pd.DataFrame(ps)
        ps.apply(get_sentences, args=(note,), axis=1)
        return note 
    except Exception as e:
        # pass
        print ('error', e)


In [68]:
import logging
logger = logging.getLogger("spacy")
logger.setLevel(logging.ERROR)

import warnings 
warnings.filterwarnings(action='ignore')

from tqdm.autonotebook import tqdm
import swifter
tqdm.pandas()
import time
note_limit = 100000000

import os

i = 0
for concept_cd in pop_note_list:
    category = f'{concept_cd}'
    output_file_name = f"{OUTPUT_DIR}{category}.txt"
    i+=1
    
    file_exists = os.path.exists(output_file_name)
    if file_exists:
        continue
    
    note_query = f"""
SELECT encounter_num, n.patient_num, n.concept_cd, observation_blob, note_id, 
cd.name_char, cd.concept_path 
FROM notes n 
JOIN ({train_pat_query}) p
ON n.patient_num = p.patient_num
JOIN CONCEPT_DIMENSION cd ON 
cd.concept_cd = n.concept_cd
WHERE n.concept_cd = '{concept_cd}' AND length(n.observation_blob) > 50
LIMIT {note_limit}
"""
    
    before = time.time()
    notes = pd.read_sql(note_query, cn)
    notes['text'] = notes['observation_blob']
    # display(notes)
    after = time.time()
    print(f"{concept_cd} ({i}/{len(pop_note_list)}): {after-before}")
    print('Number of notes: %d' %len(notes.index))
    notes['ind'] = list(range(len(notes.index)))
    formatted_notes = notes.progress_apply(process_note, axis=1)
    
    start = time.time()
    if formatted_notes.shape[0] > 0:
        
        print(output_file_name)
        with open(output_file_name,'w') as f:
            for text in formatted_notes['text']:
                if text != None and len(text) != 0 :
                    f.write(text)
                    f.write('\n')

        end = time.time()
        print (end-start)
        print ("Done formatting notes")
    

    
    
    

NOTE:904165203 (1/3198): 0.009814739227294922
Number of notes: 0


0it [00:00, ?it/s]

error 'text'
NOTE:188707860 (6/3198): 0.0051229000091552734
Number of notes: 0


0it [00:00, ?it/s]

error 'text'
NOTE:3710477 (7/3198): 3.296783447265625
Number of notes: 25681


  0%|          | 0/25681 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [27]:
note_query = "SELECT * FROM notes n WHERE n.concept_cd IN ('NOTE:15611138') LIMIT 10"
notes = pd.read_sql(note_query, cn)
notes

Unnamed: 0,encounter_num,patient_num,concept_cd,provider_id,start_date,modifier_cd,instance_num,valtype_cd,tval_char,nval_num,end_date,location_cd,observation_blob,note_id
0,1226847054,92199258,NOTE:15611138,-1,18-JUN-19 06.14.00.000000000 PM,@,5213221000.0,B,Past Medical History,,,,"Previously healthy 3yo, now diagnosed with B-ALL\n\nSee Discharge Summary for more information",80
1,1226847198,73259763,NOTE:15611138,-1,16-JUL-19 12.52.00.000000000 PM,@,5257009000.0,B,Past Medical History,,,,"Larsen syndrome\nrepaired truncus arteriosus with RV-PA conduit\nchronic lung disease requiring nocturnal BiPAP\ncardiac arrest x2\nseizure disorder, weaning keppra",186
2,1226847361,87864457,NOTE:15611138,-1,27-JUL-19 05.18.00.000000000 PM,@,5275773000.0,B,Past Medical History,,,,laryngommalacia\npremie\nGERD\nRSV+ 1/2019\ninfantile spasms\npolymicrogyria,266
3,1226847418,1896551,NOTE:15611138,-1,15-AUG-19 03.45.00.000000000 PM,@,5306513000.0,B,Past Medical History,,,,"Dx with seizures at age 13yrs, GTC, absence seizures\nLast GTC in July\nAbsence seizures occur 2-3 times weekly , several times a day.",313
4,1226847461,78751533,NOTE:15611138,-1,11-JUN-19 06.15.00.000000000 PM,@,5201560000.0,B,Past Medical History,,,,Ezra is an ex full term male infant with history of severe intracranial hemorrhage at birth with subsequent development of porencephaly and encephalomalacia. s/p ETC and CPC on 1/30/19. VP shunt placed 05/30.,345
5,1226847461,78751533,NOTE:15611138,-1,13-JUN-19 04.18.00.000000000 AM,@,5203943000.0,B,Past Medical History,,,,Ezra is an ex full term male infant with history of severe intracranial hemorrhage at birth with subsequent development of porencephaly and encephalomalacia. s/p ETV and CPC on 1/30/19. VP shunt placed 05/30.,346
6,1226847461,78751533,NOTE:15611138,-1,14-JUN-19 12.15.00.000000000 PM,@,5206478000.0,B,Past Medical History,,,,Ezra is an ex full term male infant with history of severe intracranial hemorrhage at birth with subsequent development of porencephaly and encephalomalacia. s/p ETV and CPC on 1/30/19. VP shunt placed 05/30.,347
7,1226847604,3807478,NOTE:15611138,-1,11-AUG-19 12.10.00.000000000 PM,@,5299561000.0,B,Past Medical History,,,,Encephalopathy \nHomonymous hemianopia. \nIntractable epilepsy. \nLearning disability. \nSturge-Weber syndrome.,401
8,1226847813,94132888,NOTE:15611138,-1,12-JUN-19 03.56.00.000000000 AM,@,5202149000.0,B,Past Medical History,,,,"3 wk old prenatally diagnosed with Ebsteins, born and followed at BMC, has been home, stable - feeding and growing. On 6/11 had choking episode while in her carseat, seen in ED at South Short hospital with plan for transfer to BMC - had apneic event prompting intubation. Transferred to BCH for further work up and mangement.",476
9,1226847813,94132888,NOTE:15611138,-1,13-JUN-19 04.39.00.000000000 PM,@,5205175000.0,B,Past Medical History,,,,"3 wk old prenatally diagnosed with Ebsteins, born and followed at BMC, has been home, stable - feeding and growing. On 6/11 had choking episode while in her carseat, seen in ED at South Short hospital with plan for transfer to BMC - had apneic event prompting intubation. Transferred to BCH for further work up and mangement. \n\n6/11- Transferred to BCH 8S from South Shore Hospital, ECHO @ bedside + closed duct and coarc, pGE started\n6/12- Re-Echo, open duct, maintained on pGE and dopamine, diuretics increased\n6/13- To CVOR for CoArc repair and PDA closure with Dr. Kaza",477


In [24]:
# total count - 4,711,809


# note_query = f"""
# SELECT encounter_num, n.patient_num, n.concept_cd, observation_blob, note_id, 
# cd.name_char, cd.concept_path FROM notes n 
# JOIN ({train_pat_query}) p
# ON n.patient_num = p.patient_num
# JOIN CONCEPT_DIMENSION cd ON 
# cd.concept_cd = n.concept_cd
# WHERE n.concept_cd NOT IN ('NOTE:3691317', 'NOTE:3268562')
# LIMIT {note_limit}
# """

note_limit = 20

note_query = f"""
SELECT encounter_num, n.patient_num, n.concept_cd, observation_blob, note_id, 
cd.name_char, cd.concept_path 
FROM notes n 
JOIN ({train_pat_query}) p
ON n.patient_num = p.patient_num
JOIN CONCEPT_DIMENSION cd ON 
cd.concept_cd = n.concept_cd
WHERE n.concept_cd IN ('NOTE:3268565')
LIMIT {note_limit}
"""

notes = pd.read_sql(note_query, cn)

KeyboardInterrupt: 

In [None]:
notes

In [13]:
# notes = notes[notes['category'] == category]
category = f'notelim_{note_limit}'
print('Number of notes: %d' %len(notes.index))
notes['ind'] = list(range(len(notes.index)))

Number of notes: 1000


In [15]:
import logging
logger = logging.getLogger("spacy")
logger.setLevel(logging.ERROR)

import warnings 
warnings.filterwarnings(action='ignore')

In [21]:
# from tqdm.autonotebook import tqdm
# import swifter
# formatted_notes = notes.swifter.apply(process_note, axis=1)

In [40]:
from tqdm.autonotebook import tqdm
tqdm.pandas()
formatted_notes = notes.progress_apply(process_note, axis=1)

  0%|          | 0/10 [00:00<?, ?it/s]

In [20]:
with open(OUTPUT_DIR  + category + '.txt','w') as f:
    for text in formatted_notes['text']:
        if text != None and len(text) != 0 :
            f.write(text)
            f.write('\n')

end = time.time()
print (end-start)
print ("Done formatting notes")

KeyError: 'text'

In [61]:
import os

directory = "/n/data1/hms/dbmi/beaulieu-jones/lab/transformer_training_data/preproc_sent"

filename_list = []
for filename in os.listdir(directory):
    # print(filename[:-4])
    filename_list.append(filename[:-4])
    
# joined_string = ", ".join(filename_list)
# print(joined_string)

In [57]:
!rm /n/data1/hms/dbmi/beaulieu-jones/lab/transformer_training_data/bert_preprocess_jobs/*.sh

In [62]:
from os.path import exists

j=0

for concept in filename_list:
    job_string = f"""#!/bin/bash
#SBATCH -t 0-12:00
#SBATCH -n 1
#SBATCH -p gpu_zak
#SBATCH	--account=zak_contrib_isk1
#SBATCH --mem=32G
#SBATCH -o /n/data1/hms/dbmi/beaulieu-jones/lab/transformer_training_data/bert_preprocess_jobs/output/{concept}_%j.out
#SBATCH -e /n/data1/hms/dbmi/beaulieu-jones/lab/transformer_training_data/bert_preprocess_jobs/err/{concept}_%j.err
module load conda2 gcc/9.2.0 cuda/11.2
source activate tf
python3 /home/bkb12/notebooks/transformer/create_pretraining_data.py --input_file /n/data1/hms/dbmi/beaulieu-jones/lab/transformer_training_data/preproc_sent/{concept}.txt --output_file /n/data1/hms/dbmi/beaulieu-jones/lab/transformer_training_data/pretraining_data/{concept}.txt  --vocab_file /home/bkb12/notebooks/transformer/resources/vocab.txt 
    """
    
    if not exists(f'/n/data1/hms/dbmi/beaulieu-jones/lab/transformer_training_data/pretraining_data/{concept}.txt'):
        j+=1
        with open(f'/n/data1/hms/dbmi/beaulieu-jones/lab/transformer_training_data/bert_preprocess_jobs/{concept}.sh', 'w') as f:
            f.write(job_string)
        
print(j)

2979


In [1]:
!ls /n/data1/hms/dbmi/beaulieu-jones/lab/transformer_training_data/pretraining_data/ | wc -l

3196
