In [None]:
from sqlalchemy import create_engine
import pandas as pd
import numpy as np

# Read the database password from a local file kept outside the notebook.
# The value is stripped because a trailing newline in the file would otherwise
# become part of the password embedded in the connection URL.
with open('../creds.txt', 'r') as file:
    creds = file.read().strip()

# Engine for the `eps` database. The `search_path` option makes unqualified
# table names in the queries below resolve against the `bch` schema.
cn = create_engine(f"postgresql://bch:{creds}@compute-e-16-229:54320/eps", 
                     connect_args={'options': '-csearch_path={}'.format('bch')}).execution_options(autocommit=True)


In [None]:
# get patients to exclude: the patients returned by pat_query (defined below) are removed from the training population

In [None]:
# Cohort parameters; each one is interpolated into pat_query.
days_before = 365    # compared against coverage_time.days_before_eps
days_after = 365*2   # compared against coverage_time.days_after_eps
min_age = 2          # compared against coverage_time.age_onset
min_year = 2000      # year of the lower bound on eps_onset_date
req_freq = 0.0001    # NOTE(review): not referenced in any visible cell — confirm it is still needed
# Do not truncate long cell values when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

In [None]:
# Cohort query: patients from coverage_time that satisfy the coverage,
# onset-date and onset-age conditions below. train_pat_query removes exactly
# these patients via NOT IN.
# The TO_DATE masks use a four-digit year ('YYYY') so that they match the
# four-digit years in the literals being parsed.
pat_query = f"""
SELECT patient_num FROM coverage_time 
WHERE (days_before_eps > {days_before} or age_onset < {min_age}) AND (days_after_eps > {days_after})
AND eps_onset_date BETWEEN TO_DATE('01-01-{min_year}','DD-MM-YYYY') AND TO_DATE('01-01-2020','DD-MM-YYYY')
AND age_onset BETWEEN 0 AND 21
"""

# Materialise the excluded patients once.
exclude_pat_df = pd.read_sql(pat_query, cn)

In [None]:
# query = f"""SELECT * FROM information_schema.tables WHERE table_schema = 'bch';"""
# pd.read_sql(query, cn)


In [None]:
# Training population: every patient in patient_dimension that is NOT returned
# by pat_query. The SQL text is also embedded as a sub-query by the note
# queries in later cells.
# NOTE(review): `NOT IN` yields no rows at all if the sub-query ever returns a
# NULL patient_num — confirm coverage_time.patient_num cannot be NULL.
train_pat_query = f"""
SELECT patient_num 
FROM patient_dimension where patient_num NOT IN ({pat_query})"""
include_pat_df = pd.read_sql(train_pat_query, cn)
# Cell output: (number of included patients, 1).
include_pat_df.shape

In [None]:
# then select notes from other patients
# (this cell itself only looks up the CONCEPT_DIMENSION row(s) for one note concept code)
query = f"""SELECT * FROM CONCEPT_DIMENSION WHERE concept_cd = 'NOTE:3268562'"""
pd.read_sql(query, cn)

In [None]:
# Number of notes per concept_cd among the patients returned by pat_query.
# NOTE(review): the join is on pat_query (the patients that train_pat_query
# removes), not on train_pat_query — confirm this is the intended population.
popular_note_query = f"""
SELECT n.concept_cd, count(1) 
FROM notes n 
JOIN ({pat_query}) p
ON n.patient_num = p.patient_num
GROUP BY n.concept_cd
"""

# The counts are read from a CSV; uncomment the next two lines to regenerate it
# (the path is kept identical to the one read below).
# popular_notes = pd.read_sql(popular_note_query, cn)
# popular_notes.to_csv('../resources/popular_notes.csv', index=False)
popular_notes = pd.read_csv('../resources/popular_notes.csv')

In [None]:
# Display the concept codes ordered from the highest to the lowest note count.
popular_notes.sort_values(by='count', ascending=False)

In [None]:
from os.path import exists

# Report which of the most frequent concepts have no raw export file yet.
top_rows = popular_notes.sort_values(by='count', ascending=False)

# The counter-based loop this replaces stopped after its 101st row, so exactly
# that many of the top concept codes are inspected.
for concept_code in top_rows['concept_cd'].head(101):
    candidate_path = f"/n/data1/hms/dbmi/beaulieu-jones/lab/epilepsy-transformer/raw/{concept_code}.txt"
    if not exists(candidate_path):
        print(f"{candidate_path} doesn't exist")

In [None]:
# pop_note_list = popular_notes[popular_notes['count']>10].sort_values(by='count', ascending=False)['concept_cd'].tolist()
# Concept codes whose count is above 10, in the order they appear in popular_notes.
pop_note_list = popular_notes[popular_notes['count']>10]['concept_cd'].tolist()
# Preview the first five codes.
pop_note_list[:5]

In [None]:
# query = f"""SELECT * FROM CONCEPT_DIMENSION WHERE CONCEPT_CD ='NOTE:15611138'"""
# pd.read_sql(query, cn)

In [None]:
import logging
logger = logging.getLogger("spacy")
logger.setLevel(logging.ERROR)

import warnings 
warnings.filterwarnings(action='ignore')

from tqdm.autonotebook import tqdm
import swifter
tqdm.pandas()
import time

note_limit = 1000000
OUTPUT_DIR = '/n/data1/hms/dbmi/beaulieu-jones/lab/epilepsy-transformer/' 

# Export the notes of every popular concept to <OUTPUT_DIR>/raw/<concept_cd>.txt
# (CSV content), skipping concepts whose file is already on disk.
i = 0
for i, concept_cd in enumerate(pop_note_list, start=1):
    output_file_name = f"{OUTPUT_DIR}/raw/{concept_cd}.txt"

    if exists(output_file_name):
        print('file exists')
        continue

    print(i, concept_cd)

    # The leading spaces inside the literal are part of the SQL text sent to the
    # server and are kept as they were.
    note_query = f"""
        SELECT encounter_num, n.patient_num, n.concept_cd, observation_blob, note_id
        FROM notes n 
        JOIN ({train_pat_query}) p
        ON n.patient_num = p.patient_num
        WHERE n.concept_cd = '{concept_cd}' AND length(n.observation_blob) > 50
        LIMIT {note_limit}
        """

    before = time.time()
    notes = pd.read_sql(note_query, cn)
    notes['text'] = notes['observation_blob']
    print(concept_cd, notes.shape)
    # Only concepts with more than 10 matching notes get a file.
    row_count = notes.shape[0]
    if row_count > 10:
        notes.to_csv(output_file_name, index=False)
    after = time.time()
    print(f"{i} / {len(pop_note_list)} - {after-before}")

In [None]:
import logging
logger = logging.getLogger("spacy")
logger.setLevel(logging.ERROR)

import warnings 
warnings.filterwarnings(action='ignore')

from tqdm.autonotebook import tqdm
import swifter
tqdm.pandas()
import time
note_limit = 1000

import os

# Export up to note_limit notes per popular concept, together with the concept's
# name and path from CONCEPT_DIMENSION, to <OUTPUT_DIR>/raw/<concept_cd>.txt.
# Concepts whose file already exists are skipped.
i = 0
for i, concept_cd in enumerate(pop_note_list, start=1):
    category = f'{concept_cd}'
    output_file_name = f"{OUTPUT_DIR}/raw/{category}.txt"

    if os.path.exists(output_file_name):
        print('file exists')
        continue

    note_query = f"""
SELECT encounter_num, n.patient_num, n.concept_cd, observation_blob, note_id, 
cd.name_char, cd.concept_path 
FROM notes n 
JOIN ({train_pat_query}) p
ON n.patient_num = p.patient_num
JOIN CONCEPT_DIMENSION cd ON 
cd.concept_cd = n.concept_cd
WHERE n.concept_cd = '{concept_cd}' AND length(n.observation_blob) > 50
LIMIT {note_limit}
"""

    # Time the query plus the CSV write for this concept.
    before = time.time()
    notes = pd.read_sql(note_query, cn)
    notes['text'] = notes['observation_blob']
    notes.to_csv(output_file_name, index=False)
    after = time.time()
    print(f"{i} / {len(pop_note_list)} - {after-before}")

In [None]:
# Shell escape: delete every *.sh file in the jobs directory.
!rm /n/data1/hms/dbmi/beaulieu-jones/lab/transformer_training_data/jobs/*.sh

In [None]:
from os.path import exists

# Count the concepts that have no file in preproc_sent/ yet.
# job_string holds a SLURM script that runs process_concept.py for the concept,
# but the lines that would write it to disk are commented out, so this cell
# currently only reports the count.
j=0
for concept in pop_note_list:
    job_string = f"""#!/bin/bash
#SBATCH -t 0-12:00
#SBATCH -n 1
#SBATCH -p gpu_zak
#SBATCH	--account=zak_contrib_isk1
#SBATCH --mem=32G
#SBATCH -o /n/data1/hms/dbmi/beaulieu-jones/lab/transformer_training_data/jobs/output/{concept}_%j.out
#SBATCH -e /n/data1/hms/dbmi/beaulieu-jones/lab/transformer_training_data/jobs/err/{concept}_%j.err
module load conda2
source activate transformer
python3 -u /home/bkb12/notebooks/transformer/process_concept.py --concept_cd '{concept}' 
    """
    
    # A concept counts as pending when its preproc_sent/<concept>.txt is missing.
    if not exists(f"/n/data1/hms/dbmi/beaulieu-jones/lab/transformer_training_data/preproc_sent/{concept}.txt"):
        j+=1
    #     with open(f"/n/data1/hms/dbmi/beaulieu-jones/lab/transformer_training_data/jobs/{concept}.sh", 'w') as f:
    #         f.write(job_string)
        
# Number of pending concepts.
print(j)

In [None]:
# Input file format:
# (1) One sentence per line. These should ideally be actual sentences, not
# entire paragraphs or arbitrary spans of text. (Because we use the
# sentence boundaries for the "next sentence prediction" task).
# (2) Blank lines between documents. Document boundaries are needed so
# that the "next sentence prediction" task doesn't span between documents.

In [None]:
import psycopg2
import pandas as pd
import sys
import spacy
import re
import stanfordnlp
import time
import scispacy
from tqdm import tqdm
from heuristic_tokenize import sent_tokenize_rules 
from spacy.language import Language

OUTPUT_DIR = '/n/data1/hms/dbmi/beaulieu-jones/lab/transformer_training_data/' #this path will contain tokenized notes. This dir will be the input dir for create_pretrain_data.sh

#setting sentence boundaries
@Language.component('sbd_component')
def sbd_component(doc):
    """Mark additional sentence starts on a spaCy ``Doc``.

    A token is flagged as a sentence start when it follows a '.' and is
    title-cased, or when it follows a '-' and is not itself a '-'. The last two
    tokens of the document are never used as the left-hand token.

    Args:
        doc: The spaCy ``Doc`` being processed.

    Returns:
        The same ``doc`` with the sentence-start flags set.
    """
    for i, token in enumerate(doc[:-2]):
        following = doc[i+1]
        # define sentence start if period + titlecase token
        if token.text == '.' and following.is_title:
            following.is_sent_start = True
        if token.text == '-' and following.text != '-':
            following.is_sent_start = True
    return doc

# The pipeline is built only after the component has been registered by the
# decorator above: add_pipe looks the component up by name and fails on a fresh
# kernel if it has not been registered yet.
nlp = spacy.load('en_core_sci_md', disable=['tagger','ner'])
nlp.add_pipe('sbd_component', first=True) 

#convert de-identification text into one token
def fix_deid_tokens(text, processed_text):
    """Merge every de-identification placeholder (``[** ... **]``) into one token.

    Args:
        text: The raw text that ``processed_text`` was built from. May be empty
            or ``None``, in which case nothing is merged.
        processed_text: The spaCy ``Doc`` for ``text``; it is modified in place.

    Returns:
        The same ``processed_text`` object.
    """
    deid_regex  = r"\[\*\*.{0,15}.*?\*\*\]" 
    if not text:
        return processed_text
    indexes = [m.span() for m in re.finditer(deid_regex,text,flags=re.IGNORECASE)]
    if not indexes:
        return processed_text
    # Doc.merge(start_idx=..., end_idx=...) is not available in spaCy v3; the
    # retokenizer is its replacement and applies all merges when the block exits.
    with processed_text.retokenize() as retokenizer:
        for start,end in indexes:
            span = processed_text.char_span(start, end)
            # char_span returns None when the character offsets do not line up
            # with token boundaries; such matches are left untouched.
            if span is not None:
                retokenizer.merge(span)
    return processed_text
    

def process_section(section, note, processed_sections):
    """Run the spaCy pipeline on one section and collect the result.

    Args:
        section: Row-like object whose ``'sections'`` entry holds the section text.
        note: Not used by this function.
        processed_sections: List that receives the processed section (appended in place).
    """
    raw_section = section['sections']
    # Pipeline first, then merge de-identification placeholders on its output.
    processed_sections.append(fix_deid_tokens(raw_section, nlp(raw_section)))

def process_note_helper(note):
    """Split a note into sections and process each one.

    Args:
        note: The note text, passed to ``sent_tokenize_rules`` for sectioning.

    Returns:
        List with one processed section per section of the note.
    """
    collected = []
    section_frame = pd.DataFrame({'sections': sent_tokenize_rules(note)})
    # Hand each row to process_section, which appends to `collected`.
    for _, section_row in section_frame.iterrows():
        process_section(section_row, note, collected)
    return collected

def process_text(sent, note):
    """Append one sentence to ``note['text']`` as a single line.

    Newlines inside the sentence are replaced by spaces and a trailing newline
    is added. Sentences that are empty or contain only whitespace are skipped:
    the output format uses blank lines as document boundaries, so writing them
    would split a document in two.

    Args:
        sent: Row-like object whose ``'sents'`` entry exposes a ``.text`` string.
        note: Mapping whose ``'text'`` entry accumulates the output (modified in place).
    """
    sent_text = sent['sents'].text
    if not sent_text.strip():
        return
    note['text'] += sent_text.replace('\n', ' ') + '\n'

def get_sentences(processed_section, note):
    """Append every sentence of a processed section to ``note['text']``.

    Args:
        processed_section: Row-like object whose ``'sections'`` entry exposes ``.sents``.
        note: Passed through to ``process_text``, which updates it.
    """
    # get sentences from spacy processing
    sentences = list(processed_section['sections'].sents)
    sent_frame = pd.DataFrame({'sents': sentences})
    for _, sentence_row in sent_frame.iterrows():
        process_text(sentence_row, note)

def process_note(note):
    """Replace ``note['text']`` with its sentence-per-line form.

    The original text is sectioned and processed, then ``note['text']`` is
    rebuilt one sentence per line. If anything fails, the error is printed and
    the note is returned with an empty ``'text'`` so every call returns the
    same kind of object; callers already skip empty texts when writing output.

    Args:
        note: Row-like object with a ``'text'`` entry (modified in place).

    Returns:
        The same ``note`` object.
    """
    try:
        note_text = note['text'] 
        note['text'] = ''
        processed_sections = process_note_helper(note_text)
        ps = pd.DataFrame({'sections': processed_sections})
        ps.apply(get_sentences, args=(note,), axis=1)
    except Exception as e:
        print ('error', e)
        # Discard any partially built text so a failed note contributes nothing.
        note['text'] = ''
    return note


In [None]:
import logging
logger = logging.getLogger("spacy")
logger.setLevel(logging.ERROR)

import warnings 
warnings.filterwarnings(action='ignore')

from tqdm.autonotebook import tqdm
import swifter
tqdm.pandas()
import time
note_limit = 100000000

import os

# For every popular concept that has no output file yet: query its notes, run
# process_note on each one, and write the resulting text to
# <OUTPUT_DIR><concept_cd>.txt with one extra newline after every note.
i = 0
for i, concept_cd in enumerate(pop_note_list, start=1):
    category = f'{concept_cd}'
    output_file_name = f"{OUTPUT_DIR}{category}.txt"

    if os.path.exists(output_file_name):
        continue

    note_query = f"""
SELECT encounter_num, n.patient_num, n.concept_cd, observation_blob, note_id, 
cd.name_char, cd.concept_path 
FROM notes n 
JOIN ({train_pat_query}) p
ON n.patient_num = p.patient_num
JOIN CONCEPT_DIMENSION cd ON 
cd.concept_cd = n.concept_cd
WHERE n.concept_cd = '{concept_cd}' AND length(n.observation_blob) > 50
LIMIT {note_limit}
"""

    # The first timer covers the query only.
    before = time.time()
    notes = pd.read_sql(note_query, cn)
    notes['text'] = notes['observation_blob']
    after = time.time()
    print(f"{concept_cd} ({i}/{len(pop_note_list)}): {after-before}")
    print('Number of notes: %d' %len(notes.index))
    notes['ind'] = list(range(len(notes.index)))
    formatted_notes = notes.progress_apply(process_note, axis=1)

    # The second timer covers the file write only.
    start = time.time()
    if formatted_notes.shape[0] == 0:
        continue

    print(output_file_name)
    with open(output_file_name,'w') as f:
        for text in formatted_notes['text']:
            if text is None or len(text) == 0:
                continue
            f.write(text + '\n')

    end = time.time()
    print (end-start)
    print ("Done formatting notes")
    

    
    
    

In [None]:
# Fetch up to 10 rows of the notes table for one concept code.
note_query = "SELECT * FROM notes n WHERE n.concept_cd IN ('NOTE:15611138') LIMIT 10"
notes = pd.read_sql(note_query, cn)
# Rendered as the cell output.
notes

In [None]:
# total count - 4,711,809


# note_query = f"""
# SELECT encounter_num, n.patient_num, n.concept_cd, observation_blob, note_id, 
# cd.name_char, cd.concept_path FROM notes n 
# JOIN ({train_pat_query}) p
# ON n.patient_num = p.patient_num
# JOIN CONCEPT_DIMENSION cd ON 
# cd.concept_cd = n.concept_cd
# WHERE n.concept_cd NOT IN ('NOTE:3691317', 'NOTE:3268562')
# LIMIT {note_limit}
# """

# Small sample for interactive testing: at most 20 notes of one concept.
# This rebinds the global note_limit used by the loops in earlier cells.
note_limit = 20

# Same joins as the per-concept export queries, restricted to one concept code.
# Unlike those queries there is no length(observation_blob) filter here.
note_query = f"""
SELECT encounter_num, n.patient_num, n.concept_cd, observation_blob, note_id, 
cd.name_char, cd.concept_path 
FROM notes n 
JOIN ({train_pat_query}) p
ON n.patient_num = p.patient_num
JOIN CONCEPT_DIMENSION cd ON 
cd.concept_cd = n.concept_cd
WHERE n.concept_cd IN ('NOTE:3268565')
LIMIT {note_limit}
"""

notes = pd.read_sql(note_query, cn)

In [None]:
# Display the queried notes.
# NOTE(review): the frame includes the observation_blob column — review the rendered output before sharing the notebook.
notes

In [None]:
# notes = notes[notes['category'] == category]
# Label for this sample; the write-out cell builds its file name from OUTPUT_DIR + category + '.txt'.
category = f'notelim_{note_limit}'
print('Number of notes: %d' %len(notes.index))
# Positional row counter stored alongside each note.
notes['ind'] = list(range(len(notes.index)))

In [None]:
# Only let spaCy log records of level ERROR or higher through.
import logging
logger = logging.getLogger("spacy")
logger.setLevel(logging.ERROR)

# Suppress all Python warnings from here on.
import warnings 
warnings.filterwarnings(action='ignore')

In [None]:
# from tqdm.autonotebook import tqdm
# import swifter
# formatted_notes = notes.swifter.apply(process_note, axis=1)

In [None]:
from tqdm.autonotebook import tqdm
# Registers the progress_apply method used on the next line.
tqdm.pandas()
# Run process_note on every row (axis=1) of the sampled notes with a progress bar.
formatted_notes = notes.progress_apply(process_note, axis=1)

In [None]:
# Write the formatted sample to OUTPUT_DIR + category + '.txt': each non-empty
# note text followed by one extra newline.
# The timer is started in this cell so the reported duration covers this write
# and does not depend on a `start` value left over from another cell.
start = time.time()
with open(OUTPUT_DIR  + category + '.txt','w') as f:
    for text in formatted_notes['text']:
        if text is not None and len(text) != 0:
            f.write(text)
            f.write('\n')

end = time.time()
print (end-start)
print ("Done formatting notes")

In [None]:
import os

directory = "/n/data1/hms/dbmi/beaulieu-jones/lab/transformer_training_data/preproc_sent"

# Collect the base names (without '.txt') of the files in `directory`.
# Entries with any other extension, or with none, are skipped: slicing off the
# last four characters of every entry would turn them into bogus names that
# later get a job script of their own.
filename_list = []
for filename in os.listdir(directory):
    stem, extension = os.path.splitext(filename)
    if extension == '.txt':
        filename_list.append(stem)
    
# joined_string = ", ".join(filename_list)
# print(joined_string)

In [None]:
# Shell escape: delete every *.sh file in the bert_preprocess_jobs directory.
!rm /n/data1/hms/dbmi/beaulieu-jones/lab/transformer_training_data/bert_preprocess_jobs/*.sh

In [None]:
from os.path import exists

# Write one SLURM script per name in filename_list whose pretraining_data/<name>.txt
# does not exist yet, and count how many scripts were written.
j=0

for concept in filename_list:
    # Script text: runs create_pretraining_data.py with preproc_sent/<concept>.txt
    # as input and pretraining_data/<concept>.txt as output.
    job_string = f"""#!/bin/bash
#SBATCH -t 0-12:00
#SBATCH -n 1
#SBATCH -p gpu_zak
#SBATCH	--account=zak_contrib_isk1
#SBATCH --mem=32G
#SBATCH -o /n/data1/hms/dbmi/beaulieu-jones/lab/transformer_training_data/bert_preprocess_jobs/output/{concept}_%j.out
#SBATCH -e /n/data1/hms/dbmi/beaulieu-jones/lab/transformer_training_data/bert_preprocess_jobs/err/{concept}_%j.err
module load conda2 gcc/9.2.0 cuda/11.2
source activate tf
python3 /home/bkb12/notebooks/transformer/create_pretraining_data.py --input_file /n/data1/hms/dbmi/beaulieu-jones/lab/transformer_training_data/preproc_sent/{concept}.txt --output_file /n/data1/hms/dbmi/beaulieu-jones/lab/transformer_training_data/pretraining_data/{concept}.txt  --vocab_file /home/bkb12/notebooks/transformer/resources/vocab.txt 
    """
    
    # Only names without an existing output file get a script.
    if not exists(f'/n/data1/hms/dbmi/beaulieu-jones/lab/transformer_training_data/pretraining_data/{concept}.txt'):
        j+=1
        with open(f'/n/data1/hms/dbmi/beaulieu-jones/lab/transformer_training_data/bert_preprocess_jobs/{concept}.sh', 'w') as f:
            f.write(job_string)
        
# Number of scripts written.
print(j)

In [None]:
# Shell escape: count the entries in the pretraining_data directory.
!ls /n/data1/hms/dbmi/beaulieu-jones/lab/transformer_training_data/pretraining_data/ | wc -l