In [16]:
import os, re, pickle, numpy as np

In [17]:
ls raw/concept_assertion_relation_training_data/beth/concept| wc -l

73


In [18]:
ls raw/concept_assertion_relation_training_data/partners/concept| wc -l

97


In [19]:
ls raw/concept_assertion_relation_training_data/beth/

[0m[01;34mast[0m/  [01;34mconcept[0m/  [01;34mrel[0m/  [01;34mtxt[0m/


In [20]:
# Root directories of the two training sources ("beth" and "partners").
# `reformatter` and `build_label_vocab` expect a `concept/` sub-directory
# (and, for `reformatter`, a `txt/` one) under a base directory like these.
TRAIN_DIRS = [
    './raw/concept_assertion_relation_training_data/beth/',
    './raw/concept_assertion_relation_training_data/partners/',
]
# Root directory of the reference-standard annotations for the test data.
TEST_DIR = './raw/reference_standard_for_test_data/'

In [36]:
def process_concept(concept_str):
    """
    Parse one annotation line of a concept (``.con``) file.

    Takes a string like
    'c="asymptomatic" 16:2 16:2||t="problem"'
    and returns a dictionary like
    {'t': 'problem', 'start_line': 16, 'start_pos': 2, 'end_line': 16, 'end_pos': 2,
     'c': 'asymptomatic'}

    The ``line:pos line:pos`` span is read from the END of the part that
    precedes the last '||'. Concept text that itself looks like a span
    (e.g. 'ratio 1:2 3:4') or that contains '||' is therefore kept intact.
    Runs of spaces inside the concept text are collapsed to a single space.

    Raises:
        ValueError: if the line does not have the expected shape. The
            offending line is printed before the exception propagates.
    """
    try:
        # Split on the LAST '||' so a '||' inside the concept text is harmless.
        position_bit, problem_bit = concept_str.rsplit('||', 1)
        t = problem_bit[3:-1]  # strip the leading 't="' and the closing quote

        # Raw string: '\s' / '\d' are invalid escapes in a normal string literal.
        # Anchoring at both ends ties the span to the end of the position part.
        match = re.search(
            r'^c="(.*)"\s(\d+):(\d+)\s(\d+):(\d+)\s*$', position_bit, flags=re.DOTALL
        )
        if match is None:
            raise ValueError('Malformed concept line: %r' % concept_str)

        # Normalise internal spacing of the concept text.
        c = ' '.join(y for y in match.group(1).split(' ') if y.strip() != '')
        start_line, start_pos, end_line, end_pos = (int(x) for x in match.groups()[1:])
    except Exception:
        # Show which line failed, then let the original error propagate.
        print(concept_str)
        raise

    return {
        't': t, 'start_line': start_line, 'start_pos': start_pos, 'end_line': end_line, 'end_pos': end_pos,
        'c': c,
    }

def build_label_vocab(base_dirs):
    """
    Collect the BIO label vocabulary from the concept files under `base_dirs`.

    Every ``<base_dir>/concept/*.con`` file is parsed line by line with
    `process_concept`; each distinct concept type ``t`` contributes the two
    labels ``'B-<t>'`` and ``'I-<t>'``.

    Args:
        base_dirs: iterable of directories, each containing a ``concept``
            sub-directory.

    Returns:
        A tuple ``(label_vocab, label_vocab_size)``. ``label_vocab`` maps every
        label (including ``'O'``) to itself. ``label_vocab_size`` is the number
        of ``B-``/``I-`` labels added; ``'O'`` is not counted.

    Raises:
        AssertionError: if a base directory has no ``concept`` sub-directory.
    """
    seen, label_vocab, label_vocab_size = set(['O']), {'O': 'O'}, 0

    for base_dir in base_dirs:
        concept_dir = os.path.join(base_dir, 'concept')

        assert os.path.isdir(concept_dir), "Directory structure doesn't match!"

        # Sorted so labels are inserted in the same order on every run; plain
        # set iteration order over strings changes with hash randomisation.
        ids = sorted(set(x[:-4] for x in os.listdir(concept_dir) if x.endswith('.con')))

        for i in ids:
            with open(os.path.join(concept_dir, '%s.con' % i)) as f:
                # Blank lines (e.g. a trailing newline) carry no annotation.
                concepts = [process_concept(x.strip()) for x in f if x.strip()]
            for c in concepts:
                concept_type = c['t']
                if concept_type in seen:
                    continue
                for prefix in ('B', 'I'):
                    label = '%s-%s' % (prefix, concept_type)
                    label_vocab[label] = label
                    label_vocab_size += 1
                seen.add(concept_type)
    return label_vocab, label_vocab_size

def reformatter(base, label_vocab, txt_dir = None, concept_dir = None):
    """
    Convert every record under `base` into "token label" text.

    For each record id, ``<txt_dir>/<id>.txt`` is split into space-separated
    tokens per line and ``<concept_dir>/<id>.con`` is parsed with
    `process_concept`. The first token of each concept is labelled
    ``label_vocab['B-<t>']``, every further token of the concept
    ``label_vocab['I-<t>']`` (also across line breaks), everything else ``'O'``.

    Args:
        base: directory containing ``txt`` and ``concept`` sub-directories
            (only used to build the defaults below).
        label_vocab: mapping with a ``'B-<t>'`` and ``'I-<t>'`` entry for every
            concept type that occurs.
        txt_dir: overrides ``<base>/txt``.
        concept_dir: overrides ``<base>/concept``.

    Returns:
        dict mapping record id -> string with one ``"<token> <label>"`` pair
        per line and a blank line between source text lines. Keys are inserted
        in sorted id order so downstream seeded shuffles are reproducible.

    Raises:
        AssertionError: if a directory is missing, the ``.txt`` and ``.con`` id
            sets differ, or the text of a single-line concept does not match
            the tokens at its span.
    """
    if txt_dir is None: txt_dir = os.path.join(base, 'txt')
    if concept_dir is None: concept_dir = os.path.join(base, 'concept')

    assert os.path.isdir(txt_dir) and os.path.isdir(concept_dir), "Directory structure doesn't match!"

    txt_ids = set(x[:-4] for x in os.listdir(txt_dir) if x.endswith('.txt'))
    concept_ids = set(x[:-4] for x in os.listdir(concept_dir) if x.endswith('.con'))

    assert txt_ids == concept_ids, (
        "id set doesn't match: txt - concept = %s, concept - txt = %s"
        "" % (str(txt_ids - concept_ids), str(concept_ids - txt_ids))
    )

    reprocessed_texts = {}
    # Sorted: set iteration order over strings varies between interpreter runs,
    # which would make the returned dict's key order (and any seeded
    # permutation of it) non-reproducible.
    for i in sorted(txt_ids):
        with open(os.path.join(txt_dir, '%s.txt' % i), mode='r') as f:
            lines = f.readlines()
        txt = [[y for y in x.strip().split(' ') if y.strip() != ''] for x in lines]
        line_starts_with_space = [x.startswith(' ') for x in lines]
        with open(os.path.join(concept_dir, '%s.con' % i), mode='r') as f:
            # Blank lines (e.g. a trailing newline) carry no annotation.
            concepts = [process_concept(x.strip()) for x in f if x.strip()]

        labels = [['O' for _ in line] for line in txt]
        for c in concepts:
            first_line, last_line = c['start_line'] - 1, c['end_line'] - 1
            begin_label = label_vocab['B-%s' % c['t']]
            inside_label = label_vocab['I-%s' % c['t']]

            # Sanity check (single-line concepts only): the tokens at the span
            # must spell out the annotated concept text.
            if first_line == last_line:
                line = first_line
                p_modifier = -1 if line_starts_with_space[line] else 0
                text = (' '.join(txt[line][c['start_pos']+p_modifier:c['end_pos']+1+p_modifier])).lower()
                assert text == c['c'], (
                    "Text mismatch! %s vs. %s (id: %s, line: %d)\nFull line: %s"
                    "" % (c['c'], text, i, line, txt[line])
                )

            for line in range(first_line, last_line + 1):
                # Positions are shifted by one on lines that begin with a space
                # (presumably the annotation offsets count the empty leading
                # token that `txt` drops -- TODO confirm against the data spec).
                p_modifier = -1 if line_starts_with_space[line] else 0
                start_pos = c['start_pos']+p_modifier if line == first_line else 0
                end_pos   = c['end_pos']+1+p_modifier if line == last_line else len(txt[line])

                # Label the whole [start_pos, end_pos) range as "inside", then
                # overwrite the very first token of the concept with "begin".
                # This also covers the last token of non-final lines and the
                # first token of continuation lines of multi-line concepts.
                for j in range(start_pos, end_pos): labels[line][j] = inside_label
                if line == first_line: labels[line][start_pos] = begin_label

        joined_words_and_labels = [zip(txt_line, label_line) for txt_line, label_line in zip(txt, labels)]

        out_str = '\n\n'.join(
            ['\n'.join(['%s %s' % p for p in joined_line]) for joined_line in joined_words_and_labels]
        )

        reprocessed_texts[i] = out_str

    return reprocessed_texts

In [37]:
# Build the label vocabulary from the training annotations only (the test
# reference standard is not scanned). TRAIN_DIRS is the shared list of the
# beth and partners training directories defined in the configuration cell.
label_vocab, label_vocab_size = build_label_vocab(TRAIN_DIRS)

In [38]:
label_vocab

{'O': 'O',
 'B-treatment': 'B-treatment',
 'I-treatment': 'I-treatment',
 'B-problem': 'B-problem',
 'I-problem': 'I-problem',
 'B-test': 'B-test',
 'I-test': 'I-test'}

In [45]:
# Reformat each data source into {record id: "token label" text}.
reprocessed_texts = {}
reprocessed_texts['beth'] = reformatter(
    'raw/concept_assertion_relation_training_data/beth/', label_vocab
)
reprocessed_texts['partners'] = reformatter(
    'raw/concept_assertion_relation_training_data/partners/', label_vocab
)
# The test texts and their reference annotations live in separate directories,
# so both are passed explicitly.
reprocessed_texts['test'] = reformatter(
    'raw/reference_standard_for_test_data/',
    label_vocab,
    txt_dir='raw/test_data/',
    concept_dir='raw/reference_standard_for_test_data/concept',
)

In [46]:
# Report how many records were reformatted for each data source.
for key, txt_by_record in reprocessed_texts.items():
    print("%s: %d" % (key, len(txt_by_record)))

beth: 73
partners: 97
test: 256


In [54]:
# Spot-check: show the reformatted "token label" text of test record '0397'.
print(reprocessed_texts['test']['0397'])

Admission O
Date O
: O

2014-04-17 O

Discharge O
Date O
: O

2014-04-25 O

Date O
of O
Birth O
: O

1953-07-04 O

Sex O
: O

M O

Service O
: O

CARDIOTHORACIC O

Allergies O
: O

Patient O
recorded O
as O
having O
No O
Known B-problem
Allergies I-problem
to O
Drugs B-treatment

Attending O
: O
Christopher O
Q O
Thompson O
, O
M.D. O

Chief O
Complaint O
: O

Chest B-problem
pain I-problem

Major O
Surgical O
or O
Invasive O
Procedure O
: O

2014-04-18 O
Off B-treatment
Pump I-treatment
Coronary I-treatment
Artery I-treatment
Bypass I-treatment
Grafting I-treatment
utilizing O
the O
left O
internal O
mammary O
artery O
to O
left O
anterior O
descending O
artery O
, O
free O
right O
internal O
mammary O
to O
obtuse O
marginal O
and O
vein B-treatment
graft I-treatment
to I-treatment
diagonal I-treatment
. O

History O
of O
Present O
Illness O
: O

This O
is O
a O
60 O
year O
old O
male O
with O
new B-problem
onset I-problem
chest I-problem
pain I-problem
. O

Nuclear B-test
myocardial 

In [55]:
# Seed NumPy's global RNG; every np.random.permutation call below draws from it,
# so the results also depend on the cells being run in order.
np.random.seed(1)

In [56]:
# Shuffle the Partners record ids; the first int(0.9 * N) go to train, the rest to dev.
all_partners_train_ids = np.random.permutation(list(reprocessed_texts['partners']))
N = all_partners_train_ids.shape[0]
N_train = int(0.9 * N)

partners_train_ids, partners_dev_ids = np.split(all_partners_train_ids, [N_train])

In [57]:
# Sizes of the Partners train/dev id splits computed above.
print("Partners # Patients: Train: %d, Dev: %d" %(len(partners_train_ids), len(partners_dev_ids)))

Partners # Patients: Train: 87, Dev: 10


In [58]:
# Shuffle the Beth record ids; the first int(0.9 * N) go to train, the rest to dev.
# Note: N and N_train are overwritten here with the Beth values.
all_beth_train_ids = np.random.permutation(list(reprocessed_texts['beth']))
N = all_beth_train_ids.shape[0]
N_train = int(0.9 * N)

beth_train_ids, beth_dev_ids = np.split(all_beth_train_ids, [N_train])

In [59]:
# Sizes of the Beth train/dev id splits computed above.
print("Beth # Patients: Train: %d, Dev: %d" % (len(beth_train_ids), len(beth_dev_ids)))

Beth # Patients: Train: 65, Dev: 8


In [60]:
# Combined (Partners + Beth) train and dev record counts.
print("Merged # Patients: Train: %d, Dev: %d" % (
  len(partners_train_ids) + len(beth_train_ids), len(beth_dev_ids) + len(partners_dev_ids)
))

Merged # Patients: Train: 152, Dev: 18


In [61]:
# Build the merged (Partners + Beth) splits. np.random.permutation shuffles the
# order of whole records; the text inside each record is untouched. Records are
# joined with a blank line, the same separator used between lines inside a record.
# NOTE: each permutation draws from the global RNG, so the statement order matters.
merged_train_txt = '\n\n'.join(np.random.permutation(
    [reprocessed_texts['partners'][i] for i in partners_train_ids] + 
    [reprocessed_texts['beth'][i] for i in beth_train_ids]
))
merged_dev_txt = '\n\n'.join(np.random.permutation(
    [reprocessed_texts['partners'][i] for i in partners_dev_ids] + 
    [reprocessed_texts['beth'][i] for i in beth_dev_ids]
))
# The test split contains every reformatted test record.
merged_test_txt = '\n\n'.join(np.random.permutation(list(reprocessed_texts['test'].values())))

In [62]:
# Counts blank-line-separated chunks, i.e. one "sample" per source text line
# (reformatter puts a blank line between lines), not one per record.
print("Merged # Samples: Train: %d, Dev: %d, Test: %d" % (
    len(merged_train_txt.split('\n\n')),
    len(merged_dev_txt.split('\n\n')),
    len(merged_test_txt.split('\n\n'))
))

Merged # Samples: Train: 14666, Dev: 1649, Test: 27626


In [63]:
# Partners-only train/dev splits, using the same id splits as the merged data.
# NOTE: each permutation draws from the global RNG, so the statement order matters.
partners_train_txt = '\n\n'.join(np.random.permutation(
    [reprocessed_texts['partners'][i] for i in partners_train_ids]
))
partners_dev_txt = '\n\n'.join(np.random.permutation(
    [reprocessed_texts['partners'][i] for i in partners_dev_ids]
))
# Same test records as merged_test_txt, but shuffled by a separate permutation
# call, so the record order may differ from merged_test_txt.
partners_test_txt = '\n\n'.join(np.random.permutation(list(reprocessed_texts['test'].values())))

In [64]:
# Destination path for every artefact written by the final cell:
# three TSV splits per dataset variant, plus the pickled label vocabulary.
OUT_FILES = {
    'merged_train': './processed/merged/train.tsv',
    'merged_dev':   './processed/merged/dev.tsv',
    'merged_test':  './processed/merged/test.tsv', 
    'partners_train': './processed/partners/train.tsv',
    'partners_dev':   './processed/partners/dev.tsv',
    'partners_test':  './processed/partners/test.tsv', 
    'vocab': './processed/label_vocab.pkl'
}

In [67]:
# Create the output directories first: open(..., mode='w') does not create
# missing parent directories and would fail with FileNotFoundError on a
# fresh checkout.
for out_path in OUT_FILES.values():
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

# Write the merged (Partners + Beth) splits.
with open(OUT_FILES['merged_train'], mode='w') as f:
    f.write(merged_train_txt)
with open(OUT_FILES['merged_dev'], mode='w') as f:
    f.write(merged_dev_txt)
with open(OUT_FILES['merged_test'], mode='w') as f:
    f.write(merged_test_txt)

# Write the Partners-only splits.
with open(OUT_FILES['partners_train'], mode='w') as f:
    f.write(partners_train_txt)
with open(OUT_FILES['partners_dev'], mode='w') as f:
    f.write(partners_dev_txt)
with open(OUT_FILES['partners_test'], mode='w') as f:
    f.write(partners_test_txt)

# Persist the label vocabulary (binary mode is required for pickle).
with open(OUT_FILES['vocab'], mode='wb') as f:
    pickle.dump(label_vocab, f)