# Preprocess

In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

In [2]:
low_resource = True  # change as appropriate

# Select the input (DATA) and output (MODEL) directories for the chosen setting.
# Plain truthiness is used rather than comparing against True with `==`.
if low_resource:
    DATA = Path('../data/preprocessed/low_resource/')
    MODEL = Path('../data/transformer-xl/low_resource')
else:
    DATA = Path('../data/preprocessed/')
    MODEL = Path('../data/transformer-xl')

# Create the output directory (and any missing parents); no error if it already exists.
MODEL.mkdir(parents=True, exist_ok=True)

### Training set

In [3]:
# Load the training source file into a one-column frame; each row keeps the
# raw line exactly as read (including its line ending).
with open(DATA/'src-train.txt', 'r') as src_file:
    train_src = pd.DataFrame({'text': list(src_file)})

In [4]:
# Load the training target file into a one-column frame; each row keeps the
# raw line exactly as read (including its line ending).
with open(DATA/'tgt-train.txt', 'r') as tgt_file:
    train_tgt = pd.DataFrame({'text': list(tgt_file)})

In [5]:
# Build one training example per line pair, laid out as:
#   "= discharge summary = " / source tokens / "= = note = =" / target tokens
# The texts are collected in a list and assigned back to the column explicitly:
# writing to the `row` yielded by iterrows() is not guaranteed to update the
# DataFrame, because pandas may hand back a copy instead of a view.
if len(train_src) != len(train_tgt):
    raise ValueError(
        "src-train and tgt-train line counts differ: {} vs {}".format(
            len(train_src), len(train_tgt)
        )
    )

combined_texts = []
for src_line, tgt_line in zip(train_src['text'], train_tgt['text']):
    # split() already discards the trailing newline, so nothing is sliced off
    # by hand (a manual [:-1] would eat a real character on a line that has
    # no trailing newline, e.g. the last line of the file).
    src_tokens = src_line.split()[:512]
    tgt_budget = 1024 - len(src_tokens)  # cap sequence at 1024 tokens in length
    tgt_tokens = tgt_line.split()[:tgt_budget]
    combined_texts.append(
        "= discharge summary = " + '\n' + " ".join(src_tokens)
        + '\n' + "= = note = =" + '\n' + " ".join(tgt_tokens)
    )

train_src['text'] = combined_texts

In [6]:
# Show the text stored at index label 10 as a quick sanity check.
train_src.loc[10, 'text']

'= discharge summary = \nadmission date : [ 2134/9/1 ] [ month / day <H> M <G> 47 <A> black <E> acute respiratory failure | acute diastolic heart failure | atrial flutter | obesity hypoventilation syndrome | obstructive sleep apnea (adult)(pediatric) | unspecified schizophrenia, unspecified | bipolar disorder, unspecified | obesity, unspecified | pure hypercholesterolemia | diabetes mellitus without mention of complication, type ii or unspecified type, not stated as uncontrolled | congestive heart failure, unspecified | atrial fibrillation | polycythemia vera <D> <P> furosemide , 40mg Tablet | docusate sodium , 100mg Capsule | furosemide , 100mg/10mL Vial <M> <T> Creatine Kinase, MB Isoenzyme , 1 , ng/mL | Urea Nitrogen , 17 , mg/dL | Chloride , 92 , mEq/L , abnormal | Calcium, Total , 8.5 , mg/dL | Bicarbonate , 40 , mEq/L , abnormal | Anion Gap , 12 , mEq/L | Phosphate , 3.6 , mg/dL | Potassium , 3.9 , mEq/L | Sodium , 140 , mEq/L | Hematocrit , 46.6 , % | Creatinine , 0.7 , mg/dL | 

In [7]:
# Write the training frame to disk, one '%s'-formatted row per record,
# each terminated by the platform line separator.
train_out_path = MODEL/'input-text.txt'
np.savetxt(fname=train_out_path, X=train_src, fmt='%s', newline=os.linesep)

### Validation set

In [8]:
# Load the validation source file into a one-column frame; each row keeps the
# raw line exactly as read (including its line ending).
with open(DATA/'src-val.txt', 'r') as src_file:
    val_src = pd.DataFrame({'text': list(src_file)})

In [9]:
# Load the validation target file into a one-column frame; each row keeps the
# raw line exactly as read (including its line ending).
with open(DATA/'tgt-val.txt', 'r') as tgt_file:
    val_tgt = pd.DataFrame({'text': list(tgt_file)})

In [10]:
# Build one validation example per line pair, laid out as:
#   "= discharge summary = " / source tokens / "= = note = =" / target tokens
# The texts are collected in a list and assigned back to the column explicitly:
# writing to the `row` yielded by iterrows() is not guaranteed to update the
# DataFrame, because pandas may hand back a copy instead of a view.
if len(val_src) != len(val_tgt):
    raise ValueError(
        "src-val and tgt-val line counts differ: {} vs {}".format(
            len(val_src), len(val_tgt)
        )
    )

combined_texts = []
for src_line, tgt_line in zip(val_src['text'], val_tgt['text']):
    # split() already discards the trailing newline, so nothing is sliced off
    # by hand (a manual [:-1] would eat a real character on a line that has
    # no trailing newline, e.g. the last line of the file).
    src_tokens = src_line.split()[:512]
    tgt_budget = 1024 - len(src_tokens)  # cap sequence at 1024 tokens in length
    tgt_tokens = tgt_line.split()[:tgt_budget]
    combined_texts.append(
        "= discharge summary = " + '\n' + " ".join(src_tokens)
        + '\n' + "= = note = =" + '\n' + " ".join(tgt_tokens)
    )

val_src['text'] = combined_texts

In [11]:
# Write the validation frame to disk, one '%s'-formatted row per record,
# each terminated by the platform line separator.
val_out_path = MODEL/'val-input-text.txt'
np.savetxt(fname=val_out_path, X=val_src, fmt='%s', newline=os.linesep)

### Test set

In [12]:
# Load the test source file into a one-column frame; each row keeps the
# raw line exactly as read (including its line ending).
with open(DATA/'src-test.txt', 'r') as src_file:
    test_src = pd.DataFrame({'text': list(src_file)})

In [13]:
# Build one test prompt per source line, laid out as:
#   "= discharge summary = " / source tokens / "= = note = ="
# No target text is appended here; the prompt ends at the note marker.
# The texts are assigned back to the column explicitly: writing to the `row`
# yielded by iterrows() is not guaranteed to update the DataFrame, because
# pandas may hand back a copy instead of a view.
test_prompts = []
for src_line in test_src['text']:
    # split() already discards the trailing newline, so nothing is sliced off
    # by hand (a manual [:-1] would eat a real character on a line that has
    # no trailing newline, e.g. the last line of the file).
    src_tokens = src_line.split()[:512]  # keep at most 512 source tokens
    test_prompts.append(
        "= discharge summary = " + '\n' + " ".join(src_tokens) + '\n' + "= = note = ="
    )

test_src['text'] = test_prompts

In [14]:
# Write the test frame to disk, one '%s'-formatted row per record,
# each terminated by the platform line separator.
test_out_path = MODEL/'test-input-text.txt'
np.savetxt(fname=test_out_path, X=test_src, fmt='%s', newline=os.linesep)