# Preprocess

Get data into the right format. Create datasets for the original data, synthetic data as well as combined together. Lastly also an original 2x

In [10]:
import pandas as pd
import numpy as np
from pathlib import Path

In [11]:
DATA = Path('data/')
READMISSION_PREDICTION = DATA/'readmission_prediction/'
READMISSION_PREDICTION.mkdir(exist_ok=True)

data = pd.read_csv(DATA/'preprocessed/ref_test.tsv', sep='\t') # only test file
original = data[['text','30d_unplan_readmit']]

with open(DATA/'t2t_experiments/full_context/output/transformer_decoded/tgt-test.001.txt', 'r') as f:
    synthetic = f.readlines()
synthetic=pd.DataFrame({'text':synthetic})
synthetic['30d_unplan_readmit'] = original['30d_unplan_readmit']

In [12]:
print(len(synthetic))
print(len(original))

5727
5727


In [13]:
original_val_test=original.sample(frac=0.2, replace=False, random_state=1) # 80% train
excluded = list(original_val_test.index.values) 
original_train = original[~original.index.isin(excluded)]
len(original_train)

4582

In [14]:
synthetic_val_test=synthetic.sample(frac=0.2, replace=False, random_state=1) # 80% train
excluded = list(synthetic_val_test.index.values) 
synthetic_train = synthetic[~synthetic.index.isin(excluded)]
len(synthetic_train)

4582

In [15]:
combined = original_train.append(synthetic_train, ignore_index=True)
original_2x = original_train.append(original_train, ignore_index=True)

Split each dataset into train, test and eval and save to disk:

We need to ensure that the validation and test sets have never been seen before by the training set

In [16]:
model = 'transformer'

ORIGINAL = READMISSION_PREDICTION/model/'original/'
ORIGINAL_2X = READMISSION_PREDICTION/model/'original_2x/'
SYNTHETIC = READMISSION_PREDICTION/model/'synthetic/'
COMBINED = READMISSION_PREDICTION/model/'combined/'

ORIGINAL.mkdir(parents=True,exist_ok=True)
ORIGINAL_2X.mkdir(parents=True,exist_ok=True)
SYNTHETIC.mkdir(parents=True,exist_ok=True)
COMBINED.mkdir(parents=True,exist_ok=True)

In [17]:
original_val_test = original_val_test.reset_index(drop=True)
original_train = original_train.reset_index(drop=True)

split_idx_test = int(0.5 * len(original_val_test)) # 10% val, 10% test

val = original_val_test.loc[:split_idx_test]
test = original_val_test[split_idx_test:]

In [18]:
def split_dataset(train, directory):
    
    print (len(train), len(val), len(test))
    
    train.to_csv(directory/'train.csv', sep=',', index = False)
    val.to_csv(directory/'val.csv', sep=',', index = False)
    test.to_csv(directory/'test.csv', sep=',', index = False)
    
    return

In [19]:
split_dataset(original_train, ORIGINAL)
split_dataset(original_2x, ORIGINAL_2X)
split_dataset(synthetic_train, SYNTHETIC)
split_dataset(combined, COMBINED)

4582 573 573
9164 573 573
4582 573 573
9164 573 573


### Output categories

In [23]:
categories = list(set(original['30d_unplan_readmit'].values))
print(categories)

['N', 'Y']


In [24]:
pd.DataFrame(categories).to_csv(READMISSION_PREDICTION/'labels.csv', sep=',', index = False,header=False)

## EDA

In [25]:
EDA = READMISSION_PREDICTION/model/'original_eda/'

EDA.mkdir(parents=True,exist_ok=True)

In [26]:
from eda import eda

In [27]:
alpha = 0.1
num_aug = 1 # number of augmented sentences per original sentence
aug=[]

for i in range(len(original_train)):
    sentence = original_train['text'][i]
    aug_sentences = eda.eda(sentence, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha, num_aug=num_aug)
    aug.append(aug_sentences)

In [28]:
aug_text = pd.Series((v[0] for v in aug))
original_eda = original_train
original_eda['text']=aug_text

In [29]:
eda = original_train.append(original_eda, ignore_index=True)

In [30]:
split_dataset(eda, EDA)

9164 573 573
