# Preprocess

Get data into the right format. Create datasets for the original data, synthetic data as well as combined together. Lastly also an original 2x

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split

In [2]:
#model = 'transformer'
model = 'gpt2'

DATA = Path('data/')

#SYNTHETIC_DATA=DATA/'t2t_experiments/full_context/output/transformer_decoded/tgt-test.001.txt'
SYNTHETIC_DATA=DATA/'gpt2/test-output-text.txt'

READMISSION_PREDICTION = DATA/'readmission_prediction/'
READMISSION_PREDICTION.mkdir(exist_ok=True)

In [3]:
data = pd.read_csv(DATA/'preprocessed/ref_test.tsv', sep='\t') # only test file
original = data[['text','30d_unplan_readmit']]

with open(SYNTHETIC_DATA, 'r') as f:
    synthetic = f.readlines()
synthetic=pd.DataFrame({'text':synthetic})
synthetic['30d_unplan_readmit'] = original['30d_unplan_readmit']

In [4]:
print(len(synthetic))
print(len(original))

5727
5727


#### Stratified splitting

The proportion of positive cases of readmittance is very low so we must ensure that they are equally represented in the training, validation and test splits

In [5]:
original_train, original_val_test, y_train, y_test = train_test_split(original[['text']], original[['30d_unplan_readmit']],
                                                    stratify=original[['30d_unplan_readmit']], 
                                                    test_size=0.2)

original_train = pd.concat([original_train, y_train], axis=1)
original_val_test = pd.concat([original_val_test, y_test], axis=1)
original_train=original_train.reset_index(drop=True)

In [6]:
val, test, y_val, y_test = train_test_split(original_val_test[['text']], original_val_test[['30d_unplan_readmit']],
                                                    stratify=original_val_test[['30d_unplan_readmit']], 
                                                    test_size=0.5)

val = pd.concat([val, y_val], axis=1)
test = pd.concat([test, y_test], axis=1)

In [7]:
excluded = list(original_val_test.index.values) 
synthetic_train = synthetic[~synthetic.index.isin(excluded)]
len(synthetic_train)

4581

In [8]:
print(len(original_train),len(synthetic_train),len(val),len(test))

4581 4581 573 573


#### Upsampling

The dataset is very imbalanced so we will upsample the positive samples. First the original:

In [9]:
positives = original_train[original_train['30d_unplan_readmit'] == 'Y']
negatives = original_train[original_train['30d_unplan_readmit'] == 'N']
print(len(positives))
print(len(negatives))

582
3999


In [10]:
upsampled_positives=positives

while len(upsampled_positives) < len(negatives):
    for i, row in positives.iterrows():
        upsampled_positives = upsampled_positives.append(
                                {'text':row['text'],'30d_unplan_readmit': row['30d_unplan_readmit']}, ignore_index=True)
        if (len(upsampled_positives) == len(negatives)):
            break
    print ("loop")
    
original_upsampled = upsampled_positives.append(negatives, ignore_index=True)
print(len(original_upsampled[original_upsampled['30d_unplan_readmit'] == 'Y']))
print(len(original_upsampled[original_upsampled['30d_unplan_readmit'] == 'N']))

loop
loop
loop
loop
loop
loop
3999
3999


Then the synthetic:

In [11]:
positives = synthetic_train[synthetic_train['30d_unplan_readmit'] == 'Y']
negatives = synthetic_train[synthetic_train['30d_unplan_readmit'] == 'N']
print(len(positives))
print(len(negatives))

582
3999


In [12]:
upsampled_positives=positives

while len(upsampled_positives) < len(negatives):
    for i, row in positives.iterrows():
        upsampled_positives = upsampled_positives.append(
                                {'text':row['text'],'30d_unplan_readmit': row['30d_unplan_readmit']}, ignore_index=True)
        if (len(upsampled_positives) == len(negatives)):
            break
    print ("loop")
    
synthetic_upsampled = upsampled_positives.append(negatives, ignore_index=True)
print(len(synthetic_upsampled[synthetic_upsampled['30d_unplan_readmit'] == 'Y']))
print(len(synthetic_upsampled[synthetic_upsampled['30d_unplan_readmit'] == 'N']))

loop
loop
loop
loop
loop
loop
3999
3999


Now we combine them in different combinations:

In [13]:
combined = original_upsampled.append(synthetic_upsampled, ignore_index=True)
original_2x = original_upsampled.append(original_upsampled, ignore_index=True)

#### Final dataset split

Split each dataset into train, test and eval and save to disk:

We need to ensure that the validation and test sets have never been seen before by the training set

In [14]:
ORIGINAL = READMISSION_PREDICTION/model/'original/'
ORIGINAL_2X = READMISSION_PREDICTION/model/'original_2x/'
SYNTHETIC = READMISSION_PREDICTION/model/'synthetic/'
COMBINED = READMISSION_PREDICTION/model/'combined/'

In [15]:
def split_dataset(train, directory):
    
    directory.mkdir(parents=True,exist_ok=True)
    
    print (len(train), len(val), len(test))
    
    train = train.sample(frac=1).reset_index(drop=True) # shuffle dataset
    
    train.to_csv(directory/'train.csv', sep=',', index = False)
    val.to_csv(directory/'val.csv', sep=',', index = False)
    test.to_csv(directory/'test.csv', sep=',', index = False)
    
    return

Below we actually call the `split_dataset()` function for each of the datasets we want. If we have already run this notebook once, we can comment out `original` and `original_2x` because these will exactly the same for each model. Similarly we don't need to run the EDA section further down

In [16]:
#split_dataset(original_upsampled, ORIGINAL)
#split_dataset(original_2x, ORIGINAL_2X)
split_dataset(synthetic_upsampled, SYNTHETIC)
split_dataset(combined, COMBINED)

7998 573 573
15996 573 573


### Output categories

In [17]:
categories = list(set(original_train['30d_unplan_readmit'].values))
categories.sort(reverse=True)
print(categories)

['Y', 'N']


In [18]:
pd.DataFrame(categories).to_csv(READMISSION_PREDICTION/'labels.csv', sep=',', index = False,header=False)

## EDA

In [15]:
EDA = READMISSION_PREDICTION/model/'original_eda/'

EDA.mkdir(parents=True,exist_ok=True)

In [16]:
from eda import eda

In [17]:
alpha = 0.1
num_aug = 1 # number of augmented sentences per original sentence
aug=[]

for i in range(len(original_train)):
    sentence = original_train['text'][i]
    aug_sentences = eda.eda(sentence, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha, num_aug=num_aug)
    aug.append(aug_sentences)

In [18]:
aug_text = pd.Series((v[0] for v in aug))
original_eda = original_train
original_eda['text']=aug_text

In [21]:
positives = original_eda[original_eda['30d_unplan_readmit'] == 'Y']
negatives = original_eda[original_eda['30d_unplan_readmit'] == 'N']
print(len(positives))
print(len(negatives))

582
3999


In [22]:
upsampled_positives=positives

while len(upsampled_positives) < len(negatives):
    for i, row in positives.iterrows():
        upsampled_positives = upsampled_positives.append(
                                {'text':row['text'],'30d_unplan_readmit': row['30d_unplan_readmit']}, ignore_index=True)
        if (len(upsampled_positives) == len(negatives)):
            break
    print ("loop")
    
original_eda = upsampled_positives.append(negatives, ignore_index=True)
print(len(original_eda[original_eda['30d_unplan_readmit'] == 'Y']))
print(len(original_eda[original_eda['30d_unplan_readmit'] == 'N']))

loop
loop
loop
loop
loop
loop
3999
3999


In [23]:
eda = original_upsampled.append(original_eda, ignore_index=True)

In [24]:
split_dataset(eda, EDA)

15996 573 573
