# Preprocess

Get data into the right format. Retrieve test dataset from earlier which is a superset of the phenotype data (annotations.csv). Create datasets for the original data, synthetic data as well as combined together. Lastly also an original 2x

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
model = 'transformer'
#model = 'gpt2'

DATA = Path('data/')

SYNTHETIC_DATA=DATA/'t2t_experiments/transformer/low_resource/full_context/output/transformer_decoded/tgt-test.001.txt'
#SYNTHETIC_DATA=DATA/'gpt2/low_resource/test-output-text.txt'

PHENOTYPE_CLASSIFICATION = DATA/'phenotype_classification/low_resource'
PHENOTYPE_CLASSIFICATION.mkdir(exist_ok=True)

In [3]:
data = pd.read_csv(DATA/'preprocessed/low_resource/ref_test.tsv', sep='\t') # only test file
data = data[['text','hadm_id']]
annotations = pd.read_csv(DATA/'annotations.csv', sep=',')
with open(SYNTHETIC_DATA, 'r') as f:
    synthetic_data = f.readlines()
synthetic_data=pd.DataFrame({'text':synthetic_data})

In [4]:
annotations.rename(columns = {'Hospital.Admission.ID':'hadm_id'}, inplace = True)
annotations.rename(columns = {'subject.id':'subject_id'}, inplace = True)
print(len(annotations))

pheno_admissions = list(set(annotations['hadm_id']))
len(pheno_admissions)

1610


1561

In [5]:
print(len(data))
original = data[data['hadm_id'].isin(pheno_admissions)]
len(original)

1846


1846

In [6]:
print(len(synthetic_data))
synthetic_data['hadm_id'] = data['hadm_id']
synthetic = synthetic_data[synthetic_data['hadm_id'].isin(pheno_admissions)]
len(synthetic)

1846


1846

In [7]:
annotations = annotations.drop_duplicates(keep="last")

In [8]:
original=original.merge(annotations, left_on='hadm_id', right_on='hadm_id')
synthetic=synthetic.merge(annotations, left_on='hadm_id', right_on='hadm_id')

In [9]:
#if model == 'gpt2':
#    for index, row in synthetic.iterrows():
#        synthetic.at[index, 'text']=row['text'][1:-1]

In [10]:
print(len(synthetic))
print(len(original))

1875
1875


In [11]:
original = original.drop(['hadm_id','subject_id','chart.time','cohort'], axis=1)
synthetic = synthetic.drop(['hadm_id','subject_id','chart.time','cohort'], axis=1)

In [12]:
original_val_test=original.sample(frac=0.2, replace=False, random_state=1) # 80% train
excluded = list(original_val_test.index.values) 
original_train = original[~original.index.isin(excluded)]
len(original_train)

1500

In [13]:
synthetic_val_test=synthetic.sample(frac=0.2, replace=False, random_state=1) # 80% train
excluded = list(synthetic_val_test.index.values) 
synthetic_train = synthetic[~synthetic.index.isin(excluded)]
len(synthetic_train)

1500

In [14]:
combined = original_train.append(synthetic_train, ignore_index=True)
original_2x = original_train.append(original_train, ignore_index=True)

Split each dataset into train, test and eval and save to disk:

We need to ensure that the validation and test sets have never been seen before by the training set

In [15]:
ORIGINAL = PHENOTYPE_CLASSIFICATION/model/'original/'
ORIGINAL_2X = PHENOTYPE_CLASSIFICATION/model/'original_2x/'
SYNTHETIC = PHENOTYPE_CLASSIFICATION/model/'synthetic/'
COMBINED = PHENOTYPE_CLASSIFICATION/model/'combined/'

In [16]:
original_val_test = original_val_test.reset_index(drop=True)
original_train = original_train.reset_index(drop=True)

split_idx_test = int(0.5 * len(original_val_test)) # 10% val, 10% test

val = original_val_test.loc[:split_idx_test]
test = original_val_test[split_idx_test:]

In [17]:
def split_dataset(train, directory):
    
    directory.mkdir(parents=True,exist_ok=True)
    
    print (len(train), len(val), len(test))
    
    train = train.sample(frac=1).reset_index(drop=True) # shuffle dataset
    
    train.to_csv(directory/'train.csv', sep=',', index = False)
    val.to_csv(directory/'val.csv', sep=',', index = False)
    test.to_csv(directory/'test.csv', sep=',', index = False)
    
    return

Below we actually call the `split_dataset()` function for each of the datasets we want. If we have already run this notebook once, we can comment out `original` and `original_2x` because these will exactly the same for each model. Similarly we don't need to run the EDA section further down

In [18]:
#split_dataset(original_train, ORIGINAL)
#split_dataset(original_2x, ORIGINAL_2X)
split_dataset(synthetic_train, SYNTHETIC)
split_dataset(combined, COMBINED)

1500 188 188
3000 188 188


### Output categories

In [26]:
categories = sorted(synthetic)
categories.remove('text')
print(len(categories))
print(categories)

14
['Advanced.Cancer', 'Advanced.Heart.Disease', 'Advanced.Lung.Disease', 'Alcohol.Abuse', 'Chronic.Neurological.Dystrophies', 'Chronic.Pain.Fibromyalgia', 'Dementia', 'Depression', 'Developmental.Delay.Retardation', 'Non.Adherence', 'Obesity', 'Other.Substance.Abuse', 'Schizophrenia.and.other.Psychiatric.Disorders', 'Unsure']


In [27]:
pd.DataFrame(categories).to_csv(PHENOTYPE_CLASSIFICATION/'labels.csv', sep=',', index = False,header=False)

## EDA

In [28]:
EDA = PHENOTYPE_CLASSIFICATION/model/'original_eda/'

EDA.mkdir(parents=True,exist_ok=True)

In [29]:
from eda import eda

In [31]:
alpha = 0.1
num_aug = 1 # number of augmented sentences per original sentence
aug=[]

for i in range(len(original_train)):
    sentence = original_train['text'][i]
    aug_sentences = eda.eda(sentence, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha, num_aug=num_aug)
    aug.append(aug_sentences)

In [None]:
aug_text = pd.Series((v[0] for v in aug))
original_eda = original_train
original_eda['text']=aug_text

In [None]:
eda = original_train.append(original_eda, ignore_index=True)

In [None]:
split_dataset(eda, EDA)