# Preprocess

Get data into the right format. Retrieve test dataset from earlier which is a superset of the phenotype data (annotations.csv). Create datasets for the original data, synthetic data as well as combined together. Lastly also an original 2x

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
DATA = Path('data/')
PHENOTYPE_CLASSIFICATION = DATA/'phenotype_classification/'
PHENOTYPE_CLASSIFICATION.mkdir(exist_ok=True)

data = pd.read_csv(DATA/'preprocessed/ref_test.tsv', sep='\t') # only test file
data = data[['text','hadm_id']]
annotations = pd.read_csv(DATA/'annotations.csv', sep=',')
with open(DATA/'t2t_experiments/full_context/output/transformer_decoded/tgt-test.001.txt', 'r') as f:
    synthetic_data = f.readlines()
synthetic_data=pd.DataFrame({'text':synthetic_data})

In [3]:
annotations.columns.values[0] = 'hadm_id'
annotations.columns.values[1] = 'subject_id'

pheno_admissions = list(set(annotations['hadm_id']))
len(pheno_admissions)

1561

In [4]:
print(len(data))
original = data[data['hadm_id'].isin(pheno_admissions)]
len(original)

5727


1846

In [5]:
print(len(synthetic_data))
synthetic_data['hadm_id'] = data['hadm_id']
synthetic = synthetic_data[synthetic_data['hadm_id'].isin(pheno_admissions)]
len(synthetic)

5727


1846

In [6]:
annotations = annotations.drop_duplicates(keep="last")

In [7]:
original=original.merge(annotations, left_on='hadm_id', right_on='hadm_id')
synthetic=synthetic.merge(annotations, left_on='hadm_id', right_on='hadm_id')

In [8]:
print(len(synthetic))
print(len(original))

1875
1875


In [9]:
original = original.drop(['hadm_id','subject_id','chart.time','cohort'], axis=1)
synthetic = synthetic.drop(['hadm_id','subject_id','chart.time','cohort'], axis=1)

In [10]:
original_val_test=original.sample(frac=0.2, replace=False, random_state=1) # 80% train
excluded = list(original_val_test.index.values) 
original_train = original[~original.index.isin(excluded)]
len(original_train)

1500

In [11]:
synthetic_val_test=synthetic.sample(frac=0.2, replace=False, random_state=1) # 80% train
excluded = list(synthetic_val_test.index.values) 
synthetic_train = synthetic[~synthetic.index.isin(excluded)]
len(synthetic_train)

1500

In [12]:
combined = original_train.append(synthetic_train, ignore_index=True)
original_2x = original_train.append(original_train, ignore_index=True)

Split each dataset into train, test and eval and save to disk:

We need to ensure that the validation and test sets have never been seen before by the training set

In [13]:
model = 'transformer'

ORIGINAL = PHENOTYPE_CLASSIFICATION/model/'original/'
ORIGINAL_2X = PHENOTYPE_CLASSIFICATION/model/'original_2x/'
SYNTHETIC = PHENOTYPE_CLASSIFICATION/model/'synthetic/'
COMBINED = PHENOTYPE_CLASSIFICATION/model/'combined/'

ORIGINAL.mkdir(parents=True,exist_ok=True)
ORIGINAL_2X.mkdir(parents=True,exist_ok=True)
SYNTHETIC.mkdir(parents=True,exist_ok=True)
COMBINED.mkdir(parents=True,exist_ok=True)

In [14]:
original_val_test = original_val_test.reset_index(drop=True)
original_train = original_train.reset_index(drop=True)

split_idx_test = int(0.5 * len(original_val_test)) # 10% val, 10% test

val = original_val_test.loc[:split_idx_test]
test = original_val_test[split_idx_test:]

In [15]:
def split_dataset(train, directory):
    
    print (len(train), len(val), len(test))
    
    train.to_csv(directory/'train.csv', sep=',', index = False)
    val.to_csv(directory/'val.csv', sep=',', index = False)
    test.to_csv(directory/'test.csv', sep=',', index = False)
    
    return

In [16]:
split_dataset(original_train, ORIGINAL)
split_dataset(original_2x, ORIGINAL_2X)
split_dataset(synthetic_train, SYNTHETIC)
split_dataset(combined, COMBINED)

1500 188 188
3000 188 188
1500 188 188
3000 188 188


### Output categories

In [17]:
categories = sorted(synthetic)
categories.remove('text')
print(len(categories))
print(categories)

14
['Advanced.Cancer', 'Advanced.Heart.Disease', 'Advanced.Lung.Disease', 'Alcohol.Abuse', 'Chronic.Neurological.Dystrophies', 'Chronic.Pain.Fibromyalgia', 'Dementia', 'Depression', 'Developmental.Delay.Retardation', 'Non.Adherence', 'Obesity', 'Other.Substance.Abuse', 'Schizophrenia.and.other.Psychiatric.Disorders', 'Unsure']


In [18]:
pd.DataFrame(categories).to_csv(PHENOTYPE_CLASSIFICATION/'labels.csv', sep=',', index = False,header=False)

## EDA

In [26]:
EDA = PHENOTYPE_CLASSIFICATION/model/'original_eda/'

EDA.mkdir(parents=True,exist_ok=True)

In [20]:
from eda import eda

In [21]:
alpha = 0.1
num_aug = 1 # number of augmented sentences per original sentence
aug=[]

for i in range(len(original_train)):
    sentence = original_train['text'][i]
    aug_sentences = eda.eda(sentence, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha, num_aug=num_aug)
    aug.append(aug_sentences)

In [22]:
aug_text = pd.Series((v[0] for v in aug))
original_eda = original_train
original_eda['text']=aug_text

In [24]:
eda = original_train.append(original_eda, ignore_index=True)

In [27]:
split_dataset(eda, EDA)

3000 188 188
