# Preprocess

Get the data into the right format. Retrieve the test dataset created earlier, which is a superset of the phenotype data (`annotations.csv`). Create datasets for the original data, for the synthetic data, and for the two combined. Lastly, also create an `original_2x` dataset that contains the original data twice.

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
# Root data directory and the output directory for the phenotype classification datasets.
DATA = Path('data/')
PHENOTYPE_CLASSIFICATION = DATA/'phenotype_classification/'
# parents=True keeps this consistent with the other mkdir calls in this notebook
# and lets the cell succeed even when intermediate directories are missing.
PHENOTYPE_CLASSIFICATION.mkdir(parents=True, exist_ok=True)

data = pd.read_csv(DATA/'preprocessed/ref_test.tsv', sep='\t') # only test file
data = data[['text','hadm_id']]
annotations = pd.read_csv(DATA/'annotations.csv', sep=',')
with open(DATA/'t2t_experiments/full_context/output/transformer_decoded/tgt-test.001.txt', 'r') as f:
    # One decoded text per line. Strip only the line terminator so that the text
    # does not carry a trailing '\n' into the CSV files; rstrip('\n') (rather than
    # str.splitlines) keeps exactly one entry per line of the file.
    synthetic_data = [line.rstrip('\n') for line in f]
synthetic_data=pd.DataFrame({'text':synthetic_data})

In [3]:
# Rename the first two annotation columns by position. Assign a fresh list of
# labels instead of writing into `columns.values`: mutating the Index's backing
# array in place is not supported by pandas and can leave label lookups stale.
renamed_columns = list(annotations.columns)
renamed_columns[:2] = ['hadm_id', 'subject_id']
annotations.columns = renamed_columns

# Unique admission ids that have a phenotype annotation.
pheno_admissions = list(set(annotations['hadm_id']))
len(pheno_admissions)

1561

In [4]:
# Number of rows in the test split before filtering.
print(len(data))
# Keep only the rows whose admission id appears in the annotations.
has_annotation = data['hadm_id'].isin(pheno_admissions)
original = data.loc[has_annotation]
len(original)

5727


1846

In [5]:
print(len(synthetic_data))
# The decoded file carries no ids of its own, so row i of `synthetic_data` takes
# the admission id of row i of `data` (assumes the decoded file preserves the
# row order of the test file -- TODO confirm).
# Fail loudly on a row-count mismatch: an index-aligned assignment would
# otherwise silently fill NaN ids and shift every label downstream.
if len(synthetic_data) != len(data):
    raise ValueError(
        'synthetic_data has %d rows but data has %d rows; cannot pair them by position'
        % (len(synthetic_data), len(data))
    )
# .values assigns by position, independent of either frame's index.
synthetic_data['hadm_id'] = data['hadm_id'].values
# Keep only the rows whose admission id appears in the annotations.
synthetic = synthetic_data[synthetic_data['hadm_id'].isin(pheno_admissions)]
len(synthetic)

5727


1846

In [6]:
# Drop annotation rows that are exact duplicates across all columns, keeping the
# last occurrence of each.
is_repeated_row = annotations.duplicated(keep="last")
annotations = annotations[~is_repeated_row]

In [7]:
# Attach the annotation columns to each text row via an inner join on the
# admission id (the default join type of DataFrame.merge).
original = pd.merge(original, annotations, how='inner', left_on='hadm_id', right_on='hadm_id')
synthetic = pd.merge(synthetic, annotations, how='inner', left_on='hadm_id', right_on='hadm_id')

In [8]:
# Row counts after the merge: synthetic first, then original, one per line.
print(len(synthetic), len(original), sep='\n')

1875
1875


In [9]:
# Drop the identifier/metadata columns, leaving `text` plus the remaining
# annotation columns.
id_and_meta_columns = ['hadm_id','subject_id','chart.time','cohort']
original = original.drop(columns=id_and_meta_columns)
synthetic = synthetic.drop(columns=id_and_meta_columns)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# with ignore_index=True produces the same stacked frame with a fresh 0..n-1 index.
combined = pd.concat([original, synthetic], ignore_index=True)      # original rows followed by synthetic rows
original_2x = pd.concat([original, original], ignore_index=True)    # original rows repeated twice

Split each dataset into train, validation and test sets and save them to disk:

In [10]:
# One output directory per dataset variant, all under <phenotype dir>/transformer/.
TRANSFORMER = PHENOTYPE_CLASSIFICATION/'transformer'
ORIGINAL = TRANSFORMER/'original'
ORIGINAL_2X = TRANSFORMER/'original_2x'
SYNTHETIC = TRANSFORMER/'synthetic'
COMBINED = TRANSFORMER/'combined'

# Create the directories (and any missing parents); existing ones are left alone.
for dataset_dir in (ORIGINAL, ORIGINAL_2X, SYNTHETIC, COMBINED):
    dataset_dir.mkdir(parents=True, exist_ok=True)

In [11]:
def split_dataset(df, directory):
    """Split ``df`` into train/val/test parts and write each as a CSV file.

    Rows are taken in their existing order (no shuffling): the first 80% go to
    ``train.csv``, the next 10% to ``val.csv`` and the remainder to ``test.csv``.
    The three parts are disjoint and together cover every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to split.
    directory : pathlib.Path
        Directory the three CSV files are written into; it is not created here.

    Returns
    -------
    None
    """
    split_idx_train = int(0.8 * len(df)) # 80% training
    split_idx_test = int(0.9 * len(df)) # 10% val, 10% test

    print(split_idx_train, split_idx_test, len(df))

    # Positional, end-exclusive slicing. Label-based `.loc[a:b]` includes both
    # endpoints, which would put each boundary row into two splits, and it would
    # also misbehave for frames whose index is not 0..n-1.
    train = df.iloc[:split_idx_train]
    val = df.iloc[split_idx_train:split_idx_test]
    test = df.iloc[split_idx_test:]

    train.to_csv(directory/'train.csv', sep=',', index = False)
    val.to_csv(directory/'val.csv', sep=',', index = False)
    test.to_csv(directory/'test.csv', sep=',', index = False)

    return

In [12]:
# Write train/val/test CSVs for every dataset variant, in the same order as before.
datasets_and_dirs = [
    (original, ORIGINAL),
    (original_2x, ORIGINAL_2X),
    (synthetic, SYNTHETIC),
    (combined, COMBINED),
]
for dataset, target_dir in datasets_and_dirs:
    split_dataset(dataset, target_dir)

1500 1687 1875
3000 3375 3750
1500 1687 1875
3000 3375 3750


### Output categories

In [13]:
# Every column of `synthetic` except the input text, in alphabetical order.
categories = sorted(synthetic.columns)
categories.remove('text')
# Show the number of categories, then the list itself, on separate lines.
print(len(categories), categories, sep='\n')

14
['Advanced.Cancer', 'Advanced.Heart.Disease', 'Advanced.Lung.Disease', 'Alcohol.Abuse', 'Chronic.Neurological.Dystrophies', 'Chronic.Pain.Fibromyalgia', 'Dementia', 'Depression', 'Developmental.Delay.Retardation', 'Non.Adherence', 'Obesity', 'Other.Substance.Abuse', 'Schizophrenia.and.other.Psychiatric.Disorders', 'Unsure']


In [14]:
# Persist the category names, one per line, with no header row and no index column.
categories_path = PHENOTYPE_CLASSIFICATION/'categories.csv'
categories_frame = pd.DataFrame(categories)
categories_frame.to_csv(categories_path, sep=',', index=False, header=False)