# Preprocess

This notebook will create the necessary input files before we can start training our model

In [1]:
import pandas as pd
import numpy as np
import torch
import os
import psycopg2
from sqlalchemy import create_engine 
import string
import spacy
import re
from datetime import date, datetime, timedelta
import random
from sklearn.model_selection import GroupShuffleSplit, StratifiedShuffleSplit

gpu_access = torch.cuda.is_available()

In [2]:
# show the csv files that correspond to tables in the database

!ls /mimic/data/

ADMISSIONS.csv	     D_ICD_PROCEDURES.csv    PATIENTS.csv
CALLOUT.csv	     D_ITEMS.csv	     PRESCRIPTIONS.csv
CAREGIVERS.csv	     D_LABITEMS.csv	     PROCEDUREEVENTS_MV.csv
CHARTEVENTS.csv      ICUSTAYS.csv	     PROCEDURES_ICD.csv
CPTEVENTS.csv	     INPUTEVENTS_CV.csv      SERVICES.csv
DATETIMEEVENTS.csv   INPUTEVENTS_MV.csv      TRANSFERS.csv
DIAGNOSES_ICD.csv    LABEVENTS.csv	     preprocessed
DRGCODES.csv	     MICROBIOLOGYEVENTS.csv  robots.txt.tmp
D_CPT.csv	     NOTEEVENTS.csv
D_ICD_DIAGNOSES.csv  OUTPUTEVENTS.csv


In [3]:
# connect to the mimic database and set the search path to the 'mimiciii' schema

dbschema='mimiciii'
cnx = create_engine('postgresql+psycopg2://aa5118:mimic@localhost:5432/mimic',
                    connect_args={'options': '-csearch_path={}'.format(dbschema)})


In [4]:
# breakdown of note categories showing the number of notes, average number of characters
# and the number of notes in each category where the note time was provided

df_summary = pd.read_sql_query('''
  SELECT 
      category,
      COUNT(category),
      ROUND(AVG(LENGTH(text)),-1) AS text_avg_chars,
      COUNT(charttime) AS time_provided
  FROM noteevents
  GROUP BY category
''', cnx)
df_summary

Unnamed: 0,category,count,text_avg_chars,time_provided
0,Case Management,967,1120.0,967
1,Consult,98,6040.0,98
2,Discharge summary,59652,9620.0,0
3,ECG,209051,210.0,0
4,Echo,45794,2320.0,0
5,General,8301,1560.0,8260
6,Nursing,223556,1790.0,222172
7,Nursing/other,822497,800.0,822497
8,Nutrition,9418,2430.0,9411
9,Pharmacy,103,2580.0,102


In [5]:
# confirming that the dataframe output should have 0 rows

blah = "'Discharge summary'"
df_temp = pd.read_sql_query('''
  SELECT charttime FROM noteevents WHERE category = ''' + blah + ''' AND charttime IS NOT NULL
''', cnx)
df_temp

Unnamed: 0,charttime


In [6]:
# main dataframe - join 'patients' to 'noteevents' and only look at adults (>=15yo)

df_main = pd.read_sql_query('''
  SELECT
      p.subject_id, p.dob, p.gender,
      n.category, n.chartdate, n.row_id, n.charttime,
      ROUND((cast(chartdate as date) - cast(dob as date)) / 365.242,0)
          AS age_at_noteevent,
      n.text
  FROM patients p 
  INNER JOIN noteevents n 
  ON p.subject_id = n.subject_id
  WHERE ROUND((cast(chartdate as date) - cast(dob as date)) / 365.242,0) > 14
  ORDER BY subject_id
  --LIMIT 10000;
''', cnx)
print(df_main.shape)
df_main.head()

(1657776, 9)


Unnamed: 0,subject_id,dob,gender,category,chartdate,row_id,charttime,age_at_noteevent,text
0,3,2025-04-11,M,Radiology,2101-10-15,768442,2101-10-15 13:59:00,77.0,[**2101-10-15**] 1:59 PM\n CHEST (PORTABLE AP)...
1,3,2025-04-11,M,Radiology,2101-10-06,767724,2101-10-06 18:02:00,76.0,[**2101-10-6**] 6:02 PM\n CHEST (PORTABLE AP) ...
2,3,2025-04-11,M,Nursing/other,2101-10-22,1260688,2101-10-22 04:36:00,77.0,Resp. Care Note\nPt intubated and vented on se...
3,3,2025-04-11,M,Nursing/other,2101-10-24,1260696,2101-10-24 05:53:00,77.0,MICU NPN 7PM-7AM:\nNeuro: Pt is sleeping most ...
4,3,2025-04-11,M,Nursing/other,2101-10-21,1260685,2101-10-21 14:27:00,77.0,MICU NSG PROG NOTE: days\nRemains stable on hi...


In [7]:
# function to preprocess the text from the 'noteevents' table and tokenise using the spaCy tokenizer

nlp = spacy.load('en')

counter = 0
def tokenise_text(text):
    global counter
    
    text = re.sub(r'([0-9])-([0-9][0-9]?)-([0-9])',r'\1/\2/\3',text)
    text = text.replace("[**","[").replace("**]","]")
    
    #text = text.lower()
    tokens = nlp.tokenizer(text)
    tokenised_text = ""
    
    for token in tokens:
        tokenised_text = tokenised_text + str(token) + " "
    
    tokenised_text = tokenised_text.replace("\n"," <par> ").replace("\r"," <par> ")
    tokenised_text = ' '.join(tokenised_text.split())
    
    counter += 1
    if (counter % 10000) == 0:
        print (counter)
    
    return tokenised_text

In [8]:
# apply tokenising function

df_main["text"] = df_main["text"].apply(tokenise_text)
df_main.head()

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000
560000
570000
580000
590000
600000
610000
620000
630000
640000
650000
660000
670000
680000
690000
700000
710000
720000
730000
740000
750000
760000
770000
780000
790000
800000
810000
820000
830000
840000
850000
860000
870000
880000
890000
900000
910000
920000
930000
940000
950000
960000
970000
980000
990000
1000000
1010000
1020000
1030000
1040000
1050000
1060000
1070000
1080000
1090000
1100000
1110000
1120000
1130000
1140000
1150000
1160000
1170000
1180000
1190000
1200000
1210000
1220000
1230000
1240000
1250000
1260000
1270000
1280000
1290000
1300000
1310000
1320000
1330000
1340000
1350000
1360000
1370000
1380000
1390

Unnamed: 0,subject_id,dob,gender,category,chartdate,row_id,charttime,age_at_noteevent,text
0,3,2025-04-11,M,Radiology,2101-10-15,768442,2101-10-15 13:59:00,77.0,[ 2101/10/15 ] 1:59 PM <par> CHEST ( PORTABLE ...
1,3,2025-04-11,M,Radiology,2101-10-06,767724,2101-10-06 18:02:00,76.0,[ 2101/10/6 ] 6:02 PM <par> CHEST ( PORTABLE A...
2,3,2025-04-11,M,Nursing/other,2101-10-22,1260688,2101-10-22 04:36:00,77.0,Resp . Care Note <par> Pt intubated and vented...
3,3,2025-04-11,M,Nursing/other,2101-10-24,1260696,2101-10-24 05:53:00,77.0,MICU NPN 7PM-7AM : <par> Neuro : Pt is sleepin...
4,3,2025-04-11,M,Nursing/other,2101-10-21,1260685,2101-10-21 14:27:00,77.0,MICU NSG PROG NOTE : days <par> Remains stable...


In [None]:
# use the first n tokens of the text as a hint
counter = 0
def produce_hint(text):
    global counter
    l = text.split()
    counter += 1
    if (counter % 10000) == 0:
        print (counter)
    return ' '.join(l[:10]) # first 10 tokens

df_main['hint'] = df_main['text'].map(lambda x: produce_hint(x))
print(df_main.shape)
df_main.head()

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000
560000
570000
580000
590000
600000
610000
620000
630000
640000
650000
660000
670000
680000
690000
700000
710000
720000
730000
740000
750000
760000
770000
780000
790000
800000
810000
820000
830000
840000
850000
860000
870000
880000
890000
900000
910000
920000
930000
940000
950000
960000
970000
980000
990000
1000000
1010000
1020000
1030000
1040000
1050000
1060000
1070000
1080000
1090000
1100000
1110000
1120000
1130000
1140000
1150000
1160000
1170000
1180000
1190000
1200000
1210000
1220000
1230000
1240000
1250000
1260000
1270000
1280000
1290000
1300000
1310000
1320000
1330000
1340000
1350000
1360000
1370000
1380000
1390

Unnamed: 0,subject_id,dob,gender,category,chartdate,row_id,charttime,age_at_noteevent,text,hint
0,3,2025-04-11,M,Radiology,2101-10-15,768442,2101-10-15 13:59:00,77.0,[ 2101/10/15 ] 1:59 PM <par> CHEST ( PORTABLE ...,[ 2101/10/15 ] 1:59 PM <par> CHEST ( PORTABLE AP
1,3,2025-04-11,M,Radiology,2101-10-06,767724,2101-10-06 18:02:00,76.0,[ 2101/10/6 ] 6:02 PM <par> CHEST ( PORTABLE A...,[ 2101/10/6 ] 6:02 PM <par> CHEST ( PORTABLE AP
2,3,2025-04-11,M,Nursing/other,2101-10-22,1260688,2101-10-22 04:36:00,77.0,Resp . Care Note <par> Pt intubated and vented...,Resp . Care Note <par> Pt intubated and vented on
3,3,2025-04-11,M,Nursing/other,2101-10-24,1260696,2101-10-24 05:53:00,77.0,MICU NPN 7PM-7AM : <par> Neuro : Pt is sleepin...,MICU NPN 7PM-7AM : <par> Neuro : Pt is sleeping
4,3,2025-04-11,M,Nursing/other,2101-10-21,1260685,2101-10-21 14:27:00,77.0,MICU NSG PROG NOTE : days <par> Remains stable...,MICU NSG PROG NOTE : days <par> Remains stable on


In [None]:
# patients above 89 years of age had their dob modified to be 300 years old at time of first event for privacy reasons
# change their age to instead be 90

df_main.loc[df_main['age_at_noteevent'] > 200, 'age_at_noteevent'] = 90

In [None]:
# Split the dataset in a grouped and stratified manner

def StratifiedGroupShuffleSplit(df_main):

    df_main = df_main.reindex(np.random.permutation(df_main.index)) # shuffle dataset

    # create empty train, val and test datasets
    df_train = pd.DataFrame()
    df_val = pd.DataFrame()
    df_test = pd.DataFrame()

    hparam_mse_wgt = 0.1 # must be between 0 and 1
    assert(0 <= hparam_mse_wgt <= 1)
    train_proportion = 0.8 # must be between 0 and 1
    assert(0 <= train_proportion <= 1)
    val_test_proportion = (1-train_proportion)/2

    #subject_grouped_df = pd.concat([g for _, g in df_main.groupby(['subject_id'], sort=False, as_index=False)], ignore_index=True)
    subject_grouped_df_main = df_main.groupby(['subject_id'], sort=False, as_index=False)
    category_grouped_df_main = df_main.groupby('category').count()[['subject_id']]/len(df_main)*100

    def calc_mse_loss(df):
        grouped_df = df.groupby('category').count()[['subject_id']]/len(df)*100
        df_temp = category_grouped_df_main.join(grouped_df, on = 'category', how = 'left', lsuffix = '_main')
        df_temp.fillna(0, inplace=True)
        df_temp['diff'] = (df_temp['subject_id_main'] - df_temp['subject_id'])**2
        mse_loss = np.mean(df_temp['diff'])
        return mse_loss
    
    len_train = 0
    len_val = 0
    len_test = 0
    total_records = 0
    i = 0
    for _, group in subject_grouped_df_main:
        
        total_records = len_train + len_val + len_test
        g = pd.DataFrame(group)
        i += 1
        
        if (i < 4):
            if (i == 1):
                df_train = df_train.append(g, ignore_index=True)
                len_train += len(g)
                continue
            elif (i == 2):
                df_val = df_val.append(g, ignore_index=True)
                len_val += len(g)
                continue
            else:
                df_test = df_test.append(g, ignore_index=True)
                len_test += len(g)
                continue
        
        if (i % 500 != 0):
            
            if (train_proportion > (len_train/total_records)):
                df_train = df_train.append(g, ignore_index=True)
                len_train += len(g)
            elif (val_test_proportion > (len_val/total_records)):
                df_val = df_val.append(g, ignore_index=True)
                len_val += len(g)
            else:
                df_test = df_test.append(g, ignore_index=True)
                len_test += len(g)
        else :
            
            mse_loss_diff_train = calc_mse_loss(df_train) - calc_mse_loss(df_train.append(g, ignore_index=True))
            mse_loss_diff_val = calc_mse_loss(df_val) - calc_mse_loss(df_val.append(g, ignore_index=True))
            mse_loss_diff_test = calc_mse_loss(df_test) - calc_mse_loss(df_test.append(g, ignore_index=True))

            len_diff_train = (train_proportion - (len_train/total_records))
            len_diff_val = (val_test_proportion - (len_val/total_records))
            len_diff_test = (val_test_proportion - (len_test/total_records)) 

            len_loss_diff_train = len_diff_train * abs(len_diff_train)
            len_loss_diff_val = len_diff_val * abs(len_diff_val)
            len_loss_diff_test = len_diff_test * abs(len_diff_test)

            loss_train = (hparam_mse_wgt * mse_loss_diff_train) + ((1-hparam_mse_wgt) * len_loss_diff_train)
            loss_val = (hparam_mse_wgt * mse_loss_diff_val) + ((1-hparam_mse_wgt) * len_loss_diff_val)
            loss_test = (hparam_mse_wgt * mse_loss_diff_test) + ((1-hparam_mse_wgt) * len_loss_diff_test)

            if (max(loss_train,loss_val,loss_test) == loss_train):
                df_train = df_train.append(g, ignore_index=True)
                len_train += len(g)
            elif (max(loss_train,loss_val,loss_test) == loss_val):
                df_val = df_val.append(g, ignore_index=True)
                len_val += len(g)
            else:
                df_test = df_test.append(g, ignore_index=True)
                len_test += len(g)
            
            print ("Group " + str(i) + ". loss_train: " + str(loss_train) + " | " + "loss_val: " + str(loss_val) + " | " + "loss_test: " + str(loss_test) + " | ")
        
        if (i % 100 == 0 & i %1000 != 0):
            print ("Group " + str(i))
            
    return df_train, df_val, df_test

src_train, src_val, src_test = StratifiedGroupShuffleSplit(df_main)

Group 500. loss_train: 0.027640069201780113 | loss_val: 0.13381721657474313 | loss_test: 0.12445111385619839 | 
Group 1000. loss_train: 0.01350923114040861 | loss_val: 0.03312332620990509 | loss_test: 0.03240166600973489 | 
Group 1500. loss_train: -0.0044589494895938 | loss_val: -0.014220263683998952 | loss_test: -0.011229096139819834 | 
Group 2000. loss_train: -0.005815651045035814 | loss_val: -0.018237003050632736 | loss_test: -0.01429010167491157 | 
Group 2500. loss_train: 0.0035240653813290476 | loss_val: 0.014032928210995377 | loss_test: 0.007416240565002041 | 
Group 3000. loss_train: 0.0007004353970999297 | loss_val: 0.004157681908064318 | loss_test: 0.0014213532946655313 | 
Group 3500. loss_train: 0.0009494846828195252 | loss_val: 0.0034881720080541757 | loss_test: 0.0027326355436606423 | 
Group 4000. loss_train: -0.005159161275861175 | loss_val: -0.0177335259190913 | loss_test: -0.014698573124395974 | 
Group 4500. loss_train: -0.00015324673093914943 | loss_val: -0.0011632706410

Group 35000. loss_train: 4.6851877857046184e-06 | loss_val: 2.5629172431410382e-06 | loss_test: 8.32291462917972e-06 | 
Group 35500. loss_train: 1.7907517698753272e-06 | loss_val: 1.8877702969357175e-07 | loss_test: 2.718261302378301e-06 | 
Group 36000. loss_train: 1.6879942892338672e-06 | loss_val: 6.671843399417533e-07 | loss_test: 1.5107210355696891e-06 | 
Group 36500. loss_train: 1.0894659295898228e-06 | loss_val: 9.173750279869276e-06 | loss_test: -5.497850186682421e-06 | 
Group 37000. loss_train: 1.1116816909939247e-06 | loss_val: -2.4944049858227417e-06 | loss_test: 6.837285388109242e-07 | 
Group 37500. loss_train: 1.7696305908256723e-07 | loss_val: 8.073800597345831e-06 | loss_test: -6.053958701800663e-06 | 
Group 38000. loss_train: -9.8498614722493e-08 | loss_val: 4.402115527340653e-06 | loss_test: -2.705834503785823e-06 | 


In [None]:
df = src_test #  change to src_train/src_test/src_val to inspect length and stratification
print (len(df))
category_grouped_df_main = df_main.groupby('category').count()[['subject_id']]/len(df_main)*100
grouped_df = df.groupby('category').count()[['subject_id']]/len(df)*100
df_temp = category_grouped_df_main.join(grouped_df, on = 'category', how = 'left', lsuffix = '_main')
df_temp.fillna(0, inplace=True)
df_temp

331556


Unnamed: 0_level_0,subject_id_main,subject_id
category,Unnamed: 1_level_1,Unnamed: 2_level_1
Case Management,0.058331,0.049765
Consult,0.005912,0.002714
Discharge summary,3.342068,3.346644
ECG,12.571904,12.755613
Echo,2.689748,2.71749
General,0.500731,0.475636
Nursing,13.485296,13.090398
Nursing/other,25.218365,25.725368
Nutrition,0.568111,0.54772
Pharmacy,0.006213,0.004223


In [None]:
tgt_train = pd.DataFrame(src_train, columns = ["text"])
tgt_val = pd.DataFrame(src_val, columns = ["text"])
tgt_test = pd.DataFrame(src_test, columns = ["text"])
print(tgt_test.shape)
tgt_test.head()

(331556, 1)


Unnamed: 0,text
0,"Shock , cardiogenic <par> Assessment : <par> R..."
1,This is a 57 yr old female who was admitted to...
2,This is a 57 yr old female transfered from [ H...
3,TITLE : <par> Chief Complaint : <par> 24 Hour ...
4,This is a 57 yr old female who was admitted to...


# Old code for splitting dataset
----------------------------------------------------------------------------------------

## Split the dataframe into Training, Validation and Test datasets

### Separate data columns into x (input features) and y (output)
src = df_main.to_numpy()
tgt = df_main['text'].to_numpy()

split_idx_train = int(0.8 * len(df_main)) # 80% training
split_idx_val = int(0.9 * len(df_main)) # 10% eval, 10% test

### Split data by rows into a training set and a validation set
src_train = pd.DataFrame(src[:split_idx_train,:], columns=df_main.columns)
src_val = pd.DataFrame(src[split_idx_train:split_idx_val,:], columns=df_main.columns)
src_test = pd.DataFrame(src[split_idx_val:,:], columns=df_main.columns)

tgt_train = pd.DataFrame(tgt[:split_idx_train], columns = ["text"])
tgt_val = pd.DataFrame(tgt[split_idx_train:split_idx_val], columns = ["text"])
tgt_test = pd.DataFrame(tgt[split_idx_val:], columns = ["text"])

In [None]:
# lab items data

df_labitems = pd.read_sql_query('''
  SELECT l.subject_id, l.charttime, l.value, l.valueuom, l.flag, d.label
  FROM labevents l
  INNER JOIN d_labitems d 
  USING (itemid)
  --LIMIT 20;
''', cnx)
print(df_labitems.shape)
df_labitems.head()

(27854055, 6)


Unnamed: 0,subject_id,charttime,value,valueuom,flag,label
0,3,2101-10-12 16:07:00,7.39,units,,pH
1,3,2101-10-12 18:17:00,ART,,,SPECIMEN TYPE
2,3,2101-10-12 18:17:00,-1,mEq/L,,Base Excess
3,3,2101-10-12 18:17:00,22,mEq/L,,Calculated Total CO2
4,3,2101-10-12 18:17:00,0.93,mmol/L,abnormal,Free Calcium


In [None]:
# prescriptions data

df_prescriptions = pd.read_sql_query('''
  SELECT subject_id, startdate, enddate, drug, prod_strength
  FROM prescriptions
  --LIMIT 20;
''', cnx)
print(df_prescriptions.shape)
df_prescriptions.head()

(4156450, 5)


Unnamed: 0,subject_id,startdate,enddate,drug,prod_strength
0,6,2175-06-11,2175-06-12,Tacrolimus,1mg Capsule
1,6,2175-06-11,2175-06-12,Warfarin,5mg Tablet
2,6,2175-06-11,2175-06-12,Heparin Sodium,"25,000 unit Premix Bag"
3,6,2175-06-11,2175-06-12,D5W,HEPARIN BASE
4,6,2175-06-11,2175-06-12,Furosemide,20mg Tablet


In [None]:
def create_file(df, filename):

    f= open(filename,"w+")
    length = len(df)
    for i, row in enumerate(df.itertuples()):
        subject_id = row[1]
        charttime = row[7]
        chartdate = datetime.combine(row[5], datetime.min.time())
        category = str(row[4])

        if (pd.isna(charttime)):
            if (category == "Discharge summary"):
                cutoff = chartdate
                chartdate = cutoff + timedelta(days=1)
            else:
                cutoff = chartdate - timedelta(days=1)

            lab_items = df_labitems[(df_labitems.subject_id == subject_id) & 
                                    (df_labitems.charttime >= cutoff) &
                                    (df_labitems.charttime < chartdate)]

        else:
            cutoff = charttime - timedelta(days=1)
            lab_items = df_labitems[(df_labitems.subject_id == subject_id) & 
                                    (df_labitems.charttime >= cutoff) &
                                    (df_labitems.charttime < charttime)]

        prescriptions = df_prescriptions[(df_prescriptions.subject_id == subject_id) & 
                                        (df_prescriptions.startdate >= cutoff) &
                                        (df_prescriptions.startdate < chartdate)]

        lab_items_list = ""
        lab_items_length = len(lab_items)
        if (lab_items_length > 0):
            for j, lab_row in enumerate(lab_items.itertuples()):
                flag = ""
                if (pd.isna(lab_row[5]) == False):
                    flag = " , " + str(lab_row[5])

                lab_items_list += str(lab_row[6]) + " , " + str(lab_row[3]) + " , " + str(lab_row[4]) + flag
                if (j != (lab_items_length - 1)):
                    lab_items_list += " | "

        prescriptions_list = ""
        prescriptions_length = len(prescriptions)
        if (prescriptions_length > 0):
            for j, pre_row in enumerate(prescriptions.itertuples()):
                prescriptions_list += str(pre_row[4]) + " , " + str(pre_row[5])
                if (j != (prescriptions_length - 1)):
                    prescriptions_list += " | "

        f.write(str(row[10]) + " <H> " + str(row[4]) + " <T> " + str(row[3]) + " <G> " + str(row[8]) + " <A> " + 
                prescriptions_list + " <0> " + lab_items_list + " <1>" + "\n")

        if ((i+1) % 10000 == 0):
            print ("{0:.0f}%".format((i+1)*100/length))

    f.close()

In [None]:
# save source files to disk

create_file(src_train, "/mimic/data/preprocessed/src-train.txt")
create_file(src_val, "/mimic/data/preprocessed/src-val.txt")
create_file(src_test, "/mimic/data/preprocessed/src-test.txt")

0%
0%
0%
0%
1%
1%
1%
1%
1%
1%
1%
1%
1%
1%
2%
2%
2%
2%
2%
2%
2%
2%
2%
2%
3%
3%
3%
3%
3%
3%
3%
3%
3%
3%
4%
4%
4%
4%
4%
4%
4%
4%
4%
4%
5%
5%
5%
5%
5%
5%
5%
5%
5%
5%
6%
6%
6%
6%
6%
6%
6%
6%
6%
7%
7%
7%
7%
7%
7%
7%
7%
7%
7%
8%
8%
8%
8%
8%
8%
8%
8%
8%
8%
9%
9%
9%
9%
9%
9%
9%
9%
9%
9%
10%
10%
10%
10%
10%
10%
10%
10%
10%
10%
11%
11%
11%
11%
11%
11%
11%
11%
11%
11%
12%
12%
12%
12%
12%
12%
12%
12%
12%
12%
13%
13%
13%
13%
13%
13%
13%
13%
13%
13%
14%
14%
14%
14%
14%
14%
14%
14%
14%
14%
15%
15%
15%
15%
15%
15%
15%
15%
15%
15%
16%
16%
16%
16%
16%
16%
16%
16%
16%
16%
17%
17%
17%
17%
17%
17%
17%
17%
17%
17%
18%
18%
18%
18%
18%
18%
18%
18%
18%
18%
19%
19%
19%
19%
19%
19%
19%
19%
19%
20%
20%
20%
20%
20%
20%
20%
20%
20%
20%
21%
21%
21%
21%
21%
21%
21%
21%
21%
21%
22%
22%
22%
22%
22%
22%
22%
22%
22%
22%
23%
23%
23%
23%
23%
23%
23%
23%
23%
23%
24%
24%
24%
24%
24%
24%
24%
24%
24%
24%
25%
25%
25%
25%
25%
25%
25%
25%
25%
25%
26%
26%
26%
26%
26%
26%
26%
26%
26%
26%
27%
27%
27%
27%
27%
27%
27%
27%
27%
28%
28%
2

In [None]:
# save target files to disk

np.savetxt('/mimic/data/preprocessed/tgt-train.txt', tgt_train, fmt='%s', newline=os.linesep)
np.savetxt('/mimic/data/preprocessed/tgt-val.txt', tgt_val, fmt='%s', newline=os.linesep)
np.savetxt('/mimic/data/preprocessed/tgt-test.txt', tgt_test, fmt='%s', newline=os.linesep)