## Modules + File Paths

In [None]:
import csv
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import pickle

In [None]:
# Folder where the BRATECA V1 files were unzipped.
folder_path = 'D:/Downloads/Results/Stage_6'

# Locations of the individual BRATECA tables inside that folder.
admission_path = f'{folder_path}/B1_Admission.csv'
clinicalNote_path = f'{folder_path}/B1_ClinicalNote.csv'
exam_path = f'{folder_path}/B1_Exam.csv'
prescription_path = f'{folder_path}/B1_Prescription.csv'
prescriptionItem_path = f'{folder_path}/B1_PrescriptionItem.csv'

## ADMISSION

In [None]:
# First pass over the admission table: collect every distinct skin-color
# value (column 5) and sex value (column 6) so they can be one-hot encoded
# when the admission rows are built.
sc_set = set()
s_set = set()

with open(admission_path, encoding="utf-8") as ap:
    reader = csv.reader(ap)
    next(reader, None)  # skip the header row (no-op on an empty file)
    for row in reader:
        sc_set.add(row[5])
        s_set.add(row[6])

# Number the categories in sorted order. Iterating a set of strings directly
# can yield a different order in every interpreter session (string hashing is
# randomized), which would shuffle the one-hot column positions between runs
# and make the saved feature vectors impossible to compare across runs.
sc_columns = sorted(sc_set)
sc_dict = {sc: i for i, sc in enumerate(sc_columns)}

s_columns = sorted(s_set)
s_dict = {s: i for i, s in enumerate(s_columns)}

In [None]:
# Second pass over the admission table: every stay lasting at least one day
# becomes a row of
#   [hospital, admission, age, one-hot skin color..., one-hot sex...,
#    admission date, discharge date, discharge reason]
all_admissions = []
with open(admission_path, encoding="utf-8") as ap:
    reader = csv.reader(ap)
    next(reader, None)  # header row
    for row in reader:
        hospitalID, admissionID = row[0], row[2]
        birth_date = datetime.strptime(row[3], '%d/%m/%Y')
        admission_date = datetime.strptime(row[4], '%d/%m/%Y %H:%M')
        discharge_date = datetime.strptime(row[9], '%d/%m/%Y %H:%M')

        # Stays shorter than 24h are discarded.
        if discharge_date - admission_date < timedelta(days=1):
            continue

        # Age in whole years at admission: subtract one when the birthday
        # has not yet occurred in the admission year; clamp at 90.
        birthday_pending = (
            (admission_date.month, admission_date.day)
            < (birth_date.month, birth_date.day)
        )
        age = min(int(admission_date.year - birth_date.year - birthday_pending), 90)

        # One-hot encodings using the positions assigned in sc_dict / s_dict.
        sc_hot = sc_dict[row[5]]
        skincolor_booleans = [1 if pos == sc_hot else 0 for pos in range(len(sc_dict))]
        s_hot = s_dict[row[6]]
        sex_booleans = [1 if pos == s_hot else 0 for pos in range(len(s_dict))]

        all_admissions.append([
            hospitalID,
            admissionID,
            age,
            *skincolor_booleans,
            *sex_booleans,
            admission_date,
            discharge_date,
            row[10],  # discharge reason
        ])

all_admissions = np.array(all_admissions)
colms = [
    'Hospital ID', 'Admission ID', 'Age',
    *sc_columns,
    *s_columns,
    'Admission Date', 'Discharge Date', 'Discharge Reason',
]
admission_df = pd.DataFrame(all_admissions, columns=colms)

In [None]:
# The 'Sem Informação' ("no information") category is not kept as a feature.
admission_df.drop(columns='Sem Informação', inplace=True)

# Cast Age to int, then min-max scale it.
age_values = admission_df['Age'].astype('int')
youngest, oldest = age_values.min(), age_values.max()
admission_df['Age'] = (age_values - youngest) / (oldest - youngest)

In [None]:
# Drop the reference to the raw admission array; later cells use admission_df.
all_admissions = None

## EXAM

In [None]:
# First pass over the exam table: collect every distinct exam label, i.e. the
# upper-cased exam name (column 3) followed by " - <unit>" when column 6
# holds a unit (an empty string or a single space counts as "no unit").
en_set = set()

with open(exam_path, encoding="utf-8") as ap:
    reader = csv.reader(ap)
    next(reader, None)  # skip the header row (no-op on an empty file)
    for row in reader:
        exam_name = row[3].upper()
        exam_unit = row[6]
        exam_unit = '' if exam_unit in ('', ' ') else ' - ' + exam_unit
        en_set.add(exam_name + exam_unit)

# Number the labels in sorted order. Iterating the set directly can give a
# different order in every interpreter session (string hashing is
# randomized), so the exam columns, and the positions inside the saved
# feature vectors, would not be reproducible between runs.
en_columns = sorted(en_set)
en_dict = {en: i for i, en in enumerate(en_columns)}

In [None]:
# Second pass over the exam table: gather every numeric result per exam label
# (same label format as en_set), then reduce each label to
# [max, min, mean, std].
measurements = {}
with open(exam_path, encoding="utf-8") as ap:
    reader = csv.reader(ap)
    next(reader, None)  # header row
    for row in reader:
        name = row[3].upper()
        value = float(row[5])
        unit = row[6]
        label = name if unit in ('', ' ') else name + ' - ' + unit
        measurements.setdefault(label, []).append(value)

# Only values of existing keys are replaced, so the dict can be updated while
# it is being iterated.
for label, readings in measurements.items():
    arr = np.array(readings)
    measurements[label] = [arr.max(), arr.min(), arr.mean(), arr.std()]

In [None]:
# Third pass over the exam table: one output row per exam result, holding the
# min-max scaled value in that exam's column and 0 in every other exam column.
all_exams_minmax = []
with open(exam_path, encoding="utf-8") as ap:
    reader = csv.reader(ap)
    next(reader, None)  # header row
    for row in reader:
        hospitalID, admissionID = row[0], row[2]
        name = row[3].upper()
        unit = row[6]
        f_exam = name if unit in ('', ' ') else name + ' - ' + unit

        # measurements[f_exam] is [max, min, mean, std] for this exam label.
        exam_results = np.array(measurements[f_exam])
        highest, lowest = exam_results[0], exam_results[1]
        value = float(row[5])
        spread = highest - lowest
        if spread == 0:
            # All recorded values of this exam are equal; use 1 by convention.
            value_minmax = 1
        else:
            value_minmax = (value - lowest) / spread

        exam_booleans_minmax = [0] * len(en_set)
        exam_booleans_minmax[en_dict[f_exam]] = value_minmax

        examDate = datetime.strptime(row[4], '%Y-%m-%d %H:%M:%S.%f')

        all_exams_minmax.append([hospitalID, admissionID, examDate] + exam_booleans_minmax)

colms = ['Hospital ID', 'Admission ID', 'Exam Date', *en_columns]

In [None]:
# Convert the accumulated exam rows to an array and wrap it in a DataFrame
# whose columns are the identifiers, the exam date, and one column per exam.
all_exams_minmax = np.array(all_exams_minmax)
exams_minmax_df = pd.DataFrame(all_exams_minmax, columns = colms)

In [None]:
# Attach every exam row to its admission (left join, so admissions without
# exams are kept) and save the combined table to disk.
adm_exam_df = admission_df.merge(
    exams_minmax_df,
    how='left',
    on=['Hospital ID', 'Admission ID'],
)
adm_exam_df.to_pickle(f'{folder_path}/adm_exams_df.pkl')

In [None]:
# Drop the references to the exam array and frame; the merged adm_exam_df is
# what the following cells use.
all_exams_minmax = None
exams_minmax_df = None

In [None]:
# Distinct admission IDs, plus the merged table grouped by admission so each
# admission's rows can be fetched with get_group().
id_list = adm_exam_df['Admission ID'].unique()
patient_list_by_id = adm_exam_df.groupby(adm_exam_df['Admission ID'])

In [None]:
# Build one feature vector per admission from the exam rows dated less than
# one day after admission, together with the two labels:
#   exam_dict[admission_id] = (features, length_of_stay_label, mortality_label)
exam_dict = dict()
non_feature_columns = ['Admission ID', 'Hospital ID', 'Admission Date',
                       'Discharge Date', 'Discharge Reason', 'Exam Date']
for patient_id in id_list:
    curr_patient = patient_list_by_id.get_group(patient_id)

    # Only an upper bound is applied, so rows dated before the admission
    # timestamp are kept as well.
    in_first_day = (curr_patient['Exam Date'] - curr_patient['Admission Date']) < timedelta(days=1)
    curr_patient = curr_patient.loc[in_first_day]
    if curr_patient.empty:
        continue

    # Collapse the free-text discharge reason into 'Alta' / 'Obito' / 'Other'.
    discharge_reason = curr_patient['Discharge Reason'].unique()[0].lower()
    if 'alta' in discharge_reason:
        mortality_label = 'Alta'
    elif 'obito' in discharge_reason or 'óbito' in discharge_reason:
        mortality_label = 'Obito'
    else:
        mortality_label = 'Other'

    # True when the stay lasted more than 7 days.
    stay_length = curr_patient['Discharge Date'].unique()[0] - curr_patient['Admission Date'].unique()[0]
    length_of_stay_label = stay_length > np.timedelta64(7, 'D')

    # Non-inplace drop: curr_patient was derived by filtering the grouped
    # frame, and mutating such a derived frame in place is the pattern pandas
    # warns about (SettingWithCopyWarning).
    curr_patient = curr_patient.drop(columns=non_feature_columns)

    # For every remaining column keep the last non-zero value seen across
    # this admission's rows; columns that are zero everywhere stay 0.
    features = [0] * len(curr_patient.columns)
    for row_values in curr_patient.to_numpy().tolist():
        for i, cell in enumerate(row_values):
            if cell == 0:
                continue
            features[i] = cell
    exam_dict[patient_id] = (features, length_of_stay_label, mortality_label)

In [None]:
# Save the per-admission exam features and labels to disk.
exam_pickle_path = f'{folder_path}/exam_feature_dict.pkl'
with open(exam_pickle_path, 'wb') as exam_file:
    pickle.dump(exam_dict, exam_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Drop the references to the exam-stage objects now that exam_dict has been
# written to disk.
adm_exam_df = None
patient_list_by_id = None
id_list = None
exam_dict = None

## CLINICAL NOTES

In [None]:
# Load every clinical note as [hospital, admission, note timestamp, text].
# The file is opened with newline='' as the csv module requires; without it,
# line breaks embedded in quoted fields (likely in free-text notes) are not
# interpreted correctly.
all_notes = []
with open(clinicalNote_path, encoding="utf-8", newline='') as ap:
    reader = csv.reader(ap)
    next(reader, None)  # skip the header row (no-op on an empty file)
    for row in reader:
        hospitalID = row[0]
        admissionID = row[2]
        note_date = datetime.strptime(row[3], '%Y-%m-%d %H:%M:%S.%f')
        text = row[4]
        all_notes.append([hospitalID, admissionID, note_date, text])

all_notes = np.array(all_notes)
colms = ['Hospital ID', 'Admission ID', 'Note Date', 'Note Text']
clinicalNote_df = pd.DataFrame(all_notes, columns=colms)

In [None]:
# Attach every note to its admission (left join), then remove the hospital
# identifier and the demographic feature columns, which are not used for the
# note data.
adm_note_df = admission_df.merge(
    clinicalNote_df,
    how='left',
    on=['Hospital ID', 'Admission ID'],
).drop(columns=['Hospital ID', 'Age', 'Branca', 'Negra', 'Parda', 'Amarela', 'F', 'M'])

In [None]:
# Drop the references to the admission frame and the raw note array; the
# merged adm_note_df is what the following cells use.
admission_df = None
all_notes = None

In [None]:
# Distinct admission IDs, plus the merged note table grouped by admission so
# each admission's rows can be fetched with get_group().
id_list = adm_note_df['Admission ID'].unique()
patient_list_by_id = adm_note_df.groupby(adm_note_df['Admission ID'])

In [None]:
# Collect, per admission, the notes dated less than one day after admission,
# together with the two labels:
#   note_dict[admission_id] = (notes, length_of_stay_label, mortality_label)
note_dict = dict()
non_note_columns = ['Admission ID', 'Admission Date', 'Discharge Date',
                    'Discharge Reason', 'Note Date']
for patient_id in id_list:
    curr_patient = patient_list_by_id.get_group(patient_id)

    # Only an upper bound is applied, so rows dated before the admission
    # timestamp are kept as well.
    in_first_day = (curr_patient['Note Date'] - curr_patient['Admission Date']) < timedelta(days=1)
    curr_patient = curr_patient.loc[in_first_day]
    if curr_patient.empty:
        continue

    # Collapse the free-text discharge reason into 'Alta' / 'Obito' / 'Other'.
    discharge_reason = curr_patient['Discharge Reason'].unique()[0].lower()
    if 'alta' in discharge_reason:
        mortality_label = 'Alta'
    elif 'obito' in discharge_reason or 'óbito' in discharge_reason:
        mortality_label = 'Obito'
    else:
        mortality_label = 'Other'

    # True when the stay lasted more than 7 days.
    stay_length = curr_patient['Discharge Date'].unique()[0] - curr_patient['Admission Date'].unique()[0]
    length_of_stay_label = stay_length > np.timedelta64(7, 'D')

    # Non-inplace drop: curr_patient was derived by filtering the grouped
    # frame, and mutating such a derived frame in place is the pattern pandas
    # warns about (SettingWithCopyWarning).
    curr_patient = curr_patient.drop(columns=non_note_columns)

    # One inner list per remaining row, in the frame's row order.
    notes = curr_patient.to_numpy().tolist()
    note_dict[patient_id] = (notes, length_of_stay_label, mortality_label)

In [None]:
# Save the per-admission notes and labels to disk.
note_pickle_path = f'{folder_path}/note_dict.pkl'
with open(note_pickle_path, 'wb') as note_file:
    pickle.dump(note_dict, note_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Drop the references to the note-stage objects now that note_dict has been
# written to disk.
note_dict = None
patient_list_by_id = None
id_list = None