# Preparation

In [42]:
# Necessary packages
import numpy as np
import pandas as pd
import sys
import os
import pickle

from mimic4_preprocess_util import *
from preprocess_util import *

In [39]:
import importlib

In [47]:
import preprocess_util
importlib.reload(preprocess_util)
from preprocess_util import *

In [3]:
day_cutoff = 14  # Outcome threshold
Org = 'MRSA' # Target organism

In [4]:
# Required paths
# Base paths
path_folder = "/data/mnigo/MDR_projects/MRSA"
path_clean2 = "/data/mnigo/MDR_projects/MRSA/MIMIC/clean_data_MRSA_v2.1/clean_mapped_data_" + Org + "/" +  str(day_cutoff)+'days/'
path_clean3 = path_clean2
MMH_path = '/data/mnigo/MDR_projects/MRSA/clean_data_clean_diag_simple_location_ISOLMRSA/14days/'
# Original Data
path_MRSA = path_folder + "/pytorch_ehr/Pytorch_EHR_Tutorial/Data_Prep/data/MIMIC/MRSA data/"
# Data Preparation
prefix_date = '20220618_'
label = 'label.csv'
remove_list_file = 'remove_less_18.csv'
verbose = True

# Path for PytorchEHR
sys.path.insert(0, path_folder + "/pytorch_ehr/Pytorch_EHR_Tutorial/Data_Prep/")
# The typeFile
typeFile = os.path.join(MMH_path, 'Mimic_PT_mortality_dp_v1.types')

In [14]:
# Dictionary to change the column names to fit pytoruch_ehr
d_demographic = {'Study_ID_Combine': 'subject_id', 'Study_FIN_Combine': 'hadm_id', 'M_Admit Date & Time': 'admittime',
                'M_Discharge Date & Time': 'dischtime', 'Person Location- Building (Admit)':'admit_building' ,
                'Person Location- Facility (Admit)':'admit_facility', 'Person Location- Nurse Unit (Admit)':'admit_nurse_unit',
                'Age- Years (Visit)': 'age', 'Ethnic Group-Curr': 'ethnicity', 'Language-Curr': 'language',
                 'M_Deceased Date & Time-Curr': 'deathdate','Race-Curr': 'race', 'Sex-Curr': 'gender',
                'ICD10 Diagnosis Code': 'icd_code', 'ICD10 Diagnosis Description': 'icd10_des',
                 'M_Diagnosis Date & Time': 'chartdate', 'M_Check In Date/Time': 'chartdate', 'PostOp Diagnosis': 'postopdx',
                'Scheduled Anesthesia Type': 'anethtype', 'Procedure Code': 'proc_code', 'M_Procedure Date and Time': 'chartdate',
                'M_Drawn Date & Time': 'chartdate', 'M_Clinical Event End Date/Time': 'chartdate'}
d_demographic2 = {'M_Local_System_ID': 'subject_id'}       
dtypes = {'subject_id': str, 'Study_ID_Combine':str, 'Study_FIN_Combine': str}
date_columns = ['chartdate', 'date']
# Define prefixes for files
prefixes = ['case_14days', 'control_14days', 'case_demographic2', 'control_demographic2', 'case_antibiotics_true2',
            'control_antibiotics_true2', 'case_culture_result2', 'control_culture_result2',
            'case_demographic3', 'control_demographic3', 'case_procedure', 'control_procedure',
            'case_micro_order', 'control_micro_order', 'case_micro_order2', 'control_micro_order2',
            'case_diagnosis', 'control_diagnosis', 'case_lab_order', 'control_lab_order',
            'case_lab_result', 'control_lab_result', 'case_micro_sensitivity2', 'control_micro_sensitivity2']

In [28]:
remove = pd.read_csv(path_MRSA + remove_list_file, dtype=str)
remove_list = remove['subject_id'].unique().tolist()

In [16]:
# Some useful methods
def load_data(file_path, prefix, dtypes, date_columns=None):
    file_name = prefix + '.csv'
    data = pd.read_csv(file_path + file_name, dtype=dtypes, parse_dates=date_columns)
    if 'subject_id' in data.columns:
        remove = pd.read_csv(path_MRSA + remove_list_file, dtype=str)
        remove_list = remove['subject_id'].unique().tolist()
        data = data[~data['subject_id'].isin(remove_list)]
    #if date_columns:
    #    data = date_change(data)
    return data

def filter_and_concat(dataframes, subjects_index, subjects_index2):
    filtered_dataframes = [df[df['subject_id'].isin(subjects_index)] for df in dataframes]
    filtered_dataframes2 = [df[df['subject_id'].isin(subjects_index2)] for df in dataframes]

    return pd.concat(filtered_dataframes + filtered_dataframes2).dropna().drop_duplicates()

def save_to_tsv(dataframe, file_path):
    dataframe.to_csv(file_path, sep='\t', index=False)

In [17]:
def date_change(data):
    data['chartdate'] = data['chartdate'].astype('datetime64')
    data['date'] = data['chartdate'].dt.date
    data['date'] = data['date'].astype('datetime64') 
    return data

def check_terms(data, term):
    return data[data['event_code'].isin(term)]

def check_terms_l(data, term):
    return data[data['event_code'].str.lower().str.contains('|'.join(term))]

def check_terms_c(data, term):
    return data[data['event_code'].str.contains('|'.join(term))]

def extract_columns(data):
    data = data[['subject_id', 'new_subject_id', 'date', 'event_code']]
    data = data[~data['subject_id'].isin(remove_list)]
    return data

# Load data and prepare for the data preprocessing

In [19]:
# Load data using the function
data_dict = {prefix: load_data(path_clean2, prefix, dtypes, date_columns) for prefix in prefixes}



In [25]:
# Extract the values from the dictionary
values = list(data_dict.values())

# Unpack data from the dictionary
(case, cntrl, case_demo, cntrl_demo, case_abx, cntrl_abx, case_cx_results, cntrl_cx_results,
 case_demo3, cntrl_demo3, case_proc, cntrl_proc, case_order, cntrl_order,
 case_order2, cntrl_order2, case_dx, cntrl_dx, case_order_loc, cntrl_order_loc,
 case_order_result, cntrl_order_result, case_sensitivity, cntrl_sensitivity) = values

In [29]:
dataframes = [case_demo, cntrl_demo, case_dx, cntrl_dx, case_abx, cntrl_abx, 
              case_order_loc, cntrl_order_loc, case_order_result, cntrl_order_result, 
              case_sensitivity, cntrl_sensitivity, case_cx_results, cntrl_cx_results, 
              case_demo3, cntrl_demo3, case_order, cntrl_order, case_order2, cntrl_order2, 
              case_proc, cntrl_proc]

for df in dataframes:
    df = extract_columns(df)

In [30]:
case['mort']=1
cntrl['mort']=0
case['tte'] = case[str(day_cutoff)+'_'+ Org + '_positive_day'].str[:-4].astype(int)
cntrl['tte'] = day_cutoff #.str[:-4].astype(int)

In [31]:
types_d2=pickle.load(open(typeFile, 'rb'), encoding='bytes')
types_d_rev_org = dict(zip(types_d2.values(),types_d2.keys()))
dict_list = list(types_d2.keys())
dict_list = list(set(dict_list))
dict_df = pd.DataFrame(dict_list)
dict_df = dict_df[~dict_df[0].str.contains('ADM_DX')]
check_terms_l(case_order_result, ['negative'])

Unnamed: 0,subject_id,chartdate,event_code,date,new_subject_id
0,19339920,2109-06-18 09:30:00,TEST_(STAT) Hep Bs Ag_Blood_0_Result Type$Nega...,2109-06-19,19339920_1
1,19339920,2109-06-18 09:30:00,TEST_HBc Ab_Blood_0_Result Type$Negative,2109-06-19,19339920_1
2,19339920,2109-06-18 09:30:00,TEST_Hep C Ab_Blood_0_Result Type$Negative,2109-06-19,19339920_1
16,14982471,2110-06-09 08:40:00,TEST_Hep Bs Ab Immune Status (Qst)_Blood_0_Res...,2110-06-10,14982471_5
17,14982471,2110-06-09 08:40:00,TEST_Hep C Ab_Blood_0_Result Type$Negative,2110-06-10,14982471_5
...,...,...,...,...,...
12995,13774759,2203-11-21 15:43:00,TEST_CDC HIV 4th GEN_Blood_0_Result Type$Negative,2203-11-21,13774759_12
12996,13774759,2203-11-21 15:43:00,TEST_(STAT) Hep Bs Ag_Blood_0_Result Type$Nega...,2203-11-22,13774759_12
12997,13774759,2203-11-21 15:43:00,TEST_Hep C Ab_Blood_0_Result Type$Negative,2203-11-22,13774759_12
12999,17370506,2197-11-07 10:40:00,TEST_Hep Bs Ab Immune Status (Qst)_Blood_0_Res...,2197-11-08,17370506_5


In [49]:
# Restructure ICD codes 
case_dx = ICD_restructure(case_dx, dict_df)
cntrl_dx = ICD_restructure(cntrl_dx, dict_df)
# Restructure procedures
case_proc = Proc_restructure(case_proc, dict_df)
cntrl_proc = Proc_restructure(cntrl_proc, dict_df)

In [None]:
dataframes_to_check = [case_demo, cntrl_demo, case_abx, cntrl_abx, case_demo3, cntrl_demo3, 
                       case_order, cntrl_order, case_order2, cntrl_order2, case_order_loc, 
                       cntrl_order_loc, case_order_result, cntrl_order_result, case_sensitivity, 
                       cntrl_sensitivity, case_cx_results, cntrl_cx_results]

def apply_check_terms(df, columns):
    return check_terms(df, columns)

for df in dataframes_to_check:
    df = apply_check_terms(df, dict_df[0].unique())

In [None]:
# Split datasets into train, valid, and test sets
train_mrns_case, valid_mrns_case, test_mrns_case = split(case)
temp = cntrl[~cntrl['subject_id'].isin(case['subject_id'].unique())]
train_mrns_cnt, valid_mrns_cnt, test_mrns_cnt = split(temp)
# Concatenate MRNs for train, valid, and test sets
train_mrns = train_mrns_case.append(train_mrns_cnt)
valid_mrns = valid_mrns_case.append(valid_mrns_cnt)
test_mrns = test_mrns_case.append(test_mrns_cnt)
# Create copies of MRNs
train_mrns2 = train_mrns.copy()
valid_mrns2 = valid_mrns.copy()
test_mrns2 = test_mrns.copy()

In [None]:
subjects_indices = {
    'train': (train_mrns[1].unique(), train_mrns2[1].unique()),
    'valid': (valid_mrns[1].unique(), valid_mrns2[1].unique()),
    'test': (test_mrns[1].unique(), test_mrns2[1].unique())
}

for set_type, (index, index2) in subjects_indices.items():
    dataframes_to_concat = [case_demo, cntrl_demo, case_demo3, cntrl_demo3, case_dx, cntrl_dx,
                            case_abx, cntrl_abx, case_proc, cntrl_proc, case_order_loc,
                            cntrl_order_loc, case_order, cntrl_order, case_order2, cntrl_order2,
                            case_order_result, cntrl_order_result, case_cx_results, cntrl_cx_results,
                            case_sensitivity, cntrl_sensitivity]

    concatenated_dataframe = filter_and_concat(dataframes_to_concat, index, index2)
    save_to_tsv(concatenated_dataframe, f'{path_clean3}/MHH_PT_data_dp2_{set_type}.tsv')

    case_temp = case[case['subject_id'].isin(index)][['subject_id', 'new_subject_id', 'mort', 'tte']].drop_duplicates()
    cntrl_temp = cntrl[cntrl['subject_id'].isin(index2)][['subject_id', 'new_subject_id', 'mort', 'tte']].drop_duplicates()

    outcome_labels = pd.concat([case_temp, cntrl_temp]).dropna()
    save_to_tsv(outcome_labels, f'{path_clean3}/MHH_PT_outcome_labels2_{set_type}.tsv')

# The actual data preprocess

In [None]:
## Proprocess data into pickled list
sys.path.insert(0, path_folder + "/pytorch_ehr/Pytorch_EHR_Tutorial/Data_Prep/")
from preprocess_outcomes  import dump_split_process_data
from preprocess_outcomes  import *

dataFile  = path_clean3+'/MHH_PT_data_dp2_'
labelFile = path_clean3+'/MHH_PT_outcome_labels2_' 
MMH_path = '/data/mnigo/MDR_projects/MRSA/clean_data_clean_diag_simple_location_ISOLMRSA/14days/'
typeFile = MMH_path +'Mimic_PT_mortality_dp_v1.types'

In [None]:
dump_split_process_data('train', dataFile, labelFile, typeFile ,path_clean3+'/Mimic_PT_mortality_dp_v2' , 'NA')

In [None]:
typeFile = path_clean3+ 'Mimic_PT_mortality_dp_v2.types'
dump_split_process_data('valid', dataFile, labelFile, typeFile ,path_clean3+'/Mimic_PT_mortality_dp_v2' , 'NA')

In [None]:
dump_split_process_data('test', dataFile, labelFile, typeFile ,path_clean3+'/Mimic_PT_mortality_dp_v2' , 'NA')