In [1]:
import pandas as pd
import json
import os
import numpy as np
import ast

# MAP ICD-9-CM to CCS

In [3]:
# Root folder of the MIMIC-III CSV export; DIAGNOSES_ICD.csv and ADMISSIONS.csv are read from here.
mimic_path = '/ssd2/sanghoon/data/EHR/mimic3/mimic3' # your MIMIC-III folder
# Folder that receives preprocessed.csv and preprocessed_emb_idx.csv.
save_path = '../data'
# PatientTM preprocessing folder; the ICD->CCS mapping and the idx files are loaded from its sub-folders.
patitentTM_preprocessing_path='/home/sanghoon/drug/project/EHR/PatientTM/data/extended/preprocessing'# your PatientTM preprocessing folder


In [10]:
# using code : []
def featureToIdx(features):
    """Assign an integer index to each distinct feature, in first-seen order.

    The key "0" is reserved at index 0 so the model can mask padding codes;
    real features are numbered from 1. Repeated entries keep the index they
    received the first time they were seen.
    """
    feature2idx = {"0": 0}  # index 0 is the padding slot
    for entry in features:
        if entry not in feature2idx:
            # The padding key is already stored, so the current size of the
            # dict is exactly the next unused index.
            feature2idx[entry] = len(feature2idx)
    return feature2idx


def getICDlevel1(icd9_code):
    """
    Return the first hierarchy level of a prefixed ICD-9 code.

    The number of leading characters kept depends on the code family:
        - "D_E..." (diagnosis E-codes): 6 characters, i.e. D_Exxx
        - any other "D..." diagnosis (Vxx or numeric xxx): 5 characters
        - "P..." procedures: 4 characters, i.e. P_xx
    Any other input yields None.
    """
    if icd9_code.startswith("D_E"):
        prefix_len = 6
    elif icd9_code.startswith("D"):
        prefix_len = 5
    elif icd9_code.startswith("P"):
        prefix_len = 4
    else:
        # Neither a procedure nor a diagnosis code.
        return None
    return icd9_code[:prefix_len]

        
def map_ICD9_to_CCS(patiemtTM_proprocess_path,pandasDataFrame):
    """Translate each row's list of ICD-9 codes into the matching list of CCS codes.

    The lookup table is read from
    <patiemtTM_proprocess_path>/ICDandCCSmappings/merged_icdccs_codes.json
    (PatientTM's merged ICD/CCS mapping file).

    Codes that are absent from the table are dropped rather than replaced by a
    placeholder, so no extra noise is introduced; they are only counted. The
    mapped/unmapped totals are printed once after all rows are processed.

    Returns a list with one list of CCS codes per row of pandasDataFrame, in
    row order. Each row must expose an iterable ICD9_CODE attribute.
    """
    mapping_file = patiemtTM_proprocess_path+'/ICDandCCSmappings/merged_icdccs_codes.json'
    with open(mapping_file,'r') as handle:
        icd_to_ccs = json.load(handle)

    ccs_per_row = []
    hits = 0
    misses = 0
    for record in pandasDataFrame.itertuples():
        row_codes = []
        for code in record.ICD9_CODE:
            try:
                ccs = icd_to_ccs[code]
            except KeyError:
                # Unknown code: skip it, just keep track of how many were lost.
                misses += 1
            else:
                row_codes.append(ccs)
                hits += 1
        ccs_per_row.append(row_codes)

    print("-Total of mapped/unmapped entries {}/{}".format(hits,misses))
    return ccs_per_row

In [3]:

# Sub-folders of the PatientTM preprocessing directory (index files and embeddings).
# NOTE(review): emb_fold is not referenced in the visible cells - confirm it is still needed.
idx_fold=patitentTM_preprocessing_path+'/idxFiles/'
emb_fold=patitentTM_preprocessing_path+'/embeddings/'


# MIMIC-III tables to load; each one is read from <mimic_path>/<NAME>.csv.
using_feature = ['DIAGNOSES_ICD', 'ADMISSIONS']
data_dict ={}

for file in using_feature:
    temp_name = file +'.csv'
    
    # Tables are stored under their lower-cased name, e.g. data_dict['admissions'].
    data_dict[file.lower()] = pd.read_csv(mimic_path+'/'+temp_name)

In [4]:
# HOSPITAL_EXPIRE_FLAG per admission, indexed by HADM_ID.
temp_y = (
    data_dict['admissions']
    .loc[:, ['HADM_ID','HOSPITAL_EXPIRE_FLAG']]
    .set_index('HADM_ID')
)

# One row per admission holding the list of its raw ICD-9 codes,
# stored in a column named 'diagnosis_icd9'.
diagnosis = (
    data_dict['diagnoses_icd']
    .groupby('HADM_ID')['ICD9_CODE']
    .apply(list)
    .to_frame()
    .rename(columns={'ICD9_CODE': 'diagnosis_icd9'})
    .reset_index()
)

# Left join: every admission row is kept, with or without diagnosis rows.
merge_pd = data_dict['admissions'].merge(diagnosis, how='left', on='HADM_ID')


In [11]:
# Keep only diagnosis rows that carry a code, order them by admission and
# SEQ_NUM, and prefix every code with "D_".
df_diagnoses = (
    data_dict['diagnoses_icd']
    .loc[lambda frame: frame.ICD9_CODE.notna()]
    .sort_values(['HADM_ID','SEQ_NUM'], ascending=True)
    .reset_index(drop = True)
    .assign(ICD9_CODE=lambda frame: "D_" + frame.ICD9_CODE.astype(str))
)

# Collapse to one row per admission with the ordered list of its prefixed codes.
df_diag_listing = (
    df_diagnoses
    .groupby('HADM_ID')['ICD9_CODE']
    .apply(list)
    .reset_index()
)

# Attach the CCS translation of each admission's code list as a new column.
diagnosesCCS = map_ICD9_to_CCS(patitentTM_preprocessing_path,df_diag_listing)
df_diag_listing['DIAG_CCS'] = diagnosesCCS

-Total of mapped/unmapped entries 651000/0


In [35]:
# Peek at the prefixed ICD-9 code list stored in the first row (index label 0).
df_diag_listing.loc[0, 'ICD9_CODE']

['D_25013',
 'D_3371',
 'D_5849',
 'D_5780',
 'D_V5867',
 'D_25063',
 'D_5363',
 'D_4580',
 'D_25043',
 'D_40390',
 'D_5853',
 'D_25053',
 'D_36201',
 'D_25083',
 'D_7078',
 'D_V1351']

In [14]:
# Add the prefixed ICD-9 list (renamed to DIAG_ICD9) and the CCS list to every
# admission row; the left join keeps admissions that have no diagnosis entry.
df_adm = (
    merge_pd
    .merge(
        df_diag_listing[['HADM_ID','ICD9_CODE','DIAG_CCS']],
        on = ['HADM_ID'],
        how = 'left',
    )
    .rename(columns={'ICD9_CODE': 'DIAG_ICD9'})
)

In [46]:
# Make sure the output folder exists first: to_csv does not create missing
# directories and would otherwise fail only after all preprocessing has run.
os.makedirs(save_path, exist_ok=True)
# The list-valued columns are written as their string representation, which
# is why they are parsed back with ast.literal_eval after the file is reloaded.
df_adm.to_csv(save_path+'/preprocessed.csv',index=False)

# CCS code indexing

In [19]:
# Same sub-folder paths as assigned earlier in the notebook, re-declared so this
# section can run without that earlier cell.
idx_fold=patitentTM_preprocessing_path+'/idxFiles/'
emb_fold=patitentTM_preprocessing_path+'/embeddings/'

# Lookup table loaded from CCSToIdx.json; it is passed to convert2idx as idx_dict.
# presumably maps CCS code -> integer index (per the file name) - TODO confirm
with open(idx_fold+'CCSToIdx.json',"r") as file:
    ccs2idx = json.load(file)


In [20]:
import ast
def convert2idx(df_row, idx_dict, col_name):
    """Convert the stringified code list in ``df_row[col_name]`` to a list of indices.

    Parameters
    ----------
    df_row : row object supporting ``df_row[col_name]`` (e.g. a pandas Series).
    idx_dict : mapping from code string to integer index.
    col_name : name of the column whose cell holds the text form of a Python
        list of codes (as obtained after a CSV round-trip).

    Returns
    -------
    ``np.nan`` when the cell is null; otherwise a list with one entry per code:
      * 0 for the hard-coded codes 'P_3601', 'P_3602', 'D_71970', 'P_3605'
        (presumably the padding index - confirm against the model);
      * ``idx_dict[code]`` when the code is present;
      * otherwise the index of the variant with a '0' inserted after the
        first underscore (e.g. 'D_49' -> 'D_049').

    Raises
    ------
    KeyError
        If neither the code nor its zero-padded variant is in ``idx_dict``.
    """
    if pd.isnull(df_row[col_name]):
        return np.nan

    # Codes that are deliberately sent to index 0 instead of being looked up.
    zero_index_keys = ('P_3601', 'P_3602', 'D_71970', 'P_3605')

    idx_list = []
    for key in ast.literal_eval(df_row[col_name]):
        if key in zero_index_keys:
            idx_list.append(0)
            continue
        try:
            idx_list.append(idx_dict[key])
        except KeyError:
            # Only a missing key triggers the fallback; any other error is a
            # real problem and must surface instead of being masked.
            parts = key.split('_')
            idx_list.append(idx_dict[parts[0] + '_0' + parts[1]])
    return idx_list
def check_del_key(df_row, idx_dict, col_name):
    """Return the first code in ``df_row[col_name]`` that is missing from ``idx_dict``.

    Parameters
    ----------
    df_row : row object supporting ``df_row[col_name]`` (e.g. a pandas Series).
    idx_dict : mapping whose keys are the known codes.
    col_name : name of the column whose cell holds the text form of a Python
        list of codes.

    Returns
    -------
    The first code not found in ``idx_dict``, or the string 'nan' when the
    cell is null, the list is empty, or every code is known.
    """
    if pd.isnull(df_row[col_name]):
        return 'nan'
    for key in ast.literal_eval(df_row[col_name]):
        # Do not stop at the first known code: every element has to be
        # checked, otherwise a missing code later in the list goes unnoticed.
        if key not in idx_dict:
            return key
    return 'nan'

In [49]:
# Reload the saved table (list columns come back as plain strings) and replace
# every DIAG_CCS cell by the index list computed by convert2idx.
data = pd.read_csv(save_path+'/preprocessed.csv')
data = data.assign(
    DIAG_CCS=data.apply(lambda row: convert2idx(row, ccs2idx, 'DIAG_CCS'), axis=1)
)

In [57]:
# Write the table whose DIAG_CCS column now holds index lists, without the row index.
data.to_csv(save_path+'/preprocessed_emb_idx.csv',index=False)