## Filter target patients 

### Import most libraries needed 

In [1]:
import pandas as pd
from constants import MIMIC_PATH, TEMP_PATH

### Define most constants needed
- CKD: Chronic kidney disease (suy thận mãn tính)
- DKA: Diabetic ketoacidosis (nhiễm toan ceton do đái tháo đường)

In [2]:
# ICD-9 diagnosis codes for diabetic ketoacidosis (DKA).
# The T1/T2 lists are kept separate so rows can later be labelled by diabetes type.
DKA_T1_CODE_V9 = ["25011", "25013"]
DKA_T2_CODE_V9 = ["25010", "25012"]
DKA_CODE_V9 = [
    "24910",  # Secondary diabetes mellitus with ketoacidosis
    # 249.11 — secondary diabetes mellitus with ketoacidosis, uncontrolled
    # NOTE(review): the earlier note on this code said "pregnant"; confirm against ICD-9-CM.
    "24911",
    *DKA_T1_CODE_V9,
    *DKA_T2_CODE_V9,
]

# ICD-10 diagnosis codes for DKA, again split by diabetes type.
DKA_T1_CODE_V10 = [
    "E101",  # Type 1 diabetes mellitus with ketoacidosis
    "E1010",
    "E1011",
]
DKA_T2_CODE_V10 = [
    "E111",  # Type 2 diabetes mellitus with ketoacidosis
    "E1110",
    "E1111",
]
DKA_CODE_V10 = [
    "E081",  # Diabetes mellitus due to underlying condition with ketoacidosis
    "E0810",
    "E0811",
    "E091",  # Drug or chemical induced diabetes mellitus with ketoacidosis
    "E0910",
    "E0911",
    "E131",  # Other specified diabetes mellitus with ketoacidosis
    "E1310",
    "E1311",
    # TODO: add E141 family (not in current version 2.2)
    *DKA_T1_CODE_V10,
    *DKA_T2_CODE_V10,
]

# Chronic kidney disease (CKD) stage 5 / end stage renal disease codes, ICD-9.
CKD5_CODE_V9 = [
    "40301",  # Hypertensive chronic kidney disease, malignant, with chronic kidney disease stage V or end stage renal disease
    "40311",  # Hypertensive chronic kidney disease, benign, with chronic kidney disease stage V or end stage renal disease
    "40391",  # Hypertensive chronic kidney disease, unspecified, with chronic kidney disease stage V or end stage renal disease
    # 404.x2 / 404.x3 — presumably the hypertensive heart and chronic kidney
    # disease entries with CKD stage V / ESRD; verify against the ICD-9-CM table.
    "40402",
    "40403",
    "40412",
    "40413",
    "40492",
    "40493",
    "5855",  # Stage 5
    "5856",  # End stage renal disease
]

# CKD stage 5 / end stage renal disease codes, ICD-10.
CKD5_CODE_V10 = [
    "I120",  # Hypertensive
    "I1311",
    "I132",
    "N185",  # stage 5
    "N186",  # End stage renal disease
]

# Echo every assembled list, one per line, as a quick visual check.
for _codeList in (DKA_CODE_V9, DKA_CODE_V10, CKD5_CODE_V9, CKD5_CODE_V10):
    print(f"{_codeList}")

### Filter patients 
1. Keep only patients diagnosed with DKA
1. Filter repeated ICU admissions during one hospitalization, keeping the first
1. Remove attributes with more than 20% missing data

#### Read icd_code 

In [3]:
# Load every diagnosis row. The column types are fixed at parse time:
# - icd_code must be read as text; if pandas infers the type, all-numeric values
#   are parsed as integers first and lose their leading zeros (e.g. "0389" -> 389)
#   before any later conversion to str can preserve them.
# - icd_version is read directly as an integer (compared against 9 / 10 below).
dfDiagnosesIcd = pd.read_csv(
    str(MIMIC_PATH / "hosp" / "diagnoses_icd.csv"),
    dtype={"icd_code": str, "icd_version": int},
)

dfDiagnosesIcd

#### Patients diagnosed with DKA

In [4]:
from math import nan

# Create the "dka_type" column with every row set to NaN (i.e. "no DKA label yet").
# Rows that never receive a label are removed later via the NaN check.
dfDiagnosesIcd.loc[:, "dka_type"] = nan

##### Mark rows that have DKA

In [5]:
# A row is a DKA diagnosis when its code is in the DKA list for its own ICD version.
isDkaV9 = dfDiagnosesIcd["icd_version"].eq(9) & dfDiagnosesIcd["icd_code"].isin(DKA_CODE_V9)
isDkaV10 = dfDiagnosesIcd["icd_version"].eq(10) & dfDiagnosesIcd["icd_code"].isin(DKA_CODE_V10)
dkaCondition = isDkaV10 | isDkaV9

# 0 = DKA of no specific type; the type 1 / type 2 labels overwrite it afterwards.
dfDiagnosesIcd.loc[dkaCondition, "dka_type"] = 0

##### Mark type of DKA

###### Type 1 of code ICD9

In [6]:
# ICD-9 rows whose code is in the type 1 DKA list are labelled dka_type = 1.
isT1V9 = dfDiagnosesIcd["icd_code"].isin(DKA_T1_CODE_V9) & dfDiagnosesIcd["icd_version"].eq(9)
dfDiagnosesIcd.loc[isT1V9, "dka_type"] = 1

###### Type 1 of code ICD10

In [7]:
# ICD-10 rows whose code is in the type 1 DKA list are labelled dka_type = 1.
isT1V10 = dfDiagnosesIcd["icd_code"].isin(DKA_T1_CODE_V10) & dfDiagnosesIcd["icd_version"].eq(10)
dfDiagnosesIcd.loc[isT1V10, "dka_type"] = 1

###### Type 2 of code ICD9

In [8]:
# ICD-9 rows whose code is in the type 2 DKA list are labelled dka_type = 2.
isT2V9 = dfDiagnosesIcd["icd_code"].isin(DKA_T2_CODE_V9) & dfDiagnosesIcd["icd_version"].eq(9)
dfDiagnosesIcd.loc[isT2V9, "dka_type"] = 2

###### Type 2 of code ICD10

In [9]:
# ICD-10 rows whose code is in the type 2 DKA list are labelled dka_type = 2.
isT2V10 = dfDiagnosesIcd["icd_code"].isin(DKA_T2_CODE_V10) & dfDiagnosesIcd["icd_version"].eq(10)
dfDiagnosesIcd.loc[isT2V10, "dka_type"] = 2

##### Drop non DKA

In [10]:
# Keep only the rows that received a DKA label (dka_type is not NaN).
hasDkaLabel = dfDiagnosesIcd["dka_type"].notna()
dfDkaDiagnoses = dfDiagnosesIcd[hasDkaLabel]
dfDkaDiagnoses

#### CKD stage 5 diagnoses

In [11]:
# A row is a CKD stage 5 diagnosis when its code is in the CKD5 list for its own ICD version.
isCkd5V9 = dfDiagnosesIcd["icd_version"].eq(9) & dfDiagnosesIcd["icd_code"].isin(CKD5_CODE_V9)
isCkd5V10 = dfDiagnosesIcd["icd_version"].eq(10) & dfDiagnosesIcd["icd_code"].isin(CKD5_CODE_V10)
ckd5Condition = isCkd5V9 | isCkd5V10

# Select the matching diagnosis rows; their hadm_id values drive the exclusion step.
dfCkd5Diagnoses = dfDiagnosesIcd.loc[ckd5Condition]
dfCkd5Diagnoses

#### Exclude all admissions with CKD 5

In [12]:
# Drop every DKA diagnosis row whose admission (hadm_id) also carries a CKD stage 5 diagnosis.
ckd5AdmissionIds = dfCkd5Diagnoses["hadm_id"]
hasCkd5 = dfDkaDiagnoses["hadm_id"].isin(ckd5AdmissionIds)
dfDkaExcludeCkd5 = dfDkaDiagnoses.loc[~hasCkd5]
dfDkaExcludeCkd5

#### Exclude multiple admissions
TODO: check the legitimacy of this action 

1. read icu stay
1. group by admission id (hadm_id), get first "intime"
1. join with data above by hadm


##### Read data and remove duplicates

In [13]:
# Load all ICU stays and parse the admission-to-ICU timestamp so it sorts chronologically.
dfIcu = pd.read_csv(MIMIC_PATH / "icu" / "icustays.csv")
dfIcu["intime"] = pd.to_datetime(dfIcu["intime"])

# Keep exactly one ICU stay per hospital admission: the one with the earliest "intime".
# drop_duplicates(keep="first") keeps the first row it encounters, so the rows must be
# ordered by "intime" beforehand; otherwise the survivor depends on CSV row order.
# The stable sort makes ties deterministic, and sort_index() restores the original
# row order of the surviving stays.
dfIcuNoDup = (
    dfIcu.sort_values("intime", kind="stable")
    .drop_duplicates(subset="hadm_id", keep="first")
    .sort_index()
)
dfIcuNoDup

##### join admission

We will get patients who had DKA but no CKD 5, and who were admitted to the ICU.

In [14]:
# Find the column names present in both frames before merging; apart from the
# join key "hadm_id", these are the columns that will receive suffixes.
sharedColumns = set(dfDkaExcludeCkd5.columns).intersection(dfIcuNoDup.columns)
sharedColumns.remove("hadm_id")

sharedColumns

In [15]:
# Inner-join the DKA (non-CKD5) diagnosis rows with the de-duplicated ICU stays,
# so only admissions present in both frames are kept. Columns that exist on both
# sides (other than the key) come back as "<name>_icd" and "<name>_icu".
dfTargetPatients = dfDkaExcludeCkd5.merge(
    dfIcuNoDup,
    on="hadm_id",
    how="inner",
    suffixes=("_icd", "_icu"),
)

# Collapse each suffixed pair back into one column when both sides agree on every
# row; if any row disagrees, leave both suffixed columns in place.
for col in sharedColumns:
    icdCol, icuCol = f"{col}_icd", f"{col}_icu"
    icdValues = dfTargetPatients[icdCol]
    icuValues = dfTargetPatients[icuCol]

    # Compare column-wise instead of row by row with a None marker: pandas turns
    # None into NaN in numeric results, so a `None in values` check never detects
    # a mismatch for numeric columns. Rows missing on both sides count as equal.
    isSame = (icdValues == icuValues) | (icdValues.isna() & icuValues.isna())

    if isSame.all():
        dfTargetPatients[col] = icdValues
        dfTargetPatients.drop(columns=[icdCol, icuCol], inplace=True)

dfTargetPatients.sort_values(["hadm_id"], inplace=True)


dfTargetPatients

In [16]:
from constants import TARGET_PATIENT_FILE

# Write the target-patient table as CSV to TEMP_PATH / TARGET_PATIENT_FILE.
targetPatientPath = TEMP_PATH / TARGET_PATIENT_FILE
dfTargetPatients.to_csv(targetPatientPath)

dfTargetPatients