In [1]:
import pandas as pd
from sklearn.utils.random import sample_without_replacement

# 1. Reading the dataset

In [2]:
# Load the mining view from its CSV file (path is relative to the notebook's working directory).
mining_view_csv = "../datasets/mimic-iii-mining-view.csv"
df = pd.read_csv(mining_view_csv)

In [3]:
# Report the size of the loaded dataset: rows are instances, columns are attributes.
n_instances, n_attributes = df.shape
print(f"Number of instances: {n_instances}")
print(f"Number of attributes: {n_attributes}")

Number of instances: 10790
Number of attributes: 32


# 2. Transforming attributes to create the target

Our target is resistant ENTEROCOCCUS bacteria of any species. To build it, we first concatenate the "culture_microorganism_name" and "culture_susceptibility" attributes into a single attribute.

In [4]:
# Frequency of each microorganism name, displayed in alphabetical order of the name.
microorganism_name_counts = df["culture_microorganism_name"].value_counts()
microorganism_name_counts.sort_index()

culture_microorganism_name
ABIOTROPHIA/GRANULICATELLA SPECIES                                           4
AEROCOCCUS VIRIDANS                                                          1
ALPHA STREPTOCOCCI                                                          12
BACILLUS SPECIES; NOT ANTHRACIS                                              3
BETA STREPTOCOCCUS GROUP A                                                  12
BETA STREPTOCOCCUS GROUP B                                                  76
BETA STREPTOCOCCUS GROUP C                                                   2
BETA STREPTOCOCCUS GROUP G                                                   7
CORYNEBACTERIUM SPECIES (DIPHTHEROIDS)                                      11
CORYNEBACTERIUM STRIATUM                                                     1
CORYNEBACTERIUM UREALYTICUM SP. NOV.                                         3
CORYNEBCATERIUM AMYCOLATUM                                                   1
ENTEROCOCCUS AVIUM       

In [5]:
# Frequency of each susceptibility value, displayed in alphabetical order of the value.
susceptibility_counts = df["culture_susceptibility"].value_counts()
susceptibility_counts.sort_index()

culture_susceptibility
R    1889
S    8901
Name: count, dtype: int64

In [6]:
# Merge the two attributes into a single "<microorganism name>-<susceptibility>" label.
# pop() removes each source column from df and returns it, so once the combined
# column is appended the two originals are no longer part of the frame.
microorganism_name = df.pop("culture_microorganism_name")
susceptibility = df.pop("culture_susceptibility")
df["culture_microorganism_name_AND_susceptibility"] = microorganism_name + "-" + susceptibility

In [7]:
# Frequency of each combined "<microorganism name>-<susceptibility>" label, sorted by label.
combined_label_counts = df["culture_microorganism_name_AND_susceptibility"].value_counts()
combined_label_counts.sort_index()

culture_microorganism_name_AND_susceptibility
ABIOTROPHIA/GRANULICATELLA SPECIES-S                                           4
AEROCOCCUS VIRIDANS-S                                                          1
ALPHA STREPTOCOCCI-S                                                          12
BACILLUS SPECIES; NOT ANTHRACIS-S                                              3
BETA STREPTOCOCCUS GROUP A-S                                                  12
BETA STREPTOCOCCUS GROUP B-S                                                  76
BETA STREPTOCOCCUS GROUP C-S                                                   2
BETA STREPTOCOCCUS GROUP G-S                                                   7
CORYNEBACTERIUM SPECIES (DIPHTHEROIDS)-S                                      11
CORYNEBACTERIUM STRIATUM-S                                                     1
CORYNEBACTERIUM UREALYTICUM SP. NOV.-S                                         3
CORYNEBCATERIUM AMYCOLATUM-S                                   

In [8]:
# Collapse every ENTEROCOCCUS species into a single "ENTEROCOCCUS_SP." label while
# keeping the susceptibility suffix ("-R" or "-S").
#
# The mapping is generated for every species x susceptibility combination, so no
# pair can be forgotten. This adds "ENTEROCOCCUS AVIUM-R" and "ENTEROCOCCUS MUNDTII-S",
# which a hand-written table would leave out; labels absent from the data are ignored.
enterococcus_species = [
    "AVIUM",
    "CASSELIFLAVUS",
    "FAECALIS",
    "FAECIUM",
    "GALLINARUM",
    "MUNDTII",
    "RAFFINOSUS",
    "SP.",
]
enterococcus_relabeling = {
    f"ENTEROCOCCUS {species}-{susceptibility}": f"ENTEROCOCCUS_SP.-{susceptibility}"
    for species in enterococcus_species
    for susceptibility in ("R", "S")
}

# Assign the result back to the column instead of calling replace(inplace=True) on
# df[...]: the in-place form mutates an intermediate Series, which triggers a
# chained-assignment warning and leaves df unchanged under pandas Copy-on-Write.
df["culture_microorganism_name_AND_susceptibility"] = (
    df["culture_microorganism_name_AND_susceptibility"].replace(enterococcus_relabeling)
)

In [9]:
# Frequency of each combined label after the ENTEROCOCCUS relabeling, sorted by label.
relabeled_label_counts = df["culture_microorganism_name_AND_susceptibility"].value_counts()
relabeled_label_counts.sort_index()

culture_microorganism_name_AND_susceptibility
ABIOTROPHIA/GRANULICATELLA SPECIES-S                                           4
AEROCOCCUS VIRIDANS-S                                                          1
ALPHA STREPTOCOCCI-S                                                          12
BACILLUS SPECIES; NOT ANTHRACIS-S                                              3
BETA STREPTOCOCCUS GROUP A-S                                                  12
BETA STREPTOCOCCUS GROUP B-S                                                  76
BETA STREPTOCOCCUS GROUP C-S                                                   2
BETA STREPTOCOCCUS GROUP G-S                                                   7
CORYNEBACTERIUM SPECIES (DIPHTHEROIDS)-S                                      11
CORYNEBACTERIUM STRIATUM-S                                                     1
CORYNEBACTERIUM UREALYTICUM SP. NOV.-S                                         3
CORYNEBCATERIUM AMYCOLATUM-S                                   

We then binarize the target: every value other than "ENTEROCOCCUS_SP.-R" is relabeled as "noTarget".

In [10]:
# target = (name of the target column, label of the positive class).
target = ("culture_microorganism_name_AND_susceptibility", "ENTEROCOCCUS_SP.-R")
target_column, positive_label = target

# Relabel every non-positive row as "noTarget" so the target becomes binary.
# A single df.loc[rows, column] assignment writes into df itself; the chained form
# df[column].loc[rows] = ... assigns into an intermediate Series, which raises
# chained-assignment warnings and has no effect on df under pandas Copy-on-Write.
df.loc[df[target_column] != positive_label, target_column] = "noTarget"

In [11]:
# Class distribution of the binarized target (most frequent class first).
binary_target_counts = df["culture_microorganism_name_AND_susceptibility"].value_counts()
binary_target_counts

culture_microorganism_name_AND_susceptibility
noTarget              8918
ENTEROCOCCUS_SP.-R    1872
Name: count, dtype: int64

# 3. Balancing the dataset

To apply the machine learning techniques, the target classes must be balanced. To achieve this, we randomly subsample the negative instances so that their number matches the number of positive instances.

In [12]:
# Split the rows by class using one boolean mask on the target column.
target_column, positive_label = target
is_positive = df[target_column] == positive_label
df_positive_instances = df[is_positive]
df_negative_instances = df[~is_positive]

# Draw as many negative row positions as there are positive rows, without
# replacement; the fixed random_state makes the draw repeatable across runs.
negative_row_positions = sample_without_replacement(
    n_population=len(df_negative_instances),
    n_samples=len(df_positive_instances),
    random_state=100,
)
df_negative_instances_subsample = df_negative_instances.iloc[negative_row_positions]

In [13]:
# Stack the positive rows on top of the subsampled negative rows, then discard the
# original row labels in favour of a fresh 0..n-1 index.
balanced_parts = [df_positive_instances, df_negative_instances_subsample]
df_final = pd.concat(balanced_parts).reset_index(drop=True)

In [14]:
# Report the size of the balanced dataset: rows are instances, columns are attributes.
n_final_instances, n_final_attributes = df_final.shape
print(f"Number of instances: {n_final_instances}")
print(f"Number of attributes: {n_final_attributes}")

Number of instances: 3744
Number of attributes: 31


In [15]:
# Class distribution of the target in the balanced dataset.
final_target_counts = df_final["culture_microorganism_name_AND_susceptibility"].value_counts()
final_target_counts

culture_microorganism_name_AND_susceptibility
ENTEROCOCCUS_SP.-R    1872
noTarget              1872
Name: count, dtype: int64

In [16]:
# For every attribute of the balanced dataset, print a header line followed by
# the frequency of each of its values.
for column_name in df_final.columns:
    header = f"** COLUMN {column_name.upper()} **"
    print(header)
    print(df_final[column_name].value_counts())

** COLUMN PATIENT_GENDER **
patient_gender
M    2061
F    1683
Name: count, dtype: int64
** COLUMN PATIENT_AGE **
patient_age
ADULT      1867
ELDERLY    1842
CHILD        35
Name: count, dtype: int64
** COLUMN READMISSION **
readmission
no     2345
yes    1399
Name: count, dtype: int64
** COLUMN EXITUS **
exitus
no     2881
yes     863
Name: count, dtype: int64
** COLUMN ADMISSION_TYPE **
admission_type
EMERGENCY    3443
ELECTIVE      269
NEWBORN        32
Name: count, dtype: int64
** COLUMN ADMISSION_LOCATION **
admission_location
EMERGENCY_ROOM_ADMIT         1714
TRANSFER_FROM_HOSP/EXTRAM     824
CLINIC_REFERRAL/PREMATURE     596
PHYS_REFERRAL/NORMAL_DELI     520
TRANSFER_FROM_SKILLED_NUR      47
TRANSFER_FROM_OTHER_HEALT      42
NO_INFO                         1
Name: count, dtype: int64
** COLUMN DISCHARGE_LOCATION **
discharge_location
REHAB/DISTINCT_PART_HOSP     1025
DEAD/EXPIRED                  863
SNF                           550
HOME_HEALTH_CARE              530
LONG_TERM_C

# 4. Save the final dataset as csv

In [17]:
# Write the balanced dataset to a CSV file in the current working directory;
# index=False keeps the row index out of the file.
experiments_csv = "mimic-iii-for-experiments.csv"
df_final.to_csv(experiments_csv, index=False)