# Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import category_encoders as ce
from imblearn.over_sampling import SMOTENC

### 1.1 Loading the Dataset

#### 1.1.1 Reloading the Dataset

In [3]:
# Read the raw dataset from disk and preview its first six rows
raw_dataset_path = '../00-dataset/00-raw-dataset/dataset.csv'
master_data = pd.read_csv(raw_dataset_path)
master_data.head(n=6)

Unnamed: 0,encounter_id,patient_id,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,icu_admit_source,...,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem,Unnamed: 83,hospital_death
0,66154,25312,118,68.0,22.73,0,Caucasian,M,180.3,Floor,...,1.0,0.0,0.0,0.0,0.0,0.0,Sepsis,Cardiovascular,,0
1,114252,59342,81,77.0,27.42,0,Caucasian,F,160.0,Floor,...,1.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory,,0
2,119783,50777,118,25.0,31.95,0,Caucasian,F,172.7,Accident & Emergency,...,0.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic,,0
3,79267,46918,118,81.0,22.64,1,Caucasian,F,165.1,Operating Room / Recovery,...,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular,,0
4,92056,34377,33,19.0,,0,Caucasian,M,188.0,Accident & Emergency,...,0.0,0.0,0.0,0.0,0.0,0.0,Trauma,Trauma,,0
5,33181,74489,83,67.0,27.56,0,Caucasian,M,190.5,Accident & Emergency,...,1.0,0.0,0.0,0.0,0.0,0.0,Neurological,Neurologic,,0


#### 1.1.2 Removing duplicate rows and dropping completely empty columns

In [4]:
# Remove exact duplicate rows first, then discard every column that holds no values at all
master_data = (
    master_data
    .drop_duplicates()
    .dropna(axis=1, how='all')
)

#### 1.1.3 Checking the updated structure

In [5]:
# Preview the first six rows after the clean-up above
master_data.head(n=6)

Unnamed: 0,encounter_id,patient_id,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,icu_admit_source,...,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem,hospital_death
0,66154,25312,118,68.0,22.73,0,Caucasian,M,180.3,Floor,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Sepsis,Cardiovascular,0
1,114252,59342,81,77.0,27.42,0,Caucasian,F,160.0,Floor,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory,0
2,119783,50777,118,25.0,31.95,0,Caucasian,F,172.7,Accident & Emergency,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic,0
3,79267,46918,118,81.0,22.64,1,Caucasian,F,165.1,Operating Room / Recovery,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular,0
4,92056,34377,33,19.0,,0,Caucasian,M,188.0,Accident & Emergency,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Trauma,Trauma,0
5,33181,74489,83,67.0,27.56,0,Caucasian,M,190.5,Accident & Emergency,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Neurological,Neurologic,0


### 1.2 Missing Value Analysis

#### 1.2.1 Calculate Missing Values and their Percentages

In [6]:
# Count the nulls in every column and express them as a percentage of all rows
total_rows = len(master_data)
missing_values = master_data.isnull().sum()
missing_percentage = (missing_values / total_rows) * 100

# Assemble the per-column report, most affected columns first
missing_data_report = pd.DataFrame(
    {
        "Column": master_data.columns,
        "Missing Values": missing_values,
        "Missing Percentage (%)": missing_percentage,
    }
)
missing_data_report = missing_data_report.sort_values(
    by="Missing Percentage (%)", ascending=False
)

# Show only the columns that actually contain missing values
has_missing = missing_data_report["Missing Values"] > 0
missing_data_report.loc[has_missing]

Unnamed: 0,Column,Missing Values,Missing Percentage (%)
d1_potassium_max,d1_potassium_max,9585,10.451081
d1_potassium_min,d1_potassium_min,9585,10.451081
h1_mbp_noninvasive_max,h1_mbp_noninvasive_max,9084,9.904812
h1_mbp_noninvasive_min,h1_mbp_noninvasive_min,9084,9.904812
apache_4a_icu_death_prob,apache_4a_icu_death_prob,7947,8.665075
...,...,...,...
d1_sysbp_min,d1_sysbp_min,159,0.173367
d1_heartrate_max,d1_heartrate_max,145,0.158102
d1_heartrate_min,d1_heartrate_min,145,0.158102
icu_admit_source,icu_admit_source,112,0.122120


#### 1.2.2 Handling Missing Values

In [7]:
# Impute missing values column by column, choosing the statistic by column type:
#   * continuous numeric columns     -> mean
#   * binary (0/1) indicator columns -> mode, so the flags stay valid 0/1 values
#     (a mean would write a fractional value into an indicator column)
#   * categorical (object) columns   -> mode
# "number" covers every numeric width, not only float64 / int64.
numeric_columns = master_data.select_dtypes(include="number").columns
categorical_columns = master_data.select_dtypes(include=["object"]).columns

# Only columns that actually contain nulls need a fill value; leaving the rest
# untouched keeps their dtypes exactly as loaded.
columns_with_missing = master_data.columns[master_data.isnull().any()]

fill_values = {}
for column in columns_with_missing:
    observed = master_data[column].dropna()
    if observed.empty:
        # No observed value to derive a statistic from; leave the column as is
        continue

    is_numeric = column in numeric_columns
    is_binary = is_numeric and observed.isin([0, 1]).all()

    if is_numeric and not is_binary:
        fill_values[column] = observed.mean()
    elif is_binary or column in categorical_columns:
        # mode() may return several values on ties; take the first, as before
        fill_values[column] = observed.mode().iloc[0]

if fill_values:
    master_data = master_data.fillna(value=fill_values)

# Check for remaining missing values
remaining_missing_values = master_data.isnull().sum().sum()

# Print the updated structure, then display the remaining-missing count on its own
# (info() returns None, so pairing both in a tuple only displays a stray `None`)
master_data.info()
remaining_missing_values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91713 entries, 0 to 91712
Data columns (total 84 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   encounter_id                   91713 non-null  int64  
 1   patient_id                     91713 non-null  int64  
 2   hospital_id                    91713 non-null  int64  
 3   age                            91713 non-null  float64
 4   bmi                            91713 non-null  float64
 5   elective_surgery               91713 non-null  int64  
 6   ethnicity                      91713 non-null  object 
 7   gender                         91713 non-null  object 
 8   height                         91713 non-null  float64
 9   icu_admit_source               91713 non-null  object 
 10  icu_id                         91713 non-null  int64  
 11  icu_stay_type                  91713 non-null  object 
 12  icu_type                       91713 non-null 

(None, np.int64(0))

### 1.3 Identifying and removing non-relevant features

In [8]:
# Identifier-style columns judged unrelated to the prediction task (domain knowledge);
# any of them that is already absent is skipped silently
non_related_columns = ['encounter_id', 'patient_id', 'hospital_id', 'icu_id']
master_data = master_data.drop(labels=non_related_columns, axis=1, errors='ignore')

### 2.1 Create three sub-sampled datasets

#### 2.1.1 Use stratified sampling to keep the initial class imbalance of the master dataset

In [9]:

# Identify the column holding the target values for classification
target_column = 'hospital_death'

# Fixed base seed. Every sub-sample derives its own seed from it, so that
# Restart Kernel -> Run All reproduces exactly the same three datasets.
# (An unseeded np.random.randint() would yield different samples on every run.)
SAMPLING_BASE_SEED = 42

# Determine the class proportions and absolute counts in the master dataset
master_class_distribution = master_data[target_column].value_counts(normalize=True)
master_class_count = master_data[target_column].value_counts()
print("Master dataset class distribution (Proportion):")
print(master_class_distribution)
print("\n Master dataset class absolute count:")
print(master_class_count)

# Each derived dataset holds roughly one third of the master dataset's rows
num_samples = len(master_data) // 3

# Group data by target class
grouped = master_data.groupby(target_column)

# Create the 3 sub-sampled datasets.
# NOTE: each sub-sample is drawn independently from the full class groups,
# so the three datasets may share rows with one another.
sampled_datasets = {}
for i, dataset_name in enumerate(['sampled_data_1', 'sampled_data_2', 'sampled_data_3'], start=1):
    dataset_seed = SAMPLING_BASE_SEED + i

    # Draw from each class in proportion to its share of the master dataset
    class_samples = []
    for class_label, group in grouped:
        samples_to_take = int(master_class_distribution[class_label] * num_samples)
        class_samples.append(group.sample(n=samples_to_take, random_state=dataset_seed))

    # Concatenate once, then shuffle and reset the index to remove the
    # class-ordered layout left behind by the concatenation
    sampled_datasets[dataset_name] = (
        pd.concat(class_samples)
        .sample(frac=1, random_state=dataset_seed)
        .reset_index(drop=True)
    )

# Assign variables explicitly for easier access later on
sampled_data_1 = sampled_datasets['sampled_data_1']
sampled_data_2 = sampled_datasets['sampled_data_2']
sampled_data_3 = sampled_datasets['sampled_data_3']

# Print summaries of each dataset to verify class proportions and absolute counts
for name, data in sampled_datasets.items():
    print(f"\n{name} class distribution:")
    print(data[target_column].value_counts(normalize=True))
    print(f"\n{name} class absolute count:")
    print(data[target_column].value_counts())


Master dataset class distribution (Proportion):
hospital_death
0    0.913698
1    0.086302
Name: proportion, dtype: float64

 Master dataset class absolute count:
hospital_death
0    83798
1     7915
Name: count, dtype: int64

sampled_data_1 class distribution:
hospital_death
0    0.913706
1    0.086294
Name: proportion, dtype: float64

sampled_data_1 class absolute count:
hospital_death
0    27932
1     2638
Name: count, dtype: int64

sampled_data_2 class distribution:
hospital_death
0    0.913706
1    0.086294
Name: proportion, dtype: float64

sampled_data_2 class absolute count:
hospital_death
0    27932
1     2638
Name: count, dtype: int64

sampled_data_3 class distribution:
hospital_death
0    0.913706
1    0.086294
Name: proportion, dtype: float64

sampled_data_3 class absolute count:
hospital_death
0    27932
1     2638
Name: count, dtype: int64


### 2.2 Create class-imbalanced derived dataset using SMOTE-NC (Nominal Continuous)

In [10]:
# Desired ratios for the minority class, one per derived dataset
desired_minority_class_ratios = [0.10, 0.30, 0.50]

# Create empty dictionary for derived datasets
derived_datasets = {}

# Keep the encoder fitted on each sampled dataset: `binary_encoder` is rebound on
# every iteration, so without this only the last encoder would remain available
# for a later inverse_transform of the matching derived dataset.
binary_encoders = {}

# NOTE(review): over-sampling happens here, before the derived datasets are split
# into train/test sets later in this notebook, so synthetic rows can end up in the
# test sets. Confirm this is intended for the experiment design.
for i, (sampled_data, desired_minority_class_ratio) in enumerate(
    zip([sampled_data_1, sampled_data_2, sampled_data_3], desired_minority_class_ratios), start=1
):
    # Step 1: identify categorical columns of the sampled dataset
    categorical_columns = sampled_data.select_dtypes(include=["object"]).columns.tolist()

    # Step 2: apply binary encoding for use in SMOTE-NC
    binary_encoder = ce.BinaryEncoder(cols=categorical_columns)
    encoded_data = binary_encoder.fit_transform(sampled_data)
    binary_encoders[f'derived_data_{i}'] = binary_encoder

    # Step 3: separate features and target
    X = encoded_data.drop(columns=[target_column])
    y = encoded_data[target_column]

    # Step 4: locate the encoded categorical columns.
    # BinaryEncoder names its outputs "<original column>_<bit index>", so match that
    # exact shape; a plain prefix test would also flag any unrelated numeric column
    # whose name merely starts with a categorical column's name.
    categorical_names = set(categorical_columns)
    binary_categorical_indices = []
    for position, col in enumerate(X.columns):
        source_name, _, bit_suffix = col.rpartition('_')
        if source_name in categorical_names and bit_suffix.isdigit():
            binary_categorical_indices.append(position)

    # Step 5: minority count needed so that minority / (minority + majority)
    # equals the desired ratio, with the majority class left unchanged
    minority_class_count = y.value_counts().get(1, 0)
    majority_class_count = y.value_counts().get(0, 0)
    correct_sampling_strategy = int(majority_class_count * (desired_minority_class_ratio / (1 - desired_minority_class_ratio)))

    # Over-sampling can only add minority rows; fail early with a clear message
    # if the requested ratio would require removing some instead
    if correct_sampling_strategy < minority_class_count:
        raise ValueError(
            f"derived_data_{i}: a minority ratio of {desired_minority_class_ratio} needs "
            f"{correct_sampling_strategy} minority rows, but {minority_class_count} already exist; "
            "over-sampling cannot reduce the minority class."
        )

    # Step 6: apply SMOTE-NC with the defined sampling strategy
    smote_nc = SMOTENC(
        categorical_features=binary_categorical_indices,
        sampling_strategy={1: correct_sampling_strategy},
        random_state=42
    )
    X_resampled, y_resampled = smote_nc.fit_resample(X, y)

    # Step 7: combine resampled features and target into one derived dataset
    derived_data = pd.concat(
        [pd.DataFrame(X_resampled, columns=X.columns), pd.DataFrame(y_resampled, columns=[target_column])], axis=1
    )

    # Shuffle data and reset index to reduce data order bias after concatenation
    derived_data = derived_data.sample(frac=1, random_state=42).reset_index(drop=True)

    # Step 8: store the derived dataset
    derived_datasets[f'derived_data_{i}'] = derived_data

    # Print class distribution and absolute counts for verification
    print(f"\nClass distribution for derived_data_{i} (Minority class ratio: {int(desired_minority_class_ratio * 100)}%):")
    print(derived_data[target_column].value_counts(normalize=True))
    print(f"\nClass absolute count for derived_data_{i}:")
    print(derived_data[target_column].value_counts())

# Step 9: assign variables for derived datasets
derived_data_1 = derived_datasets['derived_data_1']
derived_data_2 = derived_datasets['derived_data_2']
derived_data_3 = derived_datasets['derived_data_3']



Class distribution for derived_data_1 (Minority class ratio: 10%):
hospital_death
0    0.900016
1    0.099984
Name: proportion, dtype: float64

Class absolute count for derived_data_1:
hospital_death
0    27932
1     3103
Name: count, dtype: int64

Class distribution for derived_data_2 (Minority class ratio: 30%):
hospital_death
0    0.700015
1    0.299985
Name: proportion, dtype: float64

Class absolute count for derived_data_2:
hospital_death
0    27932
1    11970
Name: count, dtype: int64

Class distribution for derived_data_3 (Minority class ratio: 50%):
hospital_death
1    0.5
0    0.5
Name: proportion, dtype: float64

Class absolute count for derived_data_3:
hospital_death
1    27932
0    27932
Name: count, dtype: int64


### 2.3 Create Train-Test split for derived datasets

In [11]:
# Containers that will hold the train and the test split of every derived dataset
train_sets = {}
test_sets = {}

# Function to stratify split on each derived dataset
def stratified_split(data, target_column, train_ratio=0.7, test_ratio=0.3):
    """
    Split a dataset into disjoint training and test sets, class by class, so
    that both sets keep the class distribution of the input.

    Each class is shuffled with a fixed seed; its first
    ``int(train_ratio * class_size)`` rows go to the training set. When the two
    ratios sum to 1 the test set receives all remaining rows of the class;
    otherwise it receives the next ``int(test_ratio * class_size)`` rows and the
    rest of the class is left out of both sets.

    Parameters:
    - data (DataFrame): The dataset to be split.
    - target_column (str): The name of the target column used for stratification.
    - train_ratio (float): Proportion of each class placed in the train set (default 0.7).
    - test_ratio (float): Proportion of each class placed in the test set (default 0.3).

    Returns:
    - train_data (DataFrame): stratified training set, shuffled, with a fresh index.
    - test_data (DataFrame): stratified test set, shuffled, with a fresh index.

    Raises:
    - ValueError: if a ratio lies outside [0, 1] or the ratios sum to more than 1.
    """
    tolerance = 1e-9
    if not (0 <= train_ratio <= 1 and 0 <= test_ratio <= 1):
        raise ValueError("train_ratio and test_ratio must both lie in [0, 1].")
    if train_ratio + test_ratio > 1 + tolerance:
        raise ValueError("train_ratio + test_ratio must not exceed 1.")

    # With complementary ratios the test set is simply "everything not in train",
    # which avoids losing a row per class to integer truncation.
    test_takes_remainder = abs(train_ratio + test_ratio - 1.0) <= tolerance

    train_parts, test_parts = [], []

    # Split class by class; slicing one shuffled group guarantees that no row
    # is shared between the two splits
    for _, group in data.groupby(target_column):
        group = group.sample(frac=1, random_state=42).reset_index(drop=True)
        train_size = int(train_ratio * len(group))
        if test_takes_remainder:
            test_end = len(group)
        else:
            test_end = train_size + int(test_ratio * len(group))
        train_parts.append(group.iloc[:train_size])
        test_parts.append(group.iloc[train_size:test_end])

    # Concatenate once instead of growing a frame inside the loop; an input
    # without any class group yields two empty frames with the input's columns
    train_data = pd.concat(train_parts) if train_parts else data.iloc[0:0]
    test_data = pd.concat(test_parts) if test_parts else data.iloc[0:0]

    # Shuffle each set to remove the class-ordered layout left by concatenation
    return (
        train_data.sample(frac=1, random_state=42).reset_index(drop=True),
        test_data.sample(frac=1, random_state=42).reset_index(drop=True),
    )

# Run the stratified split on every derived dataset and report the result
derived_frames = [derived_data_1, derived_data_2, derived_data_3]
for i, derived_data in enumerate(derived_frames, start=1):
    train_set, test_set = stratified_split(derived_data, target_column)

    # Keep both splits under their numbered keys
    train_sets[f'train_data_{i}'] = train_set
    test_sets[f'test_data_{i}'] = test_set

    # Verification output: class proportions and absolute counts per split
    train_target = train_set[target_column]
    print(f"\nClass distribution for train_data_{i}:")
    print(train_target.value_counts(normalize=True))
    print(f"\nClass absolute count for train_data_{i}:")
    print(train_target.value_counts())

    test_target = test_set[target_column]
    print(f"\nClass distribution for test_data_{i}:")
    print(test_target.value_counts(normalize=True))
    print(f"\nClass absolute count for test_data_{i}:")
    print(test_target.value_counts())

# Expose every split under its own variable name for later cells
train_data_1 = train_sets['train_data_1']
test_data_1 = test_sets['test_data_1']
train_data_2 = train_sets['train_data_2']
test_data_2 = test_sets['test_data_2']
train_data_3 = train_sets['train_data_3']
test_data_3 = test_sets['test_data_3']


Class distribution for train_data_1:
hospital_death
0    0.900018
1    0.099982
Name: proportion, dtype: float64

Class absolute count for train_data_1:
hospital_death
0    19552
1     2172
Name: count, dtype: int64

Class distribution for test_data_1:
hospital_death
0    0.900011
1    0.099989
Name: proportion, dtype: float64

Class absolute count for test_data_1:
hospital_death
0    8380
1     931
Name: count, dtype: int64

Class distribution for train_data_2:
hospital_death
0    0.700011
1    0.299989
Name: proportion, dtype: float64

Class absolute count for train_data_2:
hospital_death
0    19552
1     8379
Name: count, dtype: int64

Class distribution for test_data_2:
hospital_death
0    0.700025
1    0.299975
Name: proportion, dtype: float64

Class absolute count for test_data_2:
hospital_death
0    8380
1    3591
Name: count, dtype: int64

Class distribution for train_data_3:
hospital_death
0    0.5
1    0.5
Name: proportion, dtype: float64

Class absolute count for train_data

### Code to reverse encoding on categorical columns in training, validation and test datasets (delete before submission)

In [12]:

# decoded_train_data_1 = binary_encoder.inverse_transform(train_data_1)
# decoded_validation_data_1 = binary_encoder.inverse_transform(validation_data_1)
# decoded_test_data_1 = binary_encoder.inverse_transform(test_data_1)

# decoded_train_data_2 = binary_encoder.inverse_transform(train_data_2)
# decoded_validation_data_2 = binary_encoder.inverse_transform(validation_data_2)
# decoded_test_data_2 = binary_encoder.inverse_transform(test_data_2)

# decoded_train_data_3 = binary_encoder.inverse_transform(train_data_3)
# decoded_validation_data_3 = binary_encoder.inverse_transform(validation_data_3)
# decoded_test_data_3 = binary_encoder.inverse_transform(test_data_3)

# # Preview results
# print("\nDecoded Train Data 1 Preview:")
# print(train_data_1.head())
# print(decoded_train_data_1.head())
# train_data_1.to_csv('train_data_1.csv', index=False)
# decoded_train_data_1.to_csv('train_data_1_decoded.csv', index=False)


### Save into csv file

In [13]:
# Map every output file stem to the split it stores
dataframes = {
    'train_data_1': train_data_1,
    'test_data_1': test_data_1,
    'train_data_2': train_data_2,
    'test_data_2': test_data_2,
    'train_data_3': train_data_3,
    'test_data_3': test_data_3,
}

# Write each split to "<name>.csv" in the working directory, without the index column
for name in dataframes:
    dataframes[name].to_csv(f'{name}.csv', index=False)