# Data Preprocessing

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### 1.1 Loading the Dataset

#### 1.1.1 Reloading the Dataset

In [56]:
# Load 'patient.csv' into a DataFrame.
df_basic = pd.read_csv(filepath_or_buffer='patient.csv')

# Preview the first six rows to confirm the file parsed as expected.
df_basic.head(n=6)

Unnamed: 0,encounter_id,patient_id,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,icu_admit_source,...,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem,Unnamed: 83,hospital_death
0,66154,25312,118,68.0,22.73,0,Caucasian,M,180.3,Floor,...,1.0,0.0,0.0,0.0,0.0,0.0,Sepsis,Cardiovascular,,0
1,114252,59342,81,77.0,27.42,0,Caucasian,F,160.0,Floor,...,1.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory,,0
2,119783,50777,118,25.0,31.95,0,Caucasian,F,172.7,Accident & Emergency,...,0.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic,,0
3,79267,46918,118,81.0,22.64,1,Caucasian,F,165.1,Operating Room / Recovery,...,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular,,0
4,92056,34377,33,19.0,,0,Caucasian,M,188.0,Accident & Emergency,...,0.0,0.0,0.0,0.0,0.0,0.0,Trauma,Trauma,,0
5,33181,74489,83,67.0,27.56,0,Caucasian,M,190.5,Accident & Emergency,...,1.0,0.0,0.0,0.0,0.0,0.0,Neurological,Neurologic,,0


#### 1.1.2 Removing duplicate rows and dropping completely empty columns

In [58]:
# Clean-up in one pass: discard exact duplicate rows, then remove every
# column in which all values are missing.
df_basic = (
    df_basic
    .drop_duplicates()
    .dropna(axis='columns', how='all')
)

#### 1.1.3 Checking the updated structure

In [59]:
# Re-inspect the first six rows after de-duplication and empty-column removal.
df_basic.head(n=6)

Unnamed: 0,encounter_id,patient_id,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,icu_admit_source,...,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem,hospital_death
0,66154,25312,118,68.0,22.73,0,Caucasian,M,180.3,Floor,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Sepsis,Cardiovascular,0
1,114252,59342,81,77.0,27.42,0,Caucasian,F,160.0,Floor,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory,0
2,119783,50777,118,25.0,31.95,0,Caucasian,F,172.7,Accident & Emergency,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic,0
3,79267,46918,118,81.0,22.64,1,Caucasian,F,165.1,Operating Room / Recovery,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular,0
4,92056,34377,33,19.0,,0,Caucasian,M,188.0,Accident & Emergency,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Trauma,Trauma,0
5,33181,74489,83,67.0,27.56,0,Caucasian,M,190.5,Accident & Emergency,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Neurological,Neurologic,0


### 1.2 Missing Value Analysis

#### 1.2.1 Calculating Missing Values and Their Percentages

In [60]:
# Per-column count of missing entries, and the same count expressed as a
# percentage of the number of rows.
missing_values = df_basic.isna().sum()
missing_percentage = missing_values.div(len(df_basic)).mul(100)

# Assemble the report (indexed by column name, with the name repeated in a
# regular "Column" field) and order it from most to least missing.
missing_data_report = pd.DataFrame(
    {
        "Column": df_basic.columns,
        "Missing Values": missing_values,
        "Missing Percentage (%)": missing_percentage,
    }
)
missing_data_report = missing_data_report.sort_values(
    by="Missing Percentage (%)", ascending=False
)

# Show only the columns that actually contain missing values.
missing_data_report.loc[missing_data_report["Missing Values"].gt(0)]

Unnamed: 0,Column,Missing Values,Missing Percentage (%)
d1_potassium_max,d1_potassium_max,9585,10.451081
d1_potassium_min,d1_potassium_min,9585,10.451081
h1_mbp_noninvasive_max,h1_mbp_noninvasive_max,9084,9.904812
h1_mbp_noninvasive_min,h1_mbp_noninvasive_min,9084,9.904812
apache_4a_icu_death_prob,apache_4a_icu_death_prob,7947,8.665075
...,...,...,...
d1_sysbp_min,d1_sysbp_min,159,0.173367
d1_heartrate_max,d1_heartrate_max,145,0.158102
d1_heartrate_min,d1_heartrate_min,145,0.158102
icu_admit_source,icu_admit_source,112,0.122120


#### 1.2.2 Handling Missing Values

In [61]:
# Strategy 1: Drop columns with more than 50% missing values.
# Relies on `missing_data_report` built in the missing-value analysis cell.
threshold = 50  # Percentage threshold for dropping columns
columns_to_drop = missing_data_report.loc[
    missing_data_report["Missing Percentage (%)"] > threshold, "Column"
]
# errors='ignore' keeps this cell re-runnable: on a second run the columns
# listed in the (now stale) report are already gone and must not raise KeyError.
df_basic = df_basic.drop(columns=columns_to_drop, errors='ignore')

# Strategy 2: Impute missing values for numeric columns with the column mean.
# Passing the Series of per-column means to fillna aligns on column name, so
# each column is filled with its own mean without a Python-level apply.
# NOTE(review): this also mean-fills any 0/1 indicator columns stored as float
# (e.g. diabetes_mellitus), which yields fractional values - confirm intended.
numeric_columns = df_basic.select_dtypes(include=["float64", "int64"]).columns
df_basic[numeric_columns] = df_basic[numeric_columns].fillna(
    df_basic[numeric_columns].mean()
)


def _fill_with_mode(col):
    """Fill missing entries of `col` with its first mode.

    Returns `col` unchanged when it has no non-null values, because `mode()`
    is then empty and there is nothing to fill with.
    """
    modes = col.mode()
    if modes.empty:
        return col
    return col.fillna(modes.iloc[0])


# Strategy 3: Impute missing values for categorical columns with mode
categorical_columns = df_basic.select_dtypes(include=["object"]).columns
df_basic[categorical_columns] = df_basic[categorical_columns].apply(_fill_with_mode)

# Check for remaining missing values
remaining_missing_values = df_basic.isnull().sum().sum()

# Display the updated dataset structure. info() prints its summary and returns
# None, so it is called on its own instead of being packed into a tuple
# (which rendered as "(None, 0)").
df_basic.info()

# Last expression of the cell: total number of values that are still missing.
remaining_missing_values

<class 'pandas.core.frame.DataFrame'>
Int64Index: 91713 entries, 0 to 91712
Data columns (total 84 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   encounter_id                   91713 non-null  int64  
 1   patient_id                     91713 non-null  int64  
 2   hospital_id                    91713 non-null  int64  
 3   age                            91713 non-null  float64
 4   bmi                            91713 non-null  float64
 5   elective_surgery               91713 non-null  int64  
 6   ethnicity                      91713 non-null  object 
 7   gender                         91713 non-null  object 
 8   height                         91713 non-null  float64
 9   icu_admit_source               91713 non-null  object 
 10  icu_id                         91713 non-null  int64  
 11  icu_stay_type                  91713 non-null  object 
 12  icu_type                       91713 non-null 

(None, 0)

### 1.3 Identifying and removing non-relevant features

In [62]:
# Columns judged non-related on the basis of domain knowledge.
non_related_columns = [
    'encounter_id',
    'patient_id',
    'hospital_id',
    'Unnamed: 83',
    'icu_id',
]

# errors='ignore' skips any listed name that is not present in the frame.
df_basic = df_basic.drop(non_related_columns, axis='columns', errors='ignore')