## HDAT 9910 Capstone


Research Question 1: Mortality Prediction in ICU 

Task: The task is to build a predictive algorithm using the techniques we learned in this course. 

Objective: To assess the role of machine learning algorithms for predicting mortality by using the MIMIC-III dataset. 

Question: Is it possible to accurately predict mortality based on data from the first 24 hours in ICU?   

Study Population: MIMIC-III dataset 

#### Load packages


In [415]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import os
import concurrent.futures
warnings.filterwarnings("ignore")

#### Create a function that loads all CSV files


In [443]:
def read_all_datasets(data_folder):
    """Read the fixed list of project CSV files from *data_folder*.

    Returns a dict keyed by file name without the ``.csv`` suffix, each value
    being the DataFrame read from that file. A file that does not exist is
    reported on stdout and left out of the result.
    """
    expected_files = (
        'vitals_hourly.csv', 'admissions.csv', 'antibiotics.csv', 'bloodculture.csv',
        'gcs_hourly.csv', 'icd9_diag.csv', 'icustays.csv', 'labs_hourly.csv',
        'output_hourly.csv', 'patients.csv', 'pt_icu_outcome.csv', 'pt_stay_hr.csv',
        'pt_weight.csv', 'pv_mechvent.csv', 'transfers.csv', 'vasopressors.csv',
    )

    loaded = {}
    for csv_name in expected_files:
        csv_path = os.path.join(data_folder, csv_name)

        # Guard clause: report the missing file and move on to the next one.
        if not os.path.exists(csv_path):
            print(f"File not found: {csv_name}")
            continue

        loaded[csv_name.replace('.csv', '')] = pd.read_csv(csv_path)

    return loaded

# Raw string: the path mixes backslashes and forward slashes, and sequences
# such as "\l", "\O", "\D" and "\H" are invalid escapes in a normal literal.
# The resulting path value is unchanged.
data_folder = r'/Users\lukac\OneDrive\Desktop\HDAT-9910-Capstone/mimic_data/'
all_datasets = read_all_datasets(data_folder)

# Bind every loaded table to its own DataFrame variable for exploration.
# A file that read_all_datasets reported as missing has no entry in
# all_datasets, so its lookup below raises KeyError.
admissions_df = all_datasets['admissions']
vitals_hourly_df = all_datasets['vitals_hourly']
antibiotics_df = all_datasets['antibiotics']
bloodculture_df = all_datasets['bloodculture']
gcs_hourly_df = all_datasets['gcs_hourly']
icd9_diag_df = all_datasets['icd9_diag']
icustays_df = all_datasets['icustays']
labs_hourly_df = all_datasets['labs_hourly']
pt_stay_hr_df = all_datasets['pt_stay_hr']
pt_icu_outcome_df = all_datasets['pt_icu_outcome']
patients_df = all_datasets['patients']
output_hourly_df = all_datasets['output_hourly']
pt_weight_df = all_datasets['pt_weight']
pv_mechvent_df = all_datasets['pv_mechvent']
transfers_df = all_datasets['transfers']
vasopressors_df = all_datasets['vasopressors']

In [444]:
# Clean admissions_df: impute missing values, de-duplicate, drop unused columns.
#
# The fillna result is assigned back to the column. Calling
# fillna(inplace=True) on admissions_df[col] is a chained in-place update: it
# triggers a FutureWarning on pandas 2.x and leaves admissions_df unchanged
# once Copy-on-Write is active.

# Numerical columns: fill missing values with the column median
for col in admissions_df.select_dtypes(include=['float64', 'int64']).columns:
    admissions_df[col] = admissions_df[col].fillna(admissions_df[col].median())

# Categorical (object) columns: fill missing values with the most frequent value
for col in admissions_df.select_dtypes(include=['object']).columns:
    admissions_df[col] = admissions_df[col].fillna(admissions_df[col].mode()[0])

# Remove duplicate rows (compared across all columns, before any are dropped)
admissions_df.drop_duplicates(inplace=True)
admissions_df = admissions_df.drop(columns =[
    'hadm_id', 'marital_status','religion','language',
    'ethnicity','dischtime','edregtime', 
     'hospital_expire_flag','has_chartevents_data',
     'row_id','edouttime', 'discharge_location','deathtime'])

In [445]:
# Clean patients_df: impute missing values, de-duplicate, drop unused columns.
#
# The fillna result is assigned back to the column. Calling
# fillna(inplace=True) on patients_df[col] is a chained in-place update: it
# triggers a FutureWarning on pandas 2.x and leaves patients_df unchanged
# once Copy-on-Write is active.

# Numerical columns: fill missing values with the column median
for col in patients_df.select_dtypes(include=['float64', 'int64']).columns:
    patients_df[col] = patients_df[col].fillna(patients_df[col].median())

# Categorical (object) columns: fill missing values with the most frequent value
for col in patients_df.select_dtypes(include=['object']).columns:
    patients_df[col] = patients_df[col].fillna(patients_df[col].mode()[0])

# Remove duplicate rows (compared across all columns, before any are dropped)
patients_df.drop_duplicates(inplace=True)
patients_df = patients_df.drop(columns =[
    'dod_ssn', 'dob','dod_hosp', 'row_id', 'dod'])

In [446]:
# Clean vitals_hourly_df: impute missing values, de-duplicate, drop 'fio2'.
#
# The fillna result is assigned back to the column. Calling
# fillna(inplace=True) on vitals_hourly_df[col] is a chained in-place update:
# it triggers a FutureWarning on pandas 2.x and leaves vitals_hourly_df
# unchanged once Copy-on-Write is active.

# Numerical columns: fill missing values with the column median
for col in vitals_hourly_df.select_dtypes(include=['float64', 'int64']).columns:
    vitals_hourly_df[col] = vitals_hourly_df[col].fillna(vitals_hourly_df[col].median())

# Categorical (object) columns: fill missing values with the most frequent value
for col in vitals_hourly_df.select_dtypes(include=['object']).columns:
    vitals_hourly_df[col] = vitals_hourly_df[col].fillna(vitals_hourly_df[col].mode()[0])

# Remove duplicate rows (compared across all columns, before 'fio2' is dropped)
vitals_hourly_df.drop_duplicates(inplace=True)
vitals_hourly_df = vitals_hourly_df.drop(columns =['fio2'])

In [447]:
# Clean gcs_hourly_df: impute missing values, then de-duplicate.
#
# The fillna result is assigned back to the column. Calling
# fillna(inplace=True) on gcs_hourly_df[col] is a chained in-place update: it
# triggers a FutureWarning on pandas 2.x and leaves gcs_hourly_df unchanged
# once Copy-on-Write is active.

# Numerical columns: fill missing values with the column median
for col in gcs_hourly_df.select_dtypes(include=['float64', 'int64']).columns:
    gcs_hourly_df[col] = gcs_hourly_df[col].fillna(gcs_hourly_df[col].median())

# Categorical (object) columns: fill missing values with the most frequent value
for col in gcs_hourly_df.select_dtypes(include=['object']).columns:
    gcs_hourly_df[col] = gcs_hourly_df[col].fillna(gcs_hourly_df[col].mode()[0])

# Remove duplicate rows
gcs_hourly_df.drop_duplicates(inplace=True)

In [448]:
# Clean labs_hourly_df: impute missing values, de-duplicate, drop unused labs.
#
# The fillna result is assigned back to the column. Calling
# fillna(inplace=True) on labs_hourly_df[col] is a chained in-place update: it
# triggers a FutureWarning on pandas 2.x and leaves labs_hourly_df unchanged
# once Copy-on-Write is active.

# Numerical columns: fill missing values with the column median
for col in labs_hourly_df.select_dtypes(include=['float64', 'int64']).columns:
    labs_hourly_df[col] = labs_hourly_df[col].fillna(labs_hourly_df[col].median())

# Categorical (object) columns: fill missing values with the most frequent value
for col in labs_hourly_df.select_dtypes(include=['object']).columns:
    labs_hourly_df[col] = labs_hourly_df[col].fillna(labs_hourly_df[col].mode()[0])

# Remove duplicate rows (compared across all columns, before any are dropped)
labs_hourly_df.drop_duplicates(inplace=True)
labs_hourly_df = labs_hourly_df.drop(columns =[
    'alaninetransaminase', 'aspartatetransaminase', 'albumin', 'bilirubin','glucose'])

In [449]:
# Clean pt_icu_outcome_df: impute missing values, de-duplicate, drop unused
# columns.
#
# The fillna result is assigned back to the column. Calling
# fillna(inplace=True) on pt_icu_outcome_df[col] is a chained in-place update:
# it triggers a FutureWarning on pandas 2.x and leaves pt_icu_outcome_df
# unchanged once Copy-on-Write is active.

# Numerical columns: fill missing values with the column median
for col in pt_icu_outcome_df.select_dtypes(include=['float64', 'int64']).columns:
    pt_icu_outcome_df[col] = pt_icu_outcome_df[col].fillna(pt_icu_outcome_df[col].median())

# Categorical (object) columns: fill missing values with the most frequent value
for col in pt_icu_outcome_df.select_dtypes(include=['object']).columns:
    pt_icu_outcome_df[col] = pt_icu_outcome_df[col].fillna(pt_icu_outcome_df[col].mode()[0])

# Remove duplicate rows (compared across all columns, before any are dropped)
pt_icu_outcome_df.drop_duplicates(inplace=True)
pt_icu_outcome_df = pt_icu_outcome_df.drop(columns =[
    'hadm_id', 'icu_expire_flag', 'row_id','dob', 'admittime', 'dischtime',
    'ttd_days','dod','hospital_expire_flag', 'intime', 'los','hosp_deathtime',
    'outtime'])

In [450]:
# Clean icustays_df: impute missing values, de-duplicate, drop unused columns.
#
# The fillna result is assigned back to the column. Calling
# fillna(inplace=True) on icustays_df[col] is a chained in-place update: it
# triggers a FutureWarning on pandas 2.x and leaves icustays_df unchanged
# once Copy-on-Write is active.

# Numerical columns: fill missing values with the column median
for col in icustays_df.select_dtypes(include=['float64', 'int64']).columns:
    icustays_df[col] = icustays_df[col].fillna(icustays_df[col].median())

# Categorical (object) columns: fill missing values with the most frequent value
for col in icustays_df.select_dtypes(include=['object']).columns:
    icustays_df[col] = icustays_df[col].fillna(icustays_df[col].mode()[0])

# Remove duplicate rows (compared across all columns, before any are dropped)
icustays_df.drop_duplicates(inplace=True)
# 'intime' was listed twice in the original drop list; once is enough.
icustays_df = icustays_df.drop(columns =[
    'row_id', 'dbsource', 'hadm_id','first_careunit', 'last_careunit', 
    'first_wardid','last_wardid','intime','outtime', 'los'])

In [451]:
# Keep only vitals recorded between hour 0 and hour 24 (both bounds inclusive).
# A single mask built from vitals_hourly_df is applied to vitals_hourly_df
# itself. The original second filter indexed the already-filtered frame with a
# mask computed on the unfiltered one, which relies on pandas re-aligning a
# mismatched boolean index.
in_first_24h = vitals_hourly_df['hr'].between(0, 24)
vitals_first_24h = vitals_hourly_df[in_first_24h]

# Aggregate every remaining column per ICU stay (mean, min, max)
vitals_agg = vitals_first_24h.groupby('icustay_id').agg(['mean', 'min', 'max']).reset_index()

# Flatten the (column, statistic) MultiIndex to '<column>_<statistic>' names,
# leaving the grouping key as plain 'icustay_id'
vitals_agg.columns = ['_'.join(col).rstrip('_') if col[0] != 'icustay_id' else col[0] for col in vitals_agg.columns]


In [452]:
# Restrict the hourly labs to rows with 0 <= hr <= 24
labs_first_24h = labs_hourly_df[labs_hourly_df['hr'].between(0, 24)]

# Per ICU stay, summarise every remaining column by its mean, min and max
labs_agg = labs_first_24h.groupby('icustay_id').agg(['mean', 'min', 'max']).reset_index()

# Collapse the two-level column labels into '<column>_<statistic>' strings;
# the grouping key keeps its plain name
labs_agg.columns = [
    name if name == 'icustay_id' else f'{name}_{stat}'.rstrip('_')
    for name, stat in labs_agg.columns
]


In [453]:
# Restrict the hourly GCS rows to 0 <= hr <= 24
gcs_hr = gcs_hourly_df['hr']
gcs_first_24h = gcs_hourly_df[(gcs_hr <= 24) & (gcs_hr >= 0)]

# Per ICU stay, summarise every remaining column by its mean, min and max
gcs_by_stay = gcs_first_24h.groupby('icustay_id')
gcs_agg = gcs_by_stay.agg(['mean', 'min', 'max']).reset_index()

# Collapse the two-level column labels into '<column>_<statistic>' strings;
# the grouping key keeps its plain name
flat_names = []
for name, stat in gcs_agg.columns:
    if name == 'icustay_id':
        flat_names.append(name)
    else:
        flat_names.append(f'{name}_{stat}'.rstrip('_'))
gcs_agg.columns = flat_names


In [454]:
# Reduce admissions_df to one row per subject_id. Rows are sorted by
# subject_id then admittime and keep='last' retains the final row of each
# subject, i.e. the one with the greatest admittime (the latest admission,
# not the earliest).
# NOTE(review): if the earliest admission was intended, this needs keep='first'.
# NOTE(review): admittime does not appear to be parsed to datetime before this
# point, so the sort presumably compares strings - confirm the timestamp
# format sorts chronologically.
aggregated_admissions = admissions_df.sort_values(by=['subject_id', 'admittime']).drop_duplicates('subject_id', keep='last')



In [455]:
# Join each ICU stay to its subject's retained admission row (inner join, so
# stays without a matching admission row are dropped), then left-join the
# patient-level columns on the same key
merged_df = icustays_df.merge(aggregated_admissions, how='inner', on='subject_id')
merged_df = merged_df.merge(patients_df, how='left', on='subject_id')

In [456]:
# Replace any missing ICU stay id with 0 so the column can be cast to int
icustay_ids = merged_df['icustay_id'].fillna(0)
merged_df['icustay_id'] = icustay_ids.astype(int)

In [457]:
# Left-join, in this order, the aggregated vitals, labs and GCS features and
# then the ICU outcomes onto the stays, all keyed by icustay_id. Column names
# that collide between frames receive pandas' default _x/_y suffixes.
for per_stay_df in (vitals_agg, labs_agg, gcs_agg, pt_icu_outcome_df):
    merged_df = merged_df.merge(per_stay_df, on='icustay_id', how='left')


In [458]:
# Remove the suffixed duplicate/helper columns left behind by the merges
# NOTE(review): the GCS aggregate's hr_mean / hr_min / hr_max columns look like
# they arrive unsuffixed and are not listed here, so they presumably stay in
# merged_df - confirm that is intended.
unneeded_columns = [
    'subject_id_x', 'hr_mean_x', 'hr_min_x', 'hr_max_x',
    'hr_mean_y', 'hr_min_y', 'hr_max_y',
    'subject_id_y', 'expire_flag_y',
]
merged_df = merged_df.drop(columns=unneeded_columns)

In [459]:
# Encode 'gender' as binary: 'M' -> 1, 'F' -> 0 (any other value becomes NaN)
gender_codes = {'M': 1, 'F': 0}
merged_df['gender'] = merged_df['gender'].map(gender_codes)


In [460]:
# Give the outcome and age columns shorter, self-explanatory names
friendly_names = {'expire_flag_x': 'mortality', 'age_years'  : 'age'}
merged_df = merged_df.rename(columns=friendly_names)


In [462]:
# Independent deep copy of merged_df as it stands at this point
merged_df_weeked = merged_df.copy(deep=True)

In [463]:
# Names of the columns in merged_df with object or category dtype
categorical_frame = merged_df.select_dtypes(include=['object', 'category'])
categorical_cols = categorical_frame.columns


In [464]:
def reduce_and_encode_top_categories(df, col, top_n=300):
    """One-hot encode ``df[col]`` after collapsing its rare values to 'Other'.

    The ``top_n`` most frequent values of ``df[col]`` keep their own indicator
    column. Every other value is replaced by ``'Other'`` before encoding; this
    includes missing values, which ``value_counts`` does not count.

    Args:
        df: DataFrame containing ``col``. It is not modified. The original
            implementation added a stray ``reduced_<col>`` column to the
            caller's frame as a side effect.
        col: Name of the column to reduce and encode.
        top_n: Number of most frequent categories that keep their own
            indicator column.

    Returns:
        A new DataFrame with all columns of ``df`` (``col`` itself included)
        followed by one ``reduced_<col>_<category>`` indicator column per
        distinct value of the reduced column.
    """
    # Find the top n most frequent categories in the column
    top_categories = df[col].value_counts().nlargest(top_n).index

    # Vectorised replacement: keep values that are in the top n, else 'Other'
    reduced_col_name = f'reduced_{col}'
    reduced_values = df[col].where(df[col].isin(top_categories), 'Other')

    # assign() returns a new frame, so the caller's df is left untouched;
    # get_dummies then replaces the temporary column with its indicators.
    return pd.get_dummies(
        df.assign(**{reduced_col_name: reduced_values}),
        columns=[reduced_col_name],
        prefix=reduced_col_name,
    )

In [465]:
# One-hot encode each categorical admission column (at most 300 categories
# each, the rest pooled as 'Other'), then remove the raw text columns
columns_to_process = ['admission_type', 'admission_location', 'insurance','diagnosis']
for col in columns_to_process:
    merged_df = reduce_and_encode_top_categories(merged_df, col, top_n=300)
merged_df = merged_df.drop(columns=columns_to_process)


In [466]:
# Parse 'admittime' into datetimes
admit_timestamps = pd.to_datetime(merged_df_weeked['admittime'])
merged_df_weeked['admittime'] = admit_timestamps

# Day of the week of each admission (0=Monday, 6=Sunday)
merged_df_weeked['day_of_week'] = admit_timestamps.dt.dayofweek

# Flag admissions that fall on Saturday (5) or Sunday (6) with 1, others with 0
is_weekend = merged_df_weeked['day_of_week'].isin([5, 6])
merged_df_weeked['weekend_admission'] = is_weekend.astype('int64')


In [467]:
# Remove the raw admission timestamp from merged_df
merged_df.drop(columns=['admittime'], inplace=True)

In [468]:
# Remove the timestamp and the intermediate day-of-week column, keeping only
# the derived weekend_admission flag
merged_df_weeked.drop(columns=['admittime', 'day_of_week'], inplace=True)

In [441]:
# Save merged_df to CSV (without the index) so other scripts can load it.
# Raw string: "\l", "\O", "\D" and "\H" are invalid escape sequences in a
# normal literal; the resulting path value is unchanged.
output_csv_path = r'/Users\lukac\OneDrive\Desktop\HDAT-9910-Capstone/df_24Hrs.csv'
merged_df.to_csv(output_csv_path, index=False)

In [442]:
# Save dataframe so we can load it into our other scripts
#output_csv_path = '/Users\lukac\OneDrive\Desktop\HDAT-9910-Capstone/df_weekend.csv'
#merged_df_weeked.to_csv(output_csv_path, index=False)

In [470]:
# Save merged_df_weeked to CSV (without the index) so other scripts can load it.
# Raw string: "\l", "\O", "\D" and "\H" are invalid escape sequences in a
# normal literal; the resulting path value is unchanged.
output_csv_path = r'/Users\lukac\OneDrive\Desktop\HDAT-9910-Capstone/df_weekend_R.csv'
merged_df_weeked.to_csv(output_csv_path, index=False)