In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import concurrent.futures


In [55]:
# Create a function to loads through all CSV files
def read_all_datasets(data_folder):
    
    datasets = {}

    csv_files = [
        'vitals_hourly.csv', 'admissions.csv', 'antibiotics.csv', 'bloodculture.csv',
        'gcs_hourly.csv', 'icd9_diag.csv', 'icustays.csv', 'labs_hourly.csv',
        'output_hourly.csv', 'patients.csv', 'pt_icu_outcome.csv', 'pt_stay_hr.csv',
        'pt_weight.csv', 'pv_mechvent.csv', 'transfers.csv', 'vasopressors.csv'
    ]

    for file in csv_files:
        file_path = os.path.join(data_folder, file)

        if os.path.exists(file_path):
            datasets[file.replace('.csv', '')] = pd.read_csv(file_path)
        else:
            print(f"File not found: {file}")

    return datasets

data_folder = '/Users/zacharylukac/Desktop/HDAT-9910-Capstone/mimic_data/'
all_datasets = read_all_datasets(data_folder)

# Assign all files to a dataframe for exploration
admissions_df = all_datasets['admissions']
vitals_hourly_df = all_datasets['vitals_hourly']
antibiotics_df = all_datasets['antibiotics']
bloodculture_df = all_datasets['bloodculture']
gcs_hourly_df = all_datasets['gcs_hourly']
icd9_diag_df = all_datasets['icd9_diag']
icustays_df = all_datasets['icustays']
labs_hourly_df = all_datasets['labs_hourly']
pt_stay_hr_df = all_datasets['pt_stay_hr']
pt_icu_outcome_df = all_datasets['pt_icu_outcome']
patients_df = all_datasets['patients']
output_hourly_df = all_datasets['output_hourly']
pt_weight_df = all_datasets['pt_weight']
pv_mechvent_df = all_datasets['pv_mechvent']
transfers_df = all_datasets['transfers']
vasopressors_df = all_datasets['vasopressors']

In [56]:
def convert_columns_to_32bit(dataframe, columns):
    for col in columns:
        if col in dataframe.columns:
            if dataframe[col].dtype == 'int64':
                dataframe[col] = dataframe[col].astype('int32')
            elif dataframe[col].dtype == 'float64':
                dataframe[col] = dataframe[col].astype('float32')
    return dataframe


vitals_hourly_columns = ['icustay_id', 'hr', 'spo2', 'fio2', 'temperature', 'resprate', 'heartrate', 'sysbp', 'diasbp', 'glucose', 'meanarterialpressure']
labs_hourly_columns = ['icustay_id', 'hr', 'neutrophil', 'creactiveprotein', 'whitebloodcell', 'partialpressureo2', 'bicarbonate', 'lactate', 'troponin', 'bloodureanitrogen', 'creatinine', 'alaninetransaminase', 'aspartatetransaminase', 'hemoglobin', 'intnormalisedratio', 'platelets', 'albumin', 'chloride', 'glucose', 'sodium', 'bilirubin', 'hematocrit']
pt_icu_outcome_columns = ['row_id', 'subject_id', 'dob', 'hadm_id', 'admittime', 'dischtime', 'icustay_id', 'age_years', 'intime', 'outtime', 'los', 'hosp_deathtime', 'icu_expire_flag', 'hospital_expire_flag', 'dod', 'expire_flag', 'ttd_days']
gcs_hourly_columns = ['icustay_id', 'hr', 'gcs', 'gcseyes', 'gcsmotor', 'gcsverbal', 'endotrachflag']
pt_weight_columns = ['icustay_id', 'dy','starttime','endtime', 'admissionweight', 'dailyweight','previousweight','echoweight', 'avg_weight_naive', 'min_weight', 'max_weight']
output_hourly_columns = ['icustay_id', 'hr','urineoutput']


vitals_hourly_df = convert_columns_to_32bit(vitals_hourly_df, vitals_hourly_columns)
labs_hourly_df = convert_columns_to_32bit(labs_hourly_df, labs_hourly_columns)
pt_icu_outcome_df = convert_columns_to_32bit(pt_icu_outcome_df, pt_icu_outcome_columns)
gcs_hourly_df = convert_columns_to_32bit(gcs_hourly_df, gcs_hourly_columns)
pt_weight_df = convert_columns_to_32bit(pt_weight_df, pt_weight_columns)
output_hourly_df = convert_columns_to_32bit(output_hourly_df, output_hourly_columns)

In [85]:

# Subset the relevant time window (first 24 hours)
labs_hourly_df_first_24 = labs_hourly_df[labs_hourly_df['hr'] <= 24]
gcs_hourly_df_first_24 = gcs_hourly_df[gcs_hourly_df['hr'] <= 24]
output_hourly_df_first_24 = output_hourly_df[output_hourly_df['hr'] <= 24]
vitals_hourly_df_first_24 = vitals_hourly_df[vitals_hourly_df['hr'] <= 24]
pt_weight_df_first_24 = pt_weight_df[pt_weight_df['dy'] <= 1]  # Assuming 'dy' is days

# Select only indicative features
labs_columns = ['icustay_id','hr', 'neutrophil', 'whitebloodcell', 'partialpressureo2', 'bicarbonate',
                'lactate', 'troponin', 'bloodureanitrogen', 'creatinine', 
                'hemoglobin', 'intnormalisedratio', 'platelets', 'albumin', 'chloride', 'glucose', 'sodium',
                'hematocrit']

gcs_columns = ['icustay_id', 'hr','gcs', 'gcseyes', 'gcsmotor', 'gcsverbal']

output_columns = ['icustay_id','hr', 'urineoutput']

vitals_columns = ['icustay_id', 'hr', 'spo2', 'fio2', 'temperature', 'resprate', 'heartrate', 'sysbp', 'glucose',
                  'meanarterialpressure']

pt_weight_columns = ['icustay_id', 'dy', 'admissionweight', 'dailyweight', 'previousweight', 'echoweight',
                     'avg_weight_naive', 'min_weight', 'max_weight']

# Select only the relevant columns from each DataFrame
labs_subset = labs_hourly_df_first_24[labs_columns]
gcs_subset = gcs_hourly_df_first_24[gcs_columns]
output_subset = output_hourly_df_first_24[output_columns]
vitals_subset = vitals_hourly_df_first_24[vitals_columns]
pt_weight_subset = pt_weight_df_first_24[pt_weight_columns]






In [86]:
pt_icu_outcome_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61533 entries, 0 to 61532
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   row_id                61533 non-null  int32  
 1   subject_id            61533 non-null  int32  
 2   dob                   61533 non-null  object 
 3   hadm_id               61533 non-null  int32  
 4   admittime             49185 non-null  object 
 5   dischtime             49185 non-null  object 
 6   icustay_id            61533 non-null  int32  
 7   age_years             61533 non-null  float32
 8   intime                61533 non-null  object 
 9   outtime               61523 non-null  object 
 10  los                   61523 non-null  float32
 11  hosp_deathtime        2277 non-null   object 
 12  icu_expire_flag       61533 non-null  int32  
 13  hospital_expire_flag  49185 non-null  float32
 14  dod                   24192 non-null  object 
 15  expire_flag        

In [110]:
# Merge based on common columns ('icustay_id' and 'hr')
merged_df = pd.merge(labs_hourly_sample[labs_columns], gcs_hourly_sample[gcs_columns], on=['icustay_id', 'hr'], how='left')
merged_df = pd.merge(merged_df, output_hourly_sample[output_columns], on=['icustay_id', 'hr'], how='left')
merged_df = pd.merge(merged_df, vitals_hourly_sample[vitals_columns], on=['icustay_id', 'hr'], how='left')
merged_df = pd.merge(merged_df, pt_weight_sample[pt_weight_columns], on=['icustay_id'], how='left')

# Merge with the outcome DataFrame to get 'expire_flag'
merged_df = pd.merge(merged_df, pt_icu_outcome_df[['icustay_id', 'expire_flag', 'age_years']], on='icustay_id', how='left')


In [111]:
merged_df.head()

Unnamed: 0,icustay_id,hr,neutrophil,whitebloodcell,partialpressureo2,bicarbonate,lactate,troponin,bloodureanitrogen,creatinine,...,dy,admissionweight,dailyweight,previousweight,echoweight,avg_weight_naive,min_weight,max_weight,expire_flag,age_years
0,215035,19,,,,,,,,,...,,,,,,,,,1,71.0
1,238217,-4,,7.6,,26.0,,,42.0,2.0,...,,,,,,,,,1,66.0
2,287284,20,,,,15.0,,,,,...,,,,,,,,,0,0.0
3,208582,20,,,96.0,,,,,,...,1.0,,59.099998,59.099998,,60.0,59.099998,60.900002,1,81.0
4,207388,8,,,72.0,,,,,,...,0.0,,,,99.790321,100.910126,99.790321,105.389351,0,91.400002


In [112]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 78160 entries, 0 to 78159
Data columns (total 41 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   icustay_id            78160 non-null  int32  
 1   hr                    78160 non-null  int32  
 2   neutrophil            10400 non-null  float32
 3   whitebloodcell        31774 non-null  float32
 4   partialpressureo2     36776 non-null  float32
 5   bicarbonate           32198 non-null  float32
 6   lactate               19389 non-null  float32
 7   troponin              6402 non-null   float32
 8   bloodureanitrogen     31705 non-null  float32
 9   creatinine            31819 non-null  float32
 10  hemoglobin            41252 non-null  float32
 11  intnormalisedratio    24643 non-null  float32
 12  platelets             33505 non-null  float32
 13  albumin               6240 non-null   float32
 14  chloride              37567 non-null  float32
 15  glucose_x          

In [9]:
# imputing missing values with the mean
merged_df.fillna(merged_df.mean(), inplace=True)

In [70]:
merged_df.isnull().sum()

icustay_id                     0
neutrophil              79120937
whitebloodcell          53850039
partialpressureo2       42035368
bicarbonate             52684669
lactate                 64551529
troponin                81556974
bloodureanitrogen       52716393
creatinine              52570899
hemoglobin              41715656
intnormalisedratio      58616823
platelets               51657136
albumin                 80905646
chloride                45487456
glucose_x               31851056
sodium                  53430429
hematocrit              45034361
gcs                       531800
gcseyes                   696814
gcsmotor                  921514
gcsverbal                 816683
urineoutput               752631
hr                           334
spo2                     4500957
fio2                    81828532
temperature             50978904
resprate                 6159466
heartrate                5055944
sysbp                    6582883
glucose_y               59023586
meanarteri

In [71]:

# Define the path where you want to save the CSV file
output_csv_path = '/Users/zacharylukac/Desktop/HDAT-9910-Capstone/sampled_df_24Hrs.csv'

# Save the merged DataFrame to CSV
merged_df.to_csv(output_csv_path, index=False)

In [26]:
from sklearn.preprocessing import StandardScaler

numerical_columns = [
    'neutrophil', 'creactiveprotein', 'whitebloodcell', 'partialpressureo2', 'bicarbonate',
    'lactate', 'troponin', 'bloodureanitrogen', 'creatinine', 'alaninetransaminase',
    'aspartatetransaminase', 'hemoglobin', 'intnormalisedratio', 'platelets',
    'albumin', 'chloride', 'glucose_x', 'sodium', 'bilirubin', 'hematocrit',
    'gcs', 'gcseyes', 'gcsmotor', 'gcsverbal', 'urineoutput',
    'hr', 'spo2', 'fio2', 'temperature', 'resprate', 'heartrate',
    'sysbp', 'diasbp', 'glucose_y', 'meanarterialpressure',
    'dy', 'admissionweight', 'dailyweight', 'previousweight', 'echoweight',
    'avg_weight_naive', 'min_weight', 'max_weight'
]

# Select only the numerical columns
numerical_data = merged_df[numerical_columns]

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the numerical data
scaled_data = scaler.fit_transform(numerical_data)

# Create a new DataFrame with the scaled data
scaled_df = pd.DataFrame(scaled_data, columns=numerical_columns)

# Combine the scaled numerical data with non-numerical columns (icustay_id, expire_flag)
scaled_df[['icustay_id', 'expire_flag']] = merged_df[['icustay_id', 'expire_flag']]

# scaled_df contains the scaled numerical features along with icustay_id and expire_flag


In [29]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Keep all relevant columns
X = scaled_df.drop(['expire_flag'], axis=1)
y = scaled_df['expire_flag']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize a RandomForestClassifier 
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Confusion Matrix:
[[238718      4]
 [    26 193330]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    238722
           1       1.00      1.00      1.00    193356

    accuracy                           1.00    432078
   macro avg       1.00      1.00      1.00    432078
weighted avg       1.00      1.00      1.00    432078



In [33]:
model.score(X_test, y_test)

0.9999305680918723

In [31]:
merged_df.expire_flag.value_counts()

0    1193592
1     966797
Name: expire_flag, dtype: int64