# Exploratory Data Analysis - MIMIC-IV Dataset in PostgreSQL

The notebook has been implemented using Python 3.10.11.  
The MIMIC-IV v2.2 dataset has been loaded into PostgreSQL server running PostgreSQL 15.2 (Ubuntu 15.2-1.pgdg22.04+1).  
We suggest creating a virtual environment for this notebook.  
You need to install the following packages to run this notebook:

| Package Name | License                                                                                                                 | Documentation                           |
|--------------|-------------------------------------------------------------------------------------------------------------------------|-----------------------------------------|
| psycopg2     | [![License: LGPL v3](https://img.shields.io/badge/License-LGPL_v3-blue.svg)](https://www.gnu.org/licenses/lgpl-3.0)     | [Docs](https://www.psycopg.org/)        |
| pandas       | [![License](https://img.shields.io/badge/License-BSD_3--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause) | [Docs](https://pandas.pydata.org/)      |
| numpy        | [![License](https://img.shields.io/badge/License-BSD_3--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause) | [Docs](https://numpy.org/)              |
| seaborn      | [![License](https://img.shields.io/badge/License-BSD_3--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause) | [Docs](https://seaborn.pydata.org/)     |
| scipy        | [![License](https://img.shields.io/badge/License-BSD_3--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause) | [Docs](https://scipy.org/)              |
| tomli        | [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)               | [Docs](https://github.com/hukkin/tomli) |
| tqdm         | [![License](https://img.shields.io/pypi/l/tqdm.svg)](https://github.com/tqdm/tqdm/blob/master/LICENCE)                  | [Docs](https://tqdm.github.io/)         |
| matplotlib   | [(BSD-compatible, PSF-based)](https://matplotlib.org/stable/users/project/license.html)                                 | [Docs](https://matplotlib.org/)         |


In [12]:
"""Update pip and install requirements."""
%pip install --upgrade pip
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [13]:
"""Relevant imports for EDA; setup and styling."""

# data manipulation
import numpy as np
import pandas as pd

# data vizualisation
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib.gridspec import GridSpec

# tqdm for progress bars
from tqdm import tqdm

# default styling for plots
plt.style.use("ggplot")  # gnuplot style
rcParams["figure.figsize"] = 12, 6  # figure size
from matplotlib.colors import ListedColormap

# hls colormap for sns styled pie charts using matplotlib
hls = ListedColormap(sns.color_palette("hls").as_hex())


  

In [14]:
"""Functions for database connection, query execution, dataframe plotting."""

import tomli as toml
import psycopg2 as pg
from typing import Any


def read_config(path: str) -> dict:
    """Read config file and return config dict."""
    with open(path, "rb") as f:
        config = toml.load(f)["database"]
    return config


def connect_to_db(config: dict) -> Any:
    """Connect to database and return connection object."""
    conn = pg.connect(**config)
    cur = conn.cursor()
    return conn, cur


def read_sql(path: str) -> str:
    """Read SQL file and returns string"""
    with open(path, "r") as f:
        sql = f.read()
    return sql


def sql_to_df(path: str, params: dict = None) -> pd.DataFrame:
    """Read SQL file, execute query and return pandas DataFrame.
    
    Optionally, pass parameters to query using the params dict.
    """
    if params is None:
        params = {}
    conn, cur = connect_to_db(read_config("./config.toml"))
    cur.execute(read_sql(path), params)
    df = pd.DataFrame(cur.fetchall(), columns=[desc[0] for desc in cur.description])
    conn.close()
    return df


def plot_corr_matrix(
    df: pd.DataFrame,
    method: str = "pearson",
    title: str="",
    figsize=(10, 5),
    linewidth=0.3,
    fmt=".2f",
    annot_kws={"size": 10},
    cmap="Spectral_r",
    cbar=True,
    ax=None,
    cbar_kws={"shrink": 0.8},
) -> None:
    """Plot heatmap of correlation matrix."""
    # set figure size
    if ax is None:
        plt.subplots(figsize=figsize)
    corr = df.corr(method)
    sns.heatmap(
        corr,
        cbar=cbar,  # show color bar? yes/no
        annot=True,  # show numbers in cells? yes/no
        square=True,  # square cells? yes/no
        linewidths=linewidth,  # linewidth between cells
        fmt=fmt,  # precision
        annot_kws=annot_kws,  # size of numbers in cells
        yticklabels=df.columns,  # y-axis labels
        xticklabels=df.columns,  # x-axis labels
        cmap=cmap,  # color palette
        ax=ax,  # axes object
        cbar_kws=cbar_kws,  # shrink color bar
    )
    if title:
        plt.title(title)
    if ax is None:
        plt.show()


def plot_boxplot_grid(df: pd.DataFrame, target: str) -> None:
    """Plot boxplots of multiple columns against a single target variable."""
    # calculate number of rows and columns
    n_cols = int(np.ceil(np.sqrt(len(df.columns) - 1)))
    n_rows = int(np.ceil((len(df.columns) - 1) / n_cols))
    # create figure and axes
    fig, axes = plt.subplots(
        nrows=n_rows, ncols=n_cols, figsize=(n_cols * 6, n_rows * 5)
    )
    # iterate over columns, rows and create boxplots
    for col, ax in zip(df.columns.drop(target), axes.flatten()):
        sns.boxplot(x=target, y=col, data=df, ax=ax)
        # set title to column name vs. target
        ax.set_title(f"{col} vs. {target}")
    plt.show()


def plot_corr_matrix_diff(
    df_one: pd.DataFrame,
    df_two: pd.DataFrame,
    method: str = "pearson",
    figsize=(10, 5),
    cmap="vlag",
    title="",
    ax=None,
) -> None:
    """Plot heatmap of difference of correlation matrices."""
    # calculate difference of correlation matrices
    corr_diff = df_one.corr(method) - df_two.corr(method)
    # plot heatmap
    plt.subplots(figsize=figsize)
    # draw arrows in cells according to correlation difference?
    sns.heatmap(
        corr_diff,
        annot=True,
        annot_kws={"size": 10},
        cbar=True,
        cmap=cmap,
        fmt=".2f",
        square=True,
        center=0,
        ax=ax,
    )
    plt.title(title)
    if ax is None:
        plt.show()


def plot_pie_chart(df, col="race", title="", ax=None, cmap=hls, explode=.1):
    """Plot pie chart for a given column in a dataframe."""
    explode = [explode] * len(df[col].value_counts())
    df[col].value_counts().plot.pie(
        shadow=True,
        autopct="%1.1f%%",
        startangle=90,
        title=title,
        cmap=cmap,
        ax=ax,
        labeldistance=1.1,
        pctdistance=0.5,
        explode=explode,
    )
    if ax is None:
        plt.show()

In [35]:
variables = None
variables_unfiltered = None
import threading

def getVariables():
    global variables
    variables = sql_to_df("./sql/variables_filtered.sql", {"window_size_h": 8, "window_stop_size_h": 2})
def getUnfiltered():
    global variables_unfiltered
    variables_unfiltered = sql_to_df("./sql/variables.sql")

x1 = threading.Thread(target=getVariables)
x2 = threading.Thread(target=getUnfiltered)
x1.start()
#x2.start()
x1.join()
#x2.join()




In [36]:
#from collections import defaultdict
#d = defaultdict(int)
#columns = variables.columns
#for i, row in variables.iterrows():
#    counter = 0
#    for c in columns:
#        x = pd.isnull(row[c])
#        if isinstance(x, bool):
#            if x:
#                counter = counter + 1
#        else:
#            if x.any():
#                counter = counter + 1
#    d[counter] += 1#

#print(d)



In [37]:
#dict(sorted(d.items(), key=lambda x: x[0]))

In [38]:
set = variables.fillna(-1) # this removes all rows currently xD we need to fill those
print(set)

trainingsset = set.drop(["sepsis"], axis=1)
labels = set["sepsis"]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(trainingsset, labels, test_size=0.11, random_state=44, stratify=labels)

trainingsset_nan = variables.drop(["sepsis"], axis=1)
labels_nan = variables["sepsis"]
from sklearn.model_selection import train_test_split
X_train_nan, X_test_nan, y_train_nan, y_test_nan = train_test_split(trainingsset_nan, labels_nan, test_size=0.11, random_state=44, stratify=labels)


       subject_id  heart_rate_min  heart_rate_max  heart_rate_mean   
0        12466550            83.0           128.0       103.285714  \
1        13180007            72.0            90.0        83.333333   
2        18421337            -1.0            -1.0        -1.000000   
3        12207593            85.0            97.0        91.692308   
4        12980335            67.0            80.0        72.166667   
...           ...             ...             ...              ...   
73176    16180713            57.0            68.0        62.142857   
73177    15498623            69.0            72.0        70.714286   
73178    11256534            83.0           131.0        97.727273   
73179    15403458            98.0           117.0       110.333333   
73180    17840864            62.0            66.0        63.750000   

       heart_rate_std  sbp_min  sbp_max    sbp_mean    sbp_std  dbp_min  ...   
0           18.300403    109.0    155.0  125.500000  16.896745     55.0  ...  \

In [39]:
import threading
# import the regressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier


rdmForest = RandomForestClassifier(random_state = 44) 
dcsTree = DecisionTreeRegressor(random_state = 44) 
regressor = LogisticRegression(random_state = 44) 
gBoost = GradientBoostingClassifier(random_state = 44) 
xtrTree = ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=44, n_jobs=-1)
hist = HistGradientBoostingClassifier(random_state=44)
gNB = GaussianNB()
knc = KNeighborsClassifier()
v = VotingClassifier(estimators=[("RandomForestClassifier", rdmForest), ("GradientBoostingClassifier", gBoost), ("ExtraTreesClassifier", xtrTree), ("HistGradientBoostingClassifier", hist)], voting="hard")

def forestThread():
    # create a regressor object
    rdmForest.fit(X_train, y_train)
def xtraThread():
    xtrTree.fit(X_train, y_train)
def dcsTreeThread():
    dcsTree.fit(X_train, y_train)
def gBoostThread():
    gBoost.fit(X_train, y_train)
def regressorThread():
    regressor.fit(X_train, y_train)
def histThread():
    hist.fit(X_train_nan, y_train_nan)
def gNBThread():
    gNB.fit(X_train, y_train)
def kncThread():
    knc.fit(X_test, y_test)
def vThread():
    v.fit(X_test, y_test)

x1 = threading.Thread(target=forestThread)
x2 = threading.Thread(target=dcsTreeThread)
x3 = threading.Thread(target=gBoostThread)
x4 = threading.Thread(target=regressorThread)
x5 = threading.Thread(target=histThread)
x6 = threading.Thread(target=gNBThread)
x7 = threading.Thread(target=kncThread)
x8 = threading.Thread(target=vThread)

x1.start()
x2.start()
x3.start()
x4.start()
x5.start()
x6.start()
x7.start()
x8.start()

x1.join()
x2.join()
x3.join()
x4.join()
x5.join()
x6.join()
x7.join()
x8.join()

print("extra")
xtraThread()








extra


In [40]:

y_pred_forest = rdmForest.predict(X_test)
y_pred_xtr = xtrTree.predict(X_test)
y_pred_dcsTree = dcsTree.predict(X_test)
y_pred_gBoost = gBoost.predict(X_test)
y_pred_regressor = regressor.predict(X_test)
y_pred_hist = hist.predict(X_test_nan)
y_pred_gNB = gNB.predict(X_test)
y_pred_knc = knc.predict(X_test)
y_pred_v = v.predict(X_test)


In [41]:
from sklearn.metrics import matthews_corrcoef
forest_mcc = matthews_corrcoef(y_test, y_pred_forest)
xtr_mcc = matthews_corrcoef(y_test, y_pred_xtr)
dcsTree_mcc = matthews_corrcoef(y_test, y_pred_dcsTree)
gBoost_mcc = matthews_corrcoef(y_test, y_pred_gBoost)
regressor_mcc = matthews_corrcoef(y_test, y_pred_regressor)
hist_mcc = matthews_corrcoef(y_test_nan, y_pred_hist)
gNB_mcc = matthews_corrcoef(y_test, y_pred_gNB)
knc_mcc = matthews_corrcoef(y_test, y_pred_knc)
v_mcc = matthews_corrcoef(y_test, y_pred_v)


from sklearn.metrics import roc_auc_score
forest_roc = roc_auc_score(y_test, y_pred_forest)
xtr_roc = roc_auc_score(y_test, y_pred_xtr)
dcsTree_roc = roc_auc_score(y_test, y_pred_dcsTree)
gBoost_roc = roc_auc_score(y_test, y_pred_gBoost)
regressor_roc = roc_auc_score(y_test, y_pred_regressor)
hist_roc = roc_auc_score(y_test_nan, y_pred_hist)
gNB_roc = roc_auc_score(y_test, y_pred_gNB)
knc_roc = roc_auc_score(y_test, y_pred_knc)
v_roc = roc_auc_score(y_test, y_pred_v)


In [42]:
data = [[forest_mcc, forest_roc], [xtr_mcc, xtr_roc], [dcsTree_mcc, dcsTree_roc], [gBoost_mcc, gBoost_roc], [regressor_mcc, regressor_roc], [hist_mcc, hist_roc], [gNB_mcc, gNB_roc], [knc_mcc, knc_roc], [v_mcc, v_roc]]
df = pd.DataFrame(data, columns=["MCC", "ROC"], index=["RandomForestClassifier", "ExtraTreesClassifier", "DecisionTreeRegressor", "GradientBoostingClassifier", "LogisticRegression", "HistGradientBoostingClassifier", "GaussianNB", "KNeighborsClassifier", "Voting"])
print(df)

                                     MCC       ROC
RandomForestClassifier          0.731950  0.856081
ExtraTreesClassifier            0.726524  0.853721
DecisionTreeRegressor           0.629145  0.815064
GradientBoostingClassifier      0.770212  0.863860
LogisticRegression              0.000000  0.500000
HistGradientBoostingClassifier  0.773300  0.866964
GaussianNB                      0.594110  0.749104
KNeighborsClassifier            0.382642  0.689179
Voting                          0.854306  0.915495
