In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Read the raw omr table from the MIMIC-IV v2.2 `hosp` folder into a DataFrame
path_omr = "MIMIC-IV v2.2/hosp/omr.csv"
df_omr = pd.read_csv(filepath_or_buffer=path_omr)

In [3]:
# Express chart dates as float seconds since the Unix epoch.
# The subtraction is vectorised and turns a missing date (NaT) into NaN, whereas
# calling Timestamp.timestamp() row by row is slow and raises on NaT.
# utc=True keeps the original convention: naive dates are read as UTC.
df_omr['chartdate'] = pd.to_datetime(df_omr['chartdate'], utc=True)
df_omr['chartdate'] = (
    (df_omr['chartdate'] - pd.Timestamp('1970-01-01', tz='UTC')) / pd.Timedelta(seconds=1)
)

# Keep only the most recent result for each patient and measurement.
# Sorting dates in descending order puts the newest row first inside every
# (subject_id, result_name) group; rows without a date sort last.
# NOTE(review): several results charted on the same date tie here and the first
# one in sort order wins - confirm whether a finer ordering column should break ties.
df_omr = (
    df_omr
    .sort_values(by=['subject_id', 'result_name', 'chartdate'], ascending=[True, True, False])
    .drop_duplicates(subset=['subject_id', 'result_name'], keep='first')
)

In [4]:
# Pivot the table such that each row is a patient and each column is a measurement.
# df_omr was de-duplicated per (subject_id, result_name) above, so aggfunc='first'
# just picks the single remaining row of each group.
pivot_values_df = df_omr.pivot_table(index='subject_id', columns='result_name', values='result_value', aggfunc='first')
pivot_dates_df = df_omr.pivot_table(index='subject_id', columns='result_name', values='chartdate', aggfunc='first')

# Tag every date column with a '_date' suffix so it pairs with its measurement column.
# add_suffix only relabels columns; subject_id is the index of the pivot, so there
# is no 'subject_id_date' column to rename back.
pivot_dates_df = pivot_dates_df.add_suffix('_date')

# Inner-join the values and their dates on the shared subject_id index level
df = pd.merge(pivot_values_df, pivot_dates_df, on='subject_id')
df.columns.name = None  # clear any axis label the pivot left on the columns

In [5]:
# Remove every column in which more than 20% of the rows have no value
missing_values = df.isna().sum().div(len(df))
columns_to_drop = missing_values.loc[missing_values.gt(0.2)].index
df = df.drop(columns=columns_to_drop)

# Keep only the rows that have a weight, a blood pressure and a height
df = df.dropna(subset=['Weight (Lbs)', 'Blood Pressure', 'Height (Inches)'])

In [6]:
# Break each 'systolic/diastolic' reading apart once, then store the two halves
bp_parts = df['Blood Pressure'].map(lambda reading: reading.split('/'))
df['Systolic BP'] = bp_parts.map(lambda parts: parts[0])
df['Diastolic BP'] = bp_parts.map(lambda parts: parts[1])
df = df.drop(columns='Blood Pressure')

# Cast the measurement and date columns listed below to float
numerical_columns = [
    'BMI (kg/m2)',
    'Weight (Lbs)',
    'Weight (Lbs)_date',
    'Systolic BP',
    'Diastolic BP',
    'Blood Pressure_date',
    'Height (Inches)_date',
    'Height (Inches)',
]
df = df.astype({column: float for column in numerical_columns})

# Where BMI is missing, derive it from weight (lbs) and height (inches)
computed_bmi = df['Weight (Lbs)'] / (df['Height (Inches)'] ** 2) * 703
df['BMI (kg/m2)'] = df['BMI (kg/m2)'].fillna(computed_bmi)

# Where the BMI date is missing, use the later of the weight and height dates
latest_measurement_date = df[['Weight (Lbs)_date', 'Height (Inches)_date']].max(axis=1)
df['BMI (kg/m2)_date'] = df['BMI (kg/m2)_date'].fillna(latest_measurement_date)

# Expose the index as a leading subject_id column and fall back to a plain row index
df.insert(0, 'subject_id', df.index)
df = df.reset_index(drop=True)

In [7]:
# Preview the first rows of the processed table (dates are epoch seconds)
df.head()

Unnamed: 0,subject_id,BMI (kg/m2),Height (Inches),Weight (Lbs),BMI (kg/m2)_date,Blood Pressure_date,Height (Inches)_date,Weight (Lbs)_date,Systolic BP,Diastolic BP
0,10000032,18.2,60.0,93.03,6645802000.0,6643382000.0,6641914000.0,6645802000.0,98.0,66.0
1,10000117,18.9,64.0,110.0,6775574000.0,6775574000.0,6775574000.0,6775574000.0,108.0,74.0
2,10000635,34.8,67.0,222.1,5438275000.0,5438275000.0,5438275000.0,5438275000.0,148.0,63.0
3,10000719,37.0,67.0,236.0,5392138000.0,5392138000.0,5392138000.0,5392138000.0,144.0,88.0
4,10000826,22.3,68.0,147.0,5633280000.0,5633280000.0,5613754000.0,5633280000.0,100.0,62.0


In [8]:
# Check the dtype of every column after the float conversion
df.dtypes

subject_id                int64
BMI (kg/m2)             float64
Height (Inches)         float64
Weight (Lbs)            float64
BMI (kg/m2)_date        float64
Blood Pressure_date     float64
Height (Inches)_date    float64
Weight (Lbs)_date       float64
Systolic BP             float64
Diastolic BP            float64
dtype: object

In [9]:
# Export CSV for later use.
# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing (a fresh checkout may not have it yet).
output_path = Path('processed_data/omr.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)