In [9]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [10]:
# Read the lab item dictionary and build a label-by-itemid count table from it.
# Two independent copies are kept: `template_id` becomes the itemid-keyed column
# template used to align each chunk, and `template_name` becomes the
# label-keyed header written once to the output CSV.
path_labitems = 'MIMIC-IV v2.2/hosp/d_labitems.csv'
labitems = pd.read_csv(path_labitems)

template_id = labitems.pivot_table(index='label', columns='itemid', aggfunc='size')
template_name = template_id.copy()

In [11]:
# Reduce the itemid table to an empty column template:
#   1. strip the axis names left behind by pivot_table,
#   2. put a 'subject_id' column in front of the itemid columns,
#   3. drop every row so that only the column layout remains.
template_id = template_id.rename_axis(None, axis=0).rename_axis(None, axis=1)
template_id.insert(0, 'subject_id', 0)
template_id = template_id.head(0)

In [12]:
# Create template with names and write it to file.
#
# The itemid columns are relabelled with their lab names in a single rename,
# using an itemid -> label lookup built once from `labitems` (instead of
# re-indexing `labitems` and renaming in place for every column).
# NOTE(review): if two itemids share the same label, the header will contain
# repeated column names -- confirm downstream readers can cope with that.
template_name = template_name.rename(
    columns=labitems.set_index('itemid')['label'].to_dict()
)
template_name.columns.name = None
# The (empty) index is written as the first CSV column, so naming it
# 'subject_id' makes the header line start with "subject_id".
template_name.index.name = 'subject_id'
template_name = template_name.iloc[0:0]

output_file_path = 'processed_data/labevents.csv'
# to_csv does not create missing directories; make sure the target exists so a
# fresh checkout can run this cell.
Path(output_file_path).parent.mkdir(parents=True, exist_ok=True)
# Writing the zero-row frame emits the header line only; data rows are
# appended to this file later.
template_name.to_csv(output_file_path)

In [13]:
# Open the lab events file for chunked reading: `reader` is an iterator that
# yields DataFrames of up to 10,000,000 rows each, so the CSV is never loaded
# into memory in one piece.
path_labevents = 'MIMIC-IV v2.2/hosp/labevents.csv'
reader = pd.read_csv(path_labevents, chunksize=10_000_000)

In [14]:
# Stream the lab events chunk by chunk, reduce each chunk to one wide row per
# subject (one column per itemid, holding the most recent numeric value), and
# append those rows to the output CSV underneath the header written earlier.
#
# NOTE(review): de-duplication happens per chunk, so a subject whose rows span
# two chunks produces one output row per chunk -- confirm this is handled
# downstream.
i = 0
while True:
    # Stop after 10 chunks. The cap is checked *before* reading so that no
    # chunk is pulled from `reader` only to be thrown away.
    # NOTE(review): chunks beyond the 10th are never processed -- confirm the
    # cap is intentional.
    if i == 10:
        break

    try:
        df_labevents = next(reader)
    except StopIteration:
        print("No more chunks left.")
        break

    # Keep most recent lab values for each subject: with charttime sorted
    # descending inside every (subject_id, itemid) group, the first row of
    # each group is the latest measurement.
    df = df_labevents.sort_values(
        by=['subject_id', 'itemid', 'charttime'],
        ascending=[True, True, False],
    )
    df = df.drop_duplicates(subset=['subject_id', 'itemid'], keep='first')

    # Drop irrelevant columns
    df = df[['subject_id', 'itemid', 'valuenum']]

    # Reformat table: long -> wide. Each (subject_id, itemid) pair has at most
    # one row at this point, so 'mean' simply passes that value through.
    df = df.pivot_table(index='subject_id', columns='itemid', values='valuenum', aggfunc='mean')
    df.insert(0, 'subject_id', df.index)
    df = df.reset_index(drop=True)

    # Update df formatting to match header. Rows are appended without a
    # header, so the columns must be in exactly the template's order:
    # reindex adds itemids absent from this chunk as empty columns, drops
    # itemids the template does not know about, and fixes the ordering.
    df = df.reindex(columns=template_id.columns)

    df.to_csv(output_file_path, mode='a', header=False, index=False)

    i += 1

    print(i)