This notebook converts the csv files to pickle files for quick processing in python.
Prerequisites: all the csv files of the mimics-iii database is dumped into a folder

In [3]:
from glob import glob
import pandas as pd
import os

folder_containing_mimics_csv_files = os.environ.get("MIMICSPATH")


In [None]:

filelist = sorted(glob(f"{folder_containing_mimics_csv_files}/*.csv"), key=os.path.getsize)
print(filelist)
for file_path in filelist:
    os.makedirs("../data",exist_ok=True)
    op_file = os.path.join("../data/", os.path.basename(file_path).replace(".csv", ".pkl"))
    print(f"Processing {file_path}...")
    if os.path.exists(op_file):
        print(f"Skipping {file_path}, already processed.")
        continue

    if "ADMISSIONS.csv" in file_path:
        parse_cols =  ["ADMITTIME", "DISCHTIME"]
    else:
        parse_cols = None
        
    chunks = pd.read_csv(file_path, parse_dates=parse_cols, chunksize=10000)
    all_chunks = []
    for chunk in chunks:
        all_chunks.append(chunk)
    combined_df = pd.concat(all_chunks)
    if "HADM_ID" in combined_df.columns:
        combined_df = combined_df[~combined_df.HADM_ID.isna()]
        combined_df["HADM_ID"] = combined_df["HADM_ID"].astype(int)
    combined_df.to_pickle(op_file)


['/data/users/asusaiyah/datasets/mimics-iii/CAREGIVERS.csv', '/data/users/asusaiyah/datasets/mimics-iii/PATIENTS.csv', '/data/users/asusaiyah/datasets/mimics-iii/SERVICES.csv', '/data/users/asusaiyah/datasets/mimics-iii/PROCEDURES_ICD.csv', '/data/users/asusaiyah/datasets/mimics-iii/ADMISSIONS.csv', '/data/users/asusaiyah/datasets/mimics-iii/TRANSFERS.csv', '/data/users/asusaiyah/datasets/mimics-iii/PROCEDUREEVENTS_MV.csv', '/data/users/asusaiyah/datasets/mimics-iii/MICROBIOLOGYEVENTS.csv', '/data/users/asusaiyah/datasets/mimics-iii/OUTPUTEVENTS.csv', '/data/users/asusaiyah/datasets/mimics-iii/PRESCRIPTIONS.csv', '/data/users/asusaiyah/datasets/mimics-iii/INPUTEVENTS_MV.csv', '/data/users/asusaiyah/datasets/mimics-iii/LABEVENTS.csv', '/data/users/asusaiyah/datasets/mimics-iii/INPUTEVENTS_CV.csv', '/data/users/asusaiyah/datasets/mimics-iii/NOTEEVENTS.csv']
Processing /data/users/asusaiyah/datasets/mimics-iii/CAREGIVERS.csv...
Skipping /data/users/asusaiyah/datasets/mimics-iii/CAREGIVERS

In [6]:
# Display all columns in all the files
import os
import pandas as pd

# Path to the folder containing pickle files
folder_path = "../data"

# Iterate through all files in the folder
for file_name in os.listdir(folder_path):
    if file_name.endswith(".pkl"):  # Check if the file is a pickle file
        file_path = os.path.join(folder_path, file_name)
        try:
            # Load the pickle file into a DataFrame
            df = pd.read_pickle(file_path)
            print(f"Columns in {file_name}:")
            print(df.columns.tolist())
            print()
        except Exception as e:
            print(f"Error loading {file_name}: {e}")


Columns in CAREGIVERS.pkl:
['ROW_ID', 'CGID', 'LABEL', 'DESCRIPTION']

Columns in PATIENTS.pkl:
['ROW_ID', 'SUBJECT_ID', 'GENDER', 'DOB', 'DOD', 'DOD_HOSP', 'DOD_SSN', 'EXPIRE_FLAG']

Columns in SERVICES.pkl:
['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'TRANSFERTIME', 'PREV_SERVICE', 'CURR_SERVICE']

Columns in PROCEDURES_ICD.pkl:
['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'SEQ_NUM', 'ICD9_CODE']

Columns in ADMISSIONS.pkl:
['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME', 'DEATHTIME', 'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'DISCHARGE_LOCATION', 'INSURANCE', 'LANGUAGE', 'RELIGION', 'MARITAL_STATUS', 'ETHNICITY', 'EDREGTIME', 'EDOUTTIME', 'DIAGNOSIS', 'HOSPITAL_EXPIRE_FLAG', 'HAS_CHARTEVENTS_DATA']

Columns in TRANSFERS.pkl:
['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'DBSOURCE', 'EVENTTYPE', 'PREV_CAREUNIT', 'CURR_CAREUNIT', 'PREV_WARDID', 'CURR_WARDID', 'INTIME', 'OUTTIME', 'LOS']

Columns in PROCEDUREEVENTS_MV.pkl:
['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'STARTTIME', 'E