In [1]:
import pandas as pd
import time
import psutil
from tqdm.auto import tqdm  # Import tqdm for progress bar

In [2]:
print_interval = 60
# Start the timer
start_time = time.time()
# Get total system memory
total_memory = psutil.virtual_memory().total  # in bytes

# Set initial chunk size (e.g., 1% of total memory)
chunk_size = total_memory // 100
print(chunk_size)
# Initialize the total rows read
total_rows_read = 0

343597383


Read admission, diagnosis_icd (icd codes given to each patient each stay), and d_idc_diagnosis (disease name for each code)

In [3]:
# admissions
admission=pd.read_csv('admissions.csv', engine='python', on_bad_lines='warn')
print("The number of admission records in CSV File")
print (len(admission)) #431231

The number of admission records in CSV File
431231


In [4]:

# ICD codes given to each patient
patient_icd_diagnosis=pd.read_csv('diagnoses_icd-2.csv', engine='python', on_bad_lines='warn')
print("The number of diagnoses given to all patients")
print (len(patient_icd_diagnosis)) #4756326

The number of diagnoses given to all patients
4756326


In [5]:
# Disease names for each ICD code
disease_icd=pd.read_csv('d_icd_diagnoses.csv', engine='python', on_bad_lines='warn')
print("The number of icd_diagnoses")
print (len(disease_icd)) #109775

The number of icd_diagnoses
109775


Merging & Filtering Some Of The Tables

In [6]:
# admissions and patient icd codes
merge_tables=pd.merge(admission, patient_icd_diagnosis, on=['subject_id', 'hadm_id'], how='inner')


In [7]:
# acute kidney failure, unspecified ICD code = 5849
# Diabetes mellitus without mention of complication, Type II or unspecified ICD code = 25000
filter_merged_tables=merge_tables[(merge_tables['icd_code']== '5849') | (merge_tables['icd_code']== '25000')] 


In [8]:
# disease names for each code
filters_tables=pd.merge(filter_merged_tables, disease_icd, on=['icd_code', 'icd_version'], how='inner')


In [11]:

print("Example of database")
print(filters_tables.head(2))
print(filters_tables.columns)

Example of database
   subject_id   hadm_id            admittime            dischtime deathtime  \
0    10000635  26134563  2136-06-19 14:24:00  2136-06-20 11:30:00       NaN   
1    10000980  24947999  2190-11-06 20:57:00  2190-11-08 15:58:00       NaN   

           admission_type admit_provider_id admission_location  \
0  AMBULATORY OBSERVATION            P611A0     PROCEDURE SITE   
1                EW EMER.            P434W4     EMERGENCY ROOM   

  discharge_location insurance language marital_status  \
0                NaN     Other  ENGLISH        WIDOWED   
1   HOME HEALTH CARE  Medicare  ENGLISH        MARRIED   

                     race            edregtime            edouttime  \
0  BLACK/AFRICAN AMERICAN                  NaN                  NaN   
1  BLACK/AFRICAN AMERICAN  2190-11-06 15:30:00  2190-11-06 23:16:00   

   hospital_expire_flag  seq_num icd_code  icd_version  \
0                     0        2    25000            9   
1                     0        3    25

Reading lab events and presciption files

In [None]:
# Lab Events-read the dataset in chunks

all_merged_chunks=pd.DataFrame()

for chunk in tqdm(pd.read_csv('labevents.csv', chunksize=chunk_size), desc="Processing chunks"):
    # Update total rows read
    total_rows_read += len(chunk)
    
    merged_chunk= pd.merge(filters_tables, chunk, on=['subject_id', 'hadm_id'], how='inner')
    all_merged_chunks=pd.concat([all_merged_chunks, merged_chunk])

    # Check if it's time to print elapsed time
    if time.time() - start_time > print_interval:
        elapsed_time = time.time() - start_time
        print(f"Elapsed time: {elapsed_time} seconds")
        print_interval += 60  # Increase the print interval to every minute for the next print    

# end the timer
end_time = time.time()
# calculate the total elapsed time
total_elapsed_time = end_time - start_time
print(f"Total time taken to read {total_rows_read} rows: {total_elapsed_time} seconds")
print(f"Total number of rows is: {total_rows_read}") #?


In [12]:
# Lab Events option 2-read the dataset in chunks

all_merged_chunks=pd.DataFrame()

with tqdm(total=100) as pbar:  # Initialize tqdm progress bar
    while True:
        # Read a chunk of the dataset
        chunk = pd.read_csv('labevents.csv', chunksize=chunk_size) 
        for chunk_df in chunk:
            # Update total rows read
            total_rows_read += len(chunk_df)
        
            merged_chunk= pd.merge(filters_tables, chunk_df, on=['subject_id', 'hadm_id'], how='inner')
            all_merged_chunks=pd.concat([all_merged_chunks, merged_chunk])
            pbar.update(1)


  0%|          | 0/100 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Lab Items-read the dataset in chunks

lab_merged_chunks=pd.DataFrame()

for chunk in tqdm(pd.read_csv('d_labitems.csv', chunksize=chunk_size), desc="Processing chunks"):
    # Update total rows read
    total_rows_read += len(chunk)
    
    merged_chunk= pd.merge(all_merged_chunks, chunk, on=['itemid'], how='inner')
    lab_merged_chunks=pd.concat([lab_merged_chunks, merged_chunk])

    # Check if it's time to print elapsed time
    if time.time() - start_time > print_interval:
        elapsed_time = time.time() - start_time
        print(f"Elapsed time: {elapsed_time} seconds")
        print_interval += 60  # Increase the print interval to every minute for the next print    

# end the timer
end_time = time.time()
# calculate the total elapsed time
total_elapsed_time = end_time - start_time
print(f"Total time taken to read {total_rows_read} rows: {total_elapsed_time} seconds")
print(f"Total number of rows is: {total_rows_read}") #?


In [None]:
#Lab items option 2

# Lab Items-read the dataset in chunks
lab_merged_chunks=pd.DataFrame()

with tqdm(total=100) as pbar:  # Initialize tqdm progress bar
    while True:
        # Read a chunk of the dataset
        chunk = pd.read_csv('d_labitems.csv', chunksize=chunk_size) 
        for chunk_df in chunk:
            # Update total rows read
            total_rows_read += len(chunk_df)
    
            merged_chunk= pd.merge(all_merged_chunks, chunk_df, on=['itemid'], how='inner')
            lab_merged_chunks=pd.concat([lab_merged_chunks, merged_chunk])
            pbar.update(1)  


In [None]:
# Prescriptions option 2- read the dataset in chunks
p_merged_chunks=pd.DataFrame()

with tqdm(total=100) as pbar:  # Initialize tqdm progress bar
    while True:
        # Read a chunk of the dataset
        chunk = pd.read_csv('prescriptions.csv', chunksize=chunk_size) 
        for chunk_df in chunk:
            # Update total rows read
            total_rows_read += len(chunk_df)

            merged_chunk= pd.merge(lab_merged_chunks, chunk_df, on=['subject_id', 'hadm_id'], how='inner')
            p_merged_chunks=pd.concat([p_merged_chunks, merged_chunk])
            pbar.update(1)

In [None]:
# Prescriptions- read the dataset in chunks
p_merged_chunks=pd.DataFrame()

for chunk in tqdm(pd.read_csv('prescriptions.csv', chunksize=chunk_size), desc="Processing chunks"):
    # Update total rows read
    total_rows_read += len(chunk)
    merged_chunk= pd.merge(lab_merged_chunks, chunk, on=['subject_id', 'hadm_id'], how='inner')
    p_merged_chunks=pd.concat([p_merged_chunks, merged_chunk])

    # Check if it's time to print elapsed time
    if time.time() - start_time > print_interval:
        elapsed_time = time.time() - start_time
        print(f"Elapsed time: {elapsed_time} seconds")
        print_interval += 60  # Increase the print interval to every minute for the next print    

# end the timer
end_time = time.time()
# calculate the total elapsed time
total_elapsed_time = end_time - start_time
print(f"Total time taken to read {total_rows_read} rows: {total_elapsed_time} seconds")
print(f"Total number of rows is: {total_rows_read}") #?


In [None]:


print("Example of database")
print(all_merged_chunks.head(2))
print(all_merged_chunks.columns)


Example of database


In [None]:
# Train - First 20,000 rows. Save to CSV File
train_records=filter_merged_tables.iloc[:20000]
#train_records.loc[:,'DIAGNOSIS']=train_records['DIAGNOSIS'].apply(lambda a: a if a in most_common else 'Other')
train_records.to_csv('MIMIC_IV_train.csv', index=False) 
print("File saved.") 
print(len(train_records))      
            


In [None]:
# Validation - 10,000 rows. Save to CSV File
validation_records=merge_tables.iloc[20000:30000]
#validation_records.loc[:,'DIAGNOSIS']=validation_records['DIAGNOSIS'].apply(lambda a: a if a in most_common else 'Other')
validation_records.to_csv('MIMIC_IV_validation.csv', index=False)
print("File saved")
print(len(validation_records))


In [None]:
# Test - 5,000 rows. Save to CSV File
test_records=merge_tables.iloc[30000:35000]
#test_records.loc[:,'DIAGNOSIS']=test_records['DIAGNOSIS'].apply(lambda a: a if a in most_common else 'Other')
test_records.to_csv('MIMIC_IV_validation.csv', index=False)
print("File saved")
print(len(test_records))

