In [10]:
import pandas as pd
import time
import psutil
from tqdm.auto import tqdm  
import dask.dataframe as dd
import os

Read admission, diagnosis_icd (icd codes given to each patient each stay), and d_idc_diagnosis (disease name for each code)

In [11]:
# admissions
admission=pd.read_csv('admissions.csv', engine='python', on_bad_lines='warn')
print("The number of admission records in CSV File")
print (len(admission)) #431231

The number of admission records in CSV File
431231


In [169]:

# ICD codes given to each patient
patient_icd_diagnosis=pd.read_csv('diagnoses_icd-2.csv', engine='python', on_bad_lines='warn')
print("The number of diagnoses given to all patients")
print (len(patient_icd_diagnosis)) #4756326

The number of diagnoses given to all patients
4756326


In [None]:
#get counts for each code
fruit_counts =patient_icd_diagnosis['icd_code'].value_counts()
fruit_counts=fruit_counts.head(20)
print(fruit_counts)

In [143]:
# Disease names for each ICD code
disease_icd=pd.read_csv('d_icd_diagnoses.csv', engine='python', on_bad_lines='warn')
print("The number of icd_diagnoses")
print (len(disease_icd)) #109775

The number of icd_diagnoses
109775


In [199]:
#print the disease for each code
app=disease_icd[disease_icd['icd_code']=='25000']
print(app)

     icd_code  icd_version                                         long_title
2332    25000            9  Diabetes mellitus without mention of complicat...


Merging & Filtering Some Admissions, ICD codes, and the Diagnosis for each code

In [165]:
# admissions and patient icd codes
merge_tables=pd.merge(admission, patient_icd_diagnosis, on=['subject_id', 'hadm_id'], how='inner')


In [200]:
# Congestive Heart Failure, unspecified = 4280
# unspecified essential hypertension = 4019
filter_merged_tables=merge_tables[(merge_tables['icd_code']== '4280') | (merge_tables['icd_code']== '25000')] 


In [None]:
distinct_values = filter_merged_tables['subject_id']

print("Distinct values in the column:")
print(len(distinct_values))

print(filter_merged_tables[['subject_id', 'hadm_id', 'icd_code']].head(20))

In [None]:
#shuffle
shuffled_df =filter_merged_tables.sample(frac=1).reset_index(drop=True)

print("Shuffled DataFrame:")
print(shuffled_df[['subject_id', 'hadm_id', 'icd_code']].head(20))

In [203]:
# disease names for each code
filters_tables=pd.merge(filter_merged_tables, disease_icd, on=['icd_code', 'icd_version'], how='inner')


Reading lab events and presciption files 

In [204]:
# Prescriptions-read the dataset using dask  Row count=15416708
prescriptions_path = 'prescriptions.csv'

options={'dose_val_rx': 'object',
       'form_rx': 'object',
       'form_val_disp': 'object',
       'gsn': 'object',
       'ndc': 'float64',
       'poe_seq': 'float64'}



file_size = os.path.getsize(prescriptions_path)

# Set the desired block size for reading the file
# You can adjust this value based on the size of your file and available memory
block_size = 10 * 1024 * 1024  # 10 MB

# Calculate the total number of blocks
total_blocks = (file_size + block_size - 1) // block_size

with tqdm(total=total_blocks, unit='block') as pbar:
    prescriptions = dd.read_csv(prescriptions_path, blocksize=block_size, dtype=options)
    for _ in prescriptions.to_delayed():
        pbar.update(1)
#print(len(prescriptions))

100%|█████████▉| 252/253 [00:00<00:00, 50757.04block/s]


In [207]:
#print(prescriptions.head(2))
prescription_merged=dd.merge(filters_tables, prescriptions, on=['subject_id', 'hadm_id'], how='inner')
print(prescription_merged.head(2))

   subject_id   hadm_id            admittime            dischtime deathtime  \
0    10000635  26134563  2136-06-19 14:24:00  2136-06-20 11:30:00      <NA>   
1    10000635  26134563  2136-06-19 14:24:00  2136-06-20 11:30:00      <NA>   

           admission_type admit_provider_id admission_location  \
0  AMBULATORY OBSERVATION            P611A0     PROCEDURE SITE   
1  AMBULATORY OBSERVATION            P611A0     PROCEDURE SITE   

  discharge_location insurance  ...     gsn           ndc    prod_strength  \
0               <NA>     Other  ...  041660  5.539000e+10         1mg Vial   
1               <NA>     Other  ...  001275  2.450041e+08  10mEq ER Tablet   

  form_rx dose_val_rx  dose_unit_rx  form_val_disp form_unit_disp  \
0     NaN           1            mg              1           VIAL   
1     NaN          20           mEq              2            TAB   

   doses_per_24_hrs route  
0               NaN    IM  
1               NaN    PO  

[2 rows x 39 columns]


In [208]:
# Lab Events-read the dataset using dask  Row count=118171367
labevents_path = 'labevents.csv'

file_size = os.path.getsize(labevents_path)

# Set the desired block size for reading the file
# You can adjust this value based on the size of your file and available memory
block_size = 10 * 1024 * 1024  # 10 MB

# Calculate the total number of blocks
total_blocks = (file_size + block_size - 1) // block_size

with tqdm(total=total_blocks, unit='block') as pbar:
    labevents = dd.read_csv(labevents_path, blocksize=block_size)
    for _ in labevents.to_delayed():
        pbar.update(1)
#labevents = labevents.dropna(subset=['hadm_id'])

#labevents['hadm_id']=labevents['hadm_id'].astype('int64')


100%|█████████▉| 1309/1310 [00:00<00:00, 50918.55block/s]


In [209]:
labevents_merged=dd.merge(prescription_merged, labevents, on=['subject_id', 'hadm_id'], how='inner')
# print(labevents_merged.columns)
# print(labevents_merged.head(1))

+------------------------+------------+-------------+
| Merge columns          | left dtype | right dtype |
+------------------------+------------+-------------+
| ('hadm_id', 'hadm_id') | int64      | float64     |
+------------------------+------------+-------------+
Cast dtypes explicitly to avoid unexpected results.


In [210]:
print(labevents.head(2))

   labevent_id  subject_id  hadm_id  specimen_id  itemid order_provider_id  \
0            1    10000032      NaN     45421181   51237            P28Z0X   
1            2    10000032      NaN     45421181   51274            P28Z0X   

             charttime            storetime value  valuenum valueuom  \
0  2180-03-23 11:51:00  2180-03-23 15:15:00   1.4       1.4      NaN   
1  2180-03-23 11:51:00  2180-03-23 15:15:00   ___      15.1      sec   

   ref_range_lower  ref_range_upper      flag priority   comments  
0              0.9              1.1  abnormal  ROUTINE        NaN  
1              9.4             12.5  abnormal  ROUTINE  VERIFIED.  


In [211]:
# D_labitems-read the dataset using dask  Row count=1622
d_labitems_path = 'd_labitems.csv'

file_size = os.path.getsize(d_labitems_path)

# Set the desired block size for reading the file
# You can adjust this value based on the size of your file and available memory
block_size = 10 * 1024 * 1024  # 10 MB

# Calculate the total number of blocks
total_blocks = (file_size + block_size - 1) // block_size

with tqdm(total=total_blocks, unit='block') as pbar:
    d_labitems = dd.read_csv(d_labitems_path, blocksize=block_size)
    for _ in prescriptions.to_delayed():
        pbar.update(1)



252block [00:00, 35243.90block/s]       


In [230]:
# merged labitems

labitems_merged=dd.merge(labevents_merged, d_labitems, on=['itemid'], how='inner')
#print(labitems_merged.head(2))


In [None]:
print(len(labitems_merged))

In [None]:
# Get the number of rows in the Dask DataFrame
num_rows = labitems_merged.map_partitions(len).sum().compute()

print("Number of rows:", num_rows)

In [231]:
#remove unneccessary columns
columns_drop=['deathtime','hospital_expire_flag','seq_num','icd_version','order_provider_id_x','admit_provider_id', 'poe_id', 'poe_seq', 'starttime', 'stoptime', 'order_provider_id_y']
mimic_table = labitems_merged.drop(columns_drop, axis=1)

In [None]:
# Remove Race
columns_drop=['race']

mimic_table_NO_race=labitems_merged.drop(columns_drop, axis=1)

In [None]:
# Remove Insurance
columns_drop=['insurance']

mimic_table_NO_insurance=labitems_merged.drop(columns_drop, axis=1)

In [232]:
# Remove both insurance and race
columns_drop=['race', 'insurance']

mimic_table_NO_race_insurance=labitems_merged.drop(columns_drop, axis=1)

SAVE CSV FILES

In [None]:
# Count occurrences of both diseases in the first 20K
first_20k_rows = labitems_merged.head(20000)
diabetes = (first_20k_rows['icd_code'] == '25000').sum()
print("Number of patients with diabetes in first 20,000 rows:", diabetes)
congestive=  (first_20k_rows['icd_code'] == '4280').sum()
print("Number of patients with congestive heart failure in the first 20,000 rows:", congestive)

In [235]:
#EVERTHYING - BIAS
partition_sizes = [20000, 15000, 5000]

# Save partitions to separate files
start_index = 0
for i, size in enumerate(partition_sizes):
    end_index = start_index + size
    partition = mimic_table_NO_race_insurance.partitions[start_index:end_index]
    partition.to_csv(f'MIMIC_IV_NoBias{size}.csv', index=False)
    start_index = end_index
print("Files Saved")

KeyboardInterrupt: 

In [None]:
#EVERTHYING WITH BOTH BIAS
partition_sizes = [20000, 15000, 5000]

# Save partitions to separate files
start_index = 0
for i, size in enumerate(partition_sizes):
    end_index = start_index + size
    partition = mimic_table.partitions[start_index:end_index]
    partition.to_csv(f'MIMIC_IV_WithBias{size}.csv', index=False)
    start_index = end_index

In [None]:
#EVERTHYING + INSURANCE
partition_sizes = [20000, 15000, 5000]

# Save partitions to separate files
start_index = 0
for i, size in enumerate(partition_sizes):
    end_index = start_index + size
    partition = mimic_table_NO_race.partitions[start_index:end_index]
    partition.to_csv(f'MIMIC_IV_Insurance{size}.csv', index=False)
    start_index = end_index

In [None]:
#EVERTHYING + RACE
partition_sizes = [20000, 15000, 5000]

# Save partitions to separate files
start_index = 0
for i, size in enumerate(partition_sizes):
    end_index = start_index + size
    partition = mimic_table_NO_insurance.partitions[start_index:end_index]
    partition.to_csv(f'MIMIC_IV_Race{size}.csv', index=False)
    start_index = end_index

In [213]:
# Train - First 20,000 rows. Save to CSV File
train_records=labitems_merged.head(40)
#train_records.loc[:,'DIAGNOSIS']=train_records['DIAGNOSIS'].apply(lambda a: a if a in most_common else 'Other')
train_records.to_csv('MIMIC_IV_train.csv', index=False) 
print("File saved.") 
print(len(train_records))    

File saved.
40


In [None]:
# Validation - 10,000 rows. Save to CSV File
validation_records=merge_tables.iloc[20000:30000]
#validation_records.loc[:,'DIAGNOSIS']=validation_records['DIAGNOSIS'].apply(lambda a: a if a in most_common else 'Other')
validation_records.to_csv('MIMIC_IV_validation.csv', index=False)
print("File saved")
print(len(validation_records))


In [None]:
# Test - 5,000 rows. Save to CSV File
test_records=merge_tables.iloc[30000:35000]
#test_records.loc[:,'DIAGNOSIS']=test_records['DIAGNOSIS'].apply(lambda a: a if a in most_common else 'Other')
test_records.to_csv('MIMIC_IV_validation.csv', index=False)
print("File saved")
print(len(test_records))

