In [30]:
#Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

In [31]:
#DEFINE FILEPATH
hosp='W:\\Main_project\\mimic-iv-2.2\\mimic-iv-2.2\\hosp\\'
icu='W:\\Main_project\\mimic-iv-2.2\\mimic-iv-2.2\\icu\\'
base='W:\\Main_project\\base_files\\'

In [3]:
pat_adm=pd.read_csv(f'{base}pat_adm_aki_time_final.csv')

#correct the min time required
from datetime import timedelta

pat_adm['start_time_of_increase'] = pd.to_datetime(pat_adm['start_time_of_increase'])
pat_adm['min_time_required'] = pd.to_datetime(pat_adm['min_time_required'])
twenty_four_hours = timedelta(hours=24)
selected_rows = pat_adm['increase_flag'] == 1

# Subtract 24 hours from 'start_time_of_increase' for selected rows
pat_adm.loc[selected_rows, 'min_time_required'] = pat_adm.loc[selected_rows, 'start_time_of_increase'] - twenty_four_hours

In [15]:

def process_labs(hosp, lab_name):
    # Read the lab data
    lab_df = pd.read_csv(f'{hosp}{lab_name}_lab.csv', index_col=0,on_bad_lines='skip')
    
    # Rename column
    lab_df = lab_df.rename(columns={'valuenum': lab_name})
    
    # Drop unnecessary columns
    lab_df = lab_df.drop(['storetime', 'labevent_id', 'order_provider_id', 'flag', 'priority', 'comments', 'value', 'specimen_id'], axis=1)
    
    # Remove duplicate rows
    lab_df = lab_df.drop_duplicates()

    # Merge with pat_adm and filter labs within admit and dischtime
    pat_bili = pd.merge(pat_adm, lab_df, how='left', on='subject_id')
    try:
        pat_bili['charttime'] = pd.to_datetime(pat_bili['charttime'])
    except ValueError:
    # Remove rows where conversion to datetime fails
        pat_bili = pat_bili[pd.to_datetime(pat_bili['charttime'], errors='coerce').notna()]
    try:
        pat_bili['admittime'] = pd.to_datetime(pat_bili['admittime'])
    except ValueError:
        pat_bili = pat_bili[pd.to_datetime(pat_bili['admittime'], errors='coerce').notna()]
    try:
        pat_bili['dischtime'] = pd.to_datetime(pat_bili['dischtime'])
    except ValueError:
        pat_bili = pat_bili[pd.to_datetime(pat_bili['dischtime'], errors='coerce').notna()]
        
    
    print("Count of patients before the admittime- dischtime filter: ",pat_bili['subject_id'].nunique())
    pat_bili = pat_bili[(pat_bili['charttime'] >= pat_bili['admittime']) & (pat_bili['charttime'] <= pat_bili['dischtime'])]
    print("\nCount of patients after the admittime- dischtime filter: ",pat_bili['subject_id'].nunique())
    
    # Filter for latest labs within min_time_required
    pat_bili['min_time_required'] = pd.to_datetime(pat_bili['min_time_required'])
    df_sorted = pat_bili.sort_values(by=['subject_id', 'charttime'])
    pat_bili = df_sorted.groupby('subject_id').apply(lambda x: x[x['charttime'] <= x['min_time_required']].tail(1))

    # Correct the DataFrame
    pat_bili = pat_bili.drop(['subject_id'], axis=1)
    pat_bili = pat_bili.reset_index()
    pat_bili = pat_bili.drop(['hadm_id_y', 'level_1'], axis=1)
    pat_bili=pat_bili.rename({'charttime':'charttime_' + lab_name},axis=1)
    pat_bili =pat_bili.rename({'valueuom':'valueuom_' + lab_name},axis=1)
    pat_bili =pat_bili.rename({'itemid':'itemid_' + lab_name},axis=1)
    pat_bili=pat_bili[['subject_id','charttime_'+ lab_name,'itemid_'+lab_name,lab_name,'valueuom_'+lab_name]]
    pat_bili = pd.merge(pat_bili, loinc_map, left_on='itemid_'+lab_name,right_on='itemid (omop_source_code)', how='left')
    pat_bili=pat_bili.rename({'omop_concept_id':'loinc_'+lab_name},axis=1)
    pat_bili=pat_bili.drop(['itemid (omop_source_code)'],axis=1)
    #req columns -subject_id,charttime,itemid,albumin,valueuom
    
    return pat_bili

In [5]:
# lab files

#albumin=pd.read_csv(f'{hosp}albumin_lab.csv',index_col=0)
#alt=pd.read_csv(f'{hosp}alt_lab.csv',index_col=0)
#bilirubin = pd.read_csv(f'{hosp}bilirubin_lab.csv',index_col=0)
#bun = pd.read_csv(f'{hosp}bun_lab.csv',index_col=0)
#glucose = pd.read_csv(f'{hosp}glucose_lab.csv',index_col=0)
#ammonia=pd.read_csv(f'{hosp}ammonia_lab.csv',index_col=0)
#ast=pd.read_csv(f'{hosp}ast_lab.csv',index_col=0)

## LOINC codes mapping file
loinc_map=pd.read_csv('d_labitems_to_loinc.csv')
loinc_map=loinc_map[['itemid (omop_source_code)','omop_concept_id']]
#omop_concept_id is the loinc_code

#### Creatinine

In [6]:
#Manually done for creatinine as file name is sc_labs.csv , not lab;
#multiple places contain labs.csv,hence not renaming the file

In [7]:
creatinine=pd.read_csv(f'{hosp}scr_labs.csv',index_col=0)
creatinine=creatinine.rename({'valuenum':'creatinine'},axis=1)
creatinine=creatinine.drop(['storetime','labevent_id','order_provider_id','flag','priority','comments','value','specimen_id'],axis=1)
creatinine=creatinine.drop_duplicates()

In [8]:
#Join with pat_adm and get onlt the labs less than min_time_required; take the earliest one
pat_scr=pd.merge(pat_adm,creatinine,how='left',on='subject_id')
#Keep only those charttimes btn admit and dischtime for a patient
pat_scr['charttime'] = pd.to_datetime(pat_scr['charttime'])
pat_scr['admittime'] = pd.to_datetime(pat_scr['admittime'])
pat_scr['dischtime'] = pd.to_datetime(pat_scr['dischtime'])

pat_scr = pat_scr[(pat_scr['charttime'] >= pat_scr['admittime']) & (pat_scr['charttime'] <= pat_scr['dischtime'])]
#Get only latest labs, less than min req time
# For aki patients, labs would be before onset on aki - min_time_required;
#for non -aki, latest labs within min_time_required(disch time)

pat_scr['charttime'] = pd.to_datetime(pat_scr['charttime'])
pat_scr['min_time_required'] = pd.to_datetime(pat_scr['min_time_required'])
df_sorted = pat_scr.sort_values(by=['subject_id', 'charttime'])
pat_scr = df_sorted.groupby('subject_id').apply(lambda x: x[x['charttime'] <= x['min_time_required']].tail(1))

#Correct the df 
pat_scr=pat_scr.drop(['subject_id'],axis=1)
pat_scr=pat_scr.reset_index()
pat_scr=pat_scr.drop(['hadm_id_y'],axis=1)
pat_scr=pat_scr.drop(['level_1'],axis=1)

pat_scr=pat_scr.rename({'charttime':'charttime_creatinine'},axis=1)
pat_scr =pat_scr.rename({'valueuom':'valueuom_creatinine'},axis=1)
pat_scr =pat_scr.rename({'itemid':'itemid_creatinine'},axis=1)
pat_scr=pat_scr[['subject_id','charttime_creatinine','itemid_creatinine','creatinine','valueuom_creatinine']]

In [9]:
pat_scr.head()

Unnamed: 0,subject_id,charttime_creatinine,itemid_creatinine,creatinine,valueuom_creatinine
0,10094629,2199-05-18 06:08:00,50912,1.2,mg/dL
1,10094902,2136-06-16 13:15:00,50912,0.8,mg/dL
2,10094971,2122-04-19 06:05:00,50912,0.3,mg/dL
3,10095139,2157-10-15 07:00:00,50912,0.8,mg/dL
4,10095417,2175-11-03 06:00:00,50912,0.9,mg/dL


In [10]:
loinc_map.head()

Unnamed: 0,itemid (omop_source_code),omop_concept_id
0,50801,3007913
1,50802,3012501
2,50803,3006576
3,50804,3031147
4,50805,3023081


In [11]:
pat_scr = pd.merge(pat_scr, loinc_map, left_on='itemid_creatinine',right_on='itemid (omop_source_code)', how='left')
pat_scr=pat_scr.rename({'omop_concept_id':'loinc_creatinine'},axis=1)
pat_scr=pat_scr.drop(['itemid (omop_source_code)'],axis=1)

In [12]:
pat_scr.head()

Unnamed: 0,subject_id,charttime_creatinine,itemid_creatinine,creatinine,valueuom_creatinine,loinc_creatinine
0,10094629,2199-05-18 06:08:00,50912,1.2,mg/dL,3016723
1,10094902,2136-06-16 13:15:00,50912,0.8,mg/dL,3016723
2,10094971,2122-04-19 06:05:00,50912,0.3,mg/dL,3016723
3,10095139,2157-10-15 07:00:00,50912,0.8,mg/dL,3016723
4,10095417,2175-11-03 06:00:00,50912,0.9,mg/dL,3016723


In [13]:
del creatinine

#### Albumin

In [16]:
pat_alb=process_labs(hosp,'albumin')
pat_alb['subject_id'].nunique()

Count of patients before the admittime- dischtime filter:  27111

Count of patients after the admittime- dischtime filter:  11305


10081

#### ALT

In [17]:
pat_alt=process_labs(hosp,'alt')
pat_alt['subject_id'].nunique()

Count of patients before the admittime- dischtime filter:  27111

Count of patients after the admittime- dischtime filter:  15689


14530

#### Bilirubin

In [18]:
pat_bilirubin=process_labs(hosp,'bilirubin')
pat_bilirubin['subject_id'].nunique()

Count of patients before the admittime- dischtime filter:  27111

Count of patients after the admittime- dischtime filter:  15354


14173

#### AST

In [19]:
pat_ast=process_labs(hosp,'ast')
pat_ast['subject_id'].nunique()

Count of patients before the admittime- dischtime filter:  27111

Count of patients after the admittime- dischtime filter:  15671


14510

#### Ammonia


In [20]:
pat_ammonia=process_labs(hosp,'ammonia')
pat_ammonia['subject_id'].nunique()

Count of patients before the admittime- dischtime filter:  27111

Count of patients after the admittime- dischtime filter:  464


326

In [21]:
##Testing why its low
ammonia=pd.read_csv(f'{hosp}ammonia_lab.csv',index_col=0)
pat_a=pd.merge(pat_adm,ammonia
               ,how='left',on='subject_id')
#Keep only those charttimes btn admit and dischtime for a patient
pat_a['charttime'] = pd.to_datetime(pat_a['charttime'])
pat_a['admittime'] = pd.to_datetime(pat_a['admittime'])
pat_a['dischtime'] = pd.to_datetime(pat_a['dischtime'])

pat_a = pat_a[(pat_a['charttime'] >= pat_a['admittime']) & (pat_a['charttime'] <= pat_a['dischtime'])]
pat_a['subject_id'].nunique()

464

#### BUN

In [22]:
pat_bun=process_labs(hosp,'bun')
pat_bun['subject_id'].nunique()

  lab_df = pd.read_csv(f'{hosp}{lab_name}_lab.csv', index_col=0,on_bad_lines='skip')


Count of patients before the admittime- dischtime filter:  24494

Count of patients after the admittime- dischtime filter:  24389


23656

#### Glucose 

In [23]:
pat_glucose=process_labs(hosp,'glucose')
pat_glucose['subject_id'].nunique()

Count of patients before the admittime- dischtime filter:  27111

Count of patients after the admittime- dischtime filter:  26918


26104

#### CK

In [25]:
pat_ck=process_labs(hosp,'ck')
pat_ck['subject_id'].nunique()

Count of patients before the admittime- dischtime filter:  27111

Count of patients after the admittime- dischtime filter:  6710


5846

In [None]:
pat_ck.head()

#### Platelets

In [None]:
#pat_platelet=process_labs(hosp,'platelet')
#pat_platelet['subject_id'].nunique()

#### Calcium

In [None]:
#pat_calcium=process_labs(hosp,'calcium')
#pat_calcium['subject_id'].nunique()

In [None]:
##Pending - calcium,ck,platelets,lipase,troponin,wbc

In [26]:
df1=pd.merge(pat_adm,pat_scr,on='subject_id',how='left')
del pat_scr
df2=pd.merge(df1,pat_alb,on='subject_id',how='left')
del df1,pat_alb
df3=pd.merge(df2,pat_alt,on='subject_id',how='left')
del df2,pat_alt
df4=pd.merge(df3,pat_bilirubin,on='subject_id',how='left')
del df3,pat_bilirubin
df5=pd.merge(df4,pat_ast,on='subject_id',how='left')
del df4,pat_ast
df6=pd.merge(df5,pat_bun,on='subject_id',how='left')
del df5,pat_bun
df7=pd.merge(df6,pat_glucose,on='subject_id',how='left')
del df6,pat_glucose
df8=pd.merge(df7,pat_ammonia,on='subject_id',how='left')
del df7,pat_ammonia
df9=pd.merge(df8,pat_ck,on='subject_id',how='left')
del df8,pat_ck

In [None]:
#df10=pd.merge(df9,pat_calcium,on='subject_id',how='left')
#del df9,pat_calcium
#df11=pd.merge(df10,pat_platelet,on='subject_id',how='left')
#del df10,pat_calcium

In [27]:
missing_percentages = round(((df9.isnull().sum() / len(df9)) * 100),2)

# Create a DataFrame to display missing percentages
missing_df = pd.DataFrame({'Missing Percentage': missing_percentages})
missing_df.index.name = 'Column'
missing_df=missing_df.reset_index()
print(missing_df)

                    Column  Missing Percentage
0               subject_id                0.00
1                   gender                0.00
2                  hadm_id                0.00
3                admittime                0.00
4                dischtime                0.00
5           marital_status                3.77
6                     race                0.00
7                      age                0.00
8      earliest_creatinine                0.00
9                     egfr                0.00
10           increase_flag                0.00
11  start_time_of_increase               78.65
12       min_time_required                0.00
13    charttime_creatinine                2.94
14       itemid_creatinine                2.94
15              creatinine                2.96
16     valueuom_creatinine                2.94
17        loinc_creatinine                2.94
18       charttime_albumin               62.82
19          itemid_albumin               62.82
20           

In [28]:
#Save the dfs:
df9.to_csv(f'{base}labs_final.csv')

missing_df.to_csv(f'{base}missing_lab_percent.csv')
del missing_df