In [1]:
## Import the modules needed to run the code. If thunderpack is not installed yet, install it first: pip install thunderpack
import numpy as np
import pandas as pd
from tqdm import tqdm
from thunderpack import ThunderReader  
import gc

In [2]:
## Build the ICD-positive (ICD+) sampling cohort for BIDMC.
# Reminder: the ICD+ group is defined as patients who received at least one SDH-related ICD code.

# Location of the BIDMC ICD thunderpack. This is a machine-specific absolute path;
# change it here when running on another machine.
THUNDERPACK_PATH = '/media/gregory178/Thunderpacks/Dropbox/zz_EHR_Thunderpacks/BIDMC/thunderpack_icd_9_10_nax_1m_BIDMC'

# Anchored prefix match for the SDH-related codes (I62.0*, S06.5*, 432.1*, 852.2*, 852.3*).
# The dots are escaped: an unescaped '.' in a regex matches ANY character, so the
# pattern would also accept codes that have some other character in the dot position.
SDH_ICD_PATTERN = r'^(?:I62\.0|S06\.5|432\.1|852\.2|852\.3)'

# Columns written to the cohort file.
ICD_PLUS_COLUMNS = ['BDSPPatientID', 'AdmissionDate', 'BDSPEncounterID', 'DiagnosisCodeWithDots']

# Read the thunderpack file. `reader` is reused by the ICD-minus cell below.
reader = ThunderReader(THUNDERPACK_PATH)

# Partitions are read under the keys 'ICD_partition_1' ... 'ICD_partition_N',
# with N taken as the number of keys in the reader.
n_partitions = len(list(reader.keys()))

dfs = []

# Go over every partition and keep only the rows whose diagnosis code is SDH-related.
# Only the output columns are kept, so the concatenation below does not have to
# carry every other column of the matching rows.
for i in tqdm(range(1, n_partitions + 1)):
    df = reader[f'ICD_partition_{i}']
    # astype(str) turns missing codes into the string 'nan', which never matches the pattern.
    is_sdh = df.DiagnosisCodeWithDots.astype(str).str.contains(SDH_ICD_PATTERN, regex=True)
    dfs.append(df.loc[is_sdh, ICD_PLUS_COLUMNS])

# Concatenate the filtered rows of all partitions into one dataframe and keep each
# (patient, admission, encounter, code) combination once. A patient can still appear
# on several rows; this is NOT one row per patient.
df_icd_plus_BI = pd.concat(dfs, axis=0, ignore_index=True).drop_duplicates()

# Save as a CSV file
df_icd_plus_BI.to_csv('patientIDs_ICD_plus_SDH_BI.csv', index=False)


100%|██████████| 36/36 [00:46<00:00,  1.29s/it]


In [3]:
# Load the ICD-positive (ICD+) group for BIDMC from the CSV written by the ICD+ cell.
df_icd_plus_BI = pd.read_csv('patientIDs_ICD_plus_SDH_BI.csv')

# Reminder: the ICD-minus group is defined as patients who have never received any SDH-related ICD code.

# Anchored prefix match for the SDH-related codes (I62.0*, S06.5*, 432.1*, 852.2*, 852.3*).
# The dots are escaped: an unescaped '.' in a regex matches ANY character, not only a literal dot.
SDH_ICD_PATTERN = r'^(?:I62\.0|S06\.5|432\.1|852\.2|852\.3)'

# Columns written to the cohort file (the diagnosis code itself is not part of the output).
ICD_MINUS_COLUMNS = ['BDSPPatientID', 'AdmissionDate', 'BDSPEncounterID']

# Partitions are read under the keys 'ICD_partition_1' ... 'ICD_partition_N',
# with N taken as the number of keys in the reader.
n_partitions = len(list(reader.keys()))

dfs = []

# Go over every partition and keep the rows whose diagnosis code is NOT SDH-related.
for i in tqdm(range(1, n_partitions + 1)):
    df = reader[f'ICD_partition_{i}']
    # astype(str) turns missing codes into the string 'nan', which never matches,
    # so rows without a code are kept here.
    is_sdh = df.DiagnosisCodeWithDots.astype(str).str.contains(SDH_ICD_PATTERN, regex=True)

    # Store only the output columns, de-duplicated per partition. The final
    # drop_duplicates() below yields the same rows in the same order either way,
    # but the list now holds far less data than the full filtered partitions.
    dfs.append(df.loc[~is_sdh, ICD_MINUS_COLUMNS].drop_duplicates())

    # Release the full partition before loading the next one.
    del df
    gc.collect()

# Concatenate the filtered rows of all partitions into one dataframe and keep each
# (patient, admission, encounter) combination once across partitions.
df_icd_minus_BI = pd.concat(dfs, axis=0, ignore_index=True).drop_duplicates()

# The row filter above only removes SDH-coded rows; a patient with an SDH code can
# still be present through their other diagnoses. To keep only patients who never
# received an SDH ICD code, remove every patient that appears in the ICD+ group.
# Conceptually: df_icd_minus = df_icd_minus - df_icd_plus
# np.isin replaces np.in1d, which is deprecated since NumPy 2.0.
# NOTE(review): the ICD+ IDs are parsed from CSV while these IDs come from the reader;
# if their dtypes differ (e.g. int vs. str) nothing would match and no patient would
# be removed — confirm both BDSPPatientID columns have the same dtype.
is_icd_plus_patient = np.isin(df_icd_minus_BI.BDSPPatientID, df_icd_plus_BI.BDSPPatientID)
df_icd_minus_BI = df_icd_minus_BI[~is_icd_plus_patient]

# Save as a CSV file
df_icd_minus_BI.to_csv('patientIDs_ICD_minus2_SDH_BI.csv', index=False)



100%|██████████| 36/36 [01:07<00:00,  1.89s/it]
