In [None]:
#Optional step 2 creates the sampling cohort for positive ICD codes for Mass General Hospital (MGB).
#Reminder: the ICD+ group is defined as patients who received at least one SDH-related ICD code.

In [2]:
#Import all modules needed to run the code. If thunderpack is not installed yet, run "pip install thunderpack" first.
import numpy as np 
import pandas as pd
from tqdm import tqdm
from thunderpack import ThunderReader  
import gc


In [3]:
#Open the thunderpack that holds the MGB ICD-9/ICD-10 records.
reader = ThunderReader('/media/gregory178/Thunderpacks/Dropbox/zz_EHR_Thunderpacks/MGB/thunderpack_icd_9_10_1m_MGB') 

#SDH-related ICD prefixes: I62.0, S06.5, 432.1, 852.2, 852.3.
#The dots are escaped so they only match a literal '.'; an unescaped '.' is a regex
#wildcard and would accept any character in that position.
SDH_ICD_PATTERN = r'^(?:I62\.0|S06\.5|432\.1|852\.2|852\.3)'
ICD_PLUS_COLUMNS = ['BDSPPatientID','ShiftedContactDTS','BDSPEncounterID', 'ICDCD']

#Loop over every partition key (ICD_partition_1 ... ICD_partition_510) and keep only the rows
#whose ICD code starts with one of the ICD+ prefixes.
dfs = []
for i in tqdm(range(1, 511)):    
    df = reader[f'ICD_partition_{i}']
    is_sdh = df.ICDCD.astype(str).str.contains(SDH_ICD_PATTERN)
    #Keep only the needed columns and de-duplicate per partition, so less data is held
    #in memory until the final concatenation.
    df = df.loc[is_sdh, ICD_PLUS_COLUMNS].drop_duplicates()
    dfs.append(df)

#Concatenate the filtered partitions into one dataframe. The same row can occur in more
#than one partition, so de-duplicate once more on the combined frame.
df_icd_plus = pd.concat(dfs, axis=0, ignore_index=True).drop_duplicates()

#Note: the saved file has one row per unique (patient, contact date, encounter, ICD code)
#combination, so a patient ID can appear on several rows.
df_icd_plus.to_csv('patientIDs_ICD_plus_SDH_MGB.csv', index=False)   

In [4]:

import pandas as pd
import numpy as np
import gc
from tqdm import tqdm
#ThunderReader is imported here too, so this cell also runs on its own in a fresh kernel.
from thunderpack import ThunderReader

#Open the thunderpack file.
#Re-opening it here is only needed if you run the cells separately for memory purposes.
reader = ThunderReader('/media/gregory178/Thunderpacks/Dropbox/zz_EHR_Thunderpacks/MGB/thunderpack_icd_9_10_1m_MGB')

#Load in the positive ICD group for MGB.
df_icd_plus = pd.read_csv('patientIDs_ICD_plus_SDH_MGB.csv')

#Reminder: the ICD minus group is defined as patients who have never received any SDH-related ICD code.
#SDH-related ICD prefixes: I62.0, S06.5, 432.1, 852.2, 852.3. The dots are escaped so they only
#match a literal '.'; an unescaped '.' is a regex wildcard.
SDH_ICD_PATTERN = r'^(?:I62\.0|S06\.5|432\.1|852\.2|852\.3)'
#'ICDCD' is left out of the kept columns to save space, as it is not needed downstream.
ID_COLUMNS = ['BDSPPatientID','ShiftedContactDTS','BDSPEncounterID']

#First half of the partitions (ICD_partition_1 ... ICD_partition_249): drop the rows that carry an ICD+ code.
dfs = []
for i in tqdm(range(1, 250)):    
    partition = reader[f'ICD_partition_{i}']
    is_sdh = partition.ICDCD.astype(str).str.contains(SDH_ICD_PATTERN)
    #De-duplicate per partition so far fewer rows are held until the final concatenation.
    dfs.append(partition.loc[~is_sdh, ID_COLUMNS].drop_duplicates())
    #Release the raw partition before the next one is read; the trimmed copy lives on in dfs.
    del partition, is_sdh
    gc.collect()

#Concatenate all filtered dataframes into one dataframe and remove rows repeated across partitions.
df_icd_minus = pd.concat(dfs, axis=0, ignore_index=True).drop_duplicates()

#Conceptually: df_icd_minus = df_icd_minus - df_icd_plus. Any patient with at least one ICD+ row is
#removed entirely. Series.isin replaces np.in1d, which is deprecated in NumPy 2.0.
#NOTE(review): the IDs in df_icd_plus were parsed from CSV; confirm they have the same dtype as
#the IDs coming from the reader, otherwise nothing will match and nobody is excluded.
df_icd_minus = df_icd_minus[~df_icd_minus.BDSPPatientID.isin(df_icd_plus.BDSPPatientID)]

#Save as a CSV file without the ICD codes. At this point the file only covers partitions 1-249;
#the last cell of this notebook overwrites it with the merged result of both halves.
df_icd_minus.to_csv('patientIDs_ICD_minus2_SDH_MGB.csv', index=False)

100%|██████████| 249/249 [12:42<00:00,  3.06s/it]


In [5]:
import pandas as pd
import numpy as np
import gc
from tqdm import tqdm
#ThunderReader is imported here too, so this cell also runs on its own in a fresh kernel.
from thunderpack import ThunderReader

#Open the thunderpack file.
#Re-opening it here is only needed if you run the cells separately for memory purposes.
reader = ThunderReader('/media/gregory178/Thunderpacks/Dropbox/zz_EHR_Thunderpacks/MGB/thunderpack_icd_9_10_1m_MGB')

#Load in the positive ICD group for MGB.
df_icd_plus = pd.read_csv('patientIDs_ICD_plus_SDH_MGB.csv')

#Reminder: the ICD minus group is defined as patients who have never received any SDH-related ICD code.
#SDH-related ICD prefixes: I62.0, S06.5, 432.1, 852.2, 852.3. The dots are escaped so they only
#match a literal '.'; an unescaped '.' is a regex wildcard.
SDH_ICD_PATTERN = r'^(?:I62\.0|S06\.5|432\.1|852\.2|852\.3)'
#'ICDCD' is left out of the kept columns to save space, as it is not needed downstream.
ID_COLUMNS = ['BDSPPatientID','ShiftedContactDTS','BDSPEncounterID']

#Second half of the partitions (ICD_partition_250 ... ICD_partition_510): drop the rows that carry an ICD+ code.
dfs = []
for i in tqdm(range(250, 511)):    
    partition = reader[f'ICD_partition_{i}']
    is_sdh = partition.ICDCD.astype(str).str.contains(SDH_ICD_PATTERN)
    #De-duplicate per partition so far fewer rows are held until the final concatenation.
    dfs.append(partition.loc[~is_sdh, ID_COLUMNS].drop_duplicates())
    #Release the raw partition before the next one is read; the trimmed copy lives on in dfs.
    del partition, is_sdh
    gc.collect()

#Concatenate all filtered dataframes into one dataframe and remove rows repeated across partitions.
df_icd_minus = pd.concat(dfs, axis=0, ignore_index=True).drop_duplicates()

#Conceptually: df_icd_minus = df_icd_minus - df_icd_plus. Any patient with at least one ICD+ row is
#removed entirely. Series.isin replaces np.in1d, which is deprecated in NumPy 2.0.
#NOTE(review): the IDs in df_icd_plus were parsed from CSV; confirm they have the same dtype as
#the IDs coming from the reader, otherwise nothing will match and nobody is excluded.
df_icd_minus = df_icd_minus[~df_icd_minus.BDSPPatientID.isin(df_icd_plus.BDSPPatientID)]

#Save the second half (partitions 250-510) as a CSV file without the ICD codes.
df_icd_minus.to_csv('2patientIDs_ICD_minus2_SDH_MGB.csv', index=False)

100%|██████████| 261/261 [13:49<00:00,  3.18s/it]


In [6]:
#Load both halves of the ICD- cohort: the file written for partitions 1-249 first,
#then the file written for partitions 250-510.
df1, df2 = (
    pd.read_csv(csv_name)
    for csv_name in ('patientIDs_ICD_minus2_SDH_MGB.csv', '2patientIDs_ICD_minus2_SDH_MGB.csv')
)

#Stack the two halves on top of each other and drop any row that occurs in both.
combined_df = pd.concat([df1, df2], axis=0, ignore_index=True).drop_duplicates()

#Write the merged cohort. Note that this replaces the first-half file with the full result.
combined_df.to_csv('patientIDs_ICD_minus2_SDH_MGB.csv', index=False)