In [None]:
#Step 19 reconstructs a cohort for BIDMC to estimate the error rate of the program. 

In [1]:
#Import the necessary modules.
import numpy as np
import pandas as pd
from tqdm import tqdm
from thunderpack import ThunderReader  
import gc
import re

In [2]:
#Read the BIDMC file into df (the patient sample is drawn from this table in the next step).
bidmc_csv_path = "/home/gregory178/Desktop/NAX project/NAX_SDH/complete_df_initial.csv"
df = pd.read_csv(bidmc_csv_path)

In [3]:
#Randomly shuffle the BIDMC data, keep one row per patient, then draw the random cohort.
SEED = 2024          #fixed seed so the same cohort is drawn on every run
COHORT_SIZE = 10000  #number of distinct patients in the cohort

#Shuffle every row first so that the row kept per patient below is a random one.
random = df.sample(frac=1, random_state=SEED, ignore_index=True)
#keep='first' after the shuffle retains a single row for each BDSPPatientID.
unique = random.drop_duplicates(subset='BDSPPatientID', keep='first', ignore_index=True)
#Sample the cohort and collect its patient IDs for filtering the ICD data.
random_unique = unique.sample(n=COHORT_SIZE, random_state=SEED, ignore_index=True)
ids = random_unique['BDSPPatientID'].tolist()

In [4]:
#Obtain ICD codes from the BIDMC thunderpack data, keeping only rows for the sampled patients (ids).
reader = ThunderReader('/media/gregory178/Thunderpacks/Dropbox/zz_EHR_Thunderpacks/BIDMC/thunderpack_icd_9_10_nax_1m_BIDMC')
key_length = len(list(reader.keys()))

#Columns dropped from every partition because the cohort reconstruction does not use them.
unused_columns = ['BDSPEncounterID', 'DiagnosisSequenceNumber', 'DiagnosisPoaInd',
                  'ShortDescription', 'LongDescription', 'DiagnosisType',
                  'BDSPLastModifiedDTS', 'code_type', 'DischargeDate']

dfs = []
#Partitions are read by key as 'ICD_partition_1' ... 'ICD_partition_<key_length>'.
for i in tqdm(range(1, key_length + 1)):
    #Use a dedicated name here: reusing `df` would overwrite the notes table loaded
    #from the CSV and break a re-run of the sampling step.
    icd_part = reader[f'ICD_partition_{i}']
    icd_part = (
        icd_part.loc[icd_part['BDSPPatientID'].isin(ids)]
        .drop(columns=unused_columns)
        #Parse AdmissionDate once here so later date comparisons work on Timestamps.
        .assign(AdmissionDate=lambda part: pd.to_datetime(part['AdmissionDate']))
    )
    dfs.append(icd_part)
icd_df = pd.concat(dfs, axis=0, ignore_index=True)

100%|██████████| 36/36 [00:37<00:00,  1.04s/it]


In [5]:
#Show the first five ICD rows to verify that the load and filtering worked.
icd_df.head(5)

Unnamed: 0,BDSPPatientID,DiagnosisCode,DiagnosisCodeWithDots,AdmissionDate
0,150071046,41401,414.01,2011-06-15
1,150071046,42983,429.83,2011-06-15
2,150071046,5990,599.0,2011-06-15
3,150071046,25000,250.0,2011-06-15
4,150071046,2724,272.4,2011-06-15


In [6]:
#Show the first five rows of the random patient cohort to verify the sampling worked.
random_unique.head(5)

Unnamed: 0,BDSPPatientID,NoteTypeFull,Service,CreateDate,DeidentifiedName
0,151310180,Initial note,Cardiology,2023-05-26,Notes_1131168764_3416348191_20230526.txt
1,151188065,Initial note,Pain Management,2014-04-21,Notes_1131046805_426282968_20140421.txt
2,150020366,Initial note,Patient Financial Serives,2023-05-24,Notes_1129879243_3732266378_20230524.txt
3,150622094,Initial note,Nephrology,2020-02-15,Notes_1130480893_1099015134_20200215.txt
4,150954407,Initial note,Gastroenterology,2016-08-01,Notes_1130813198_2132813469_20160801.txt


In [21]:
#Detect SDH occurrences: an SDH ICD code assigned within 30 days before or after the note date.
#The dots are escaped so they match a literal '.' (unescaped, 'I62.0' would also match e.g. 'I6200').
pattern = re.compile(r'^(?:I62\.0|S06\.5|432\.1|852\.2|852\.3)')

def check_icd_in_period(id_, date, icd=None, window_days=30):
    """Return 1 if the patient has a code matching `pattern` near `date`, else 0.

    Parameters
    ----------
    id_ : scalar
        Patient identifier compared against the 'BDSPPatientID' column.
    date : str or pandas.Timestamp
        Reference date; strings are parsed with pd.to_datetime.
    icd : pandas.DataFrame, optional
        Table with 'BDSPPatientID', 'AdmissionDate' and 'DiagnosisCodeWithDots'
        columns. Defaults to the notebook-level icd_df.
    window_days : int, optional
        Half-width of the search window in days (default 30). Both bounds are
        exclusive: date - window_days < AdmissionDate < date + window_days.

    Returns
    -------
    int
        1 if any code in the window starts with one of the SDH prefixes, else 0.

    Notes
    -----
    If 'AdmissionDate' is not already a datetime column it is converted in place
    on the table, so the conversion cost is paid only on the first call.
    """
    if icd is None:
        icd = icd_df

    #Convert date to pandas Timestamp
    if isinstance(date, str):
        date = pd.to_datetime(date)

    #Define the date range (window_days on either side of the date)
    start_date = date - pd.DateOffset(days=window_days)
    end_date = date + pd.DateOffset(days=window_days)

    #Ensure AdmissionDate is a pandas datetime column
    if not pd.api.types.is_datetime64_any_dtype(icd['AdmissionDate']):
        icd['AdmissionDate'] = pd.to_datetime(icd['AdmissionDate'])

    #Filter the ICD table for the matching ID and date range
    mask = (
        (icd['BDSPPatientID'] == id_)
        & (icd['AdmissionDate'] > start_date)
        & (icd['AdmissionDate'] < end_date)
    )
    codes = icd.loc[mask, 'DiagnosisCodeWithDots']

    #na=False: a missing code counts as "no match" instead of propagating NaN
    return int(codes.str.contains(pattern, na=False).any())

#Flag each cohort row: 1 if check_icd_in_period reports a match for that patient and note date, else 0.
def _icd_flag_for_row(row):
    """Run check_icd_in_period on one row's BDSPPatientID and CreateDate."""
    return check_icd_in_period(row['BDSPPatientID'], row['CreateDate'])

random_unique['ICD'] = random_unique.apply(_icd_flag_for_row, axis=1)

In [22]:
#Print cohort reconstruction data and keep the group prevalences for the weighted error rate.
n_total = len(random_unique)
n_icd_pos = int(random_unique["ICD"].sum())         #rows flagged 1
n_icd_neg = int((random_unique["ICD"] == 0).sum())  #rows flagged 0

print(f'Total random patients: {n_total}')
print(f'Total +ICD: {n_icd_pos}')
print(f'Total -ICD: {n_icd_neg}')
print(f'Prevalence: {n_icd_pos / n_total}')

#Share of the cohort with and without a matching ICD code.
prev_ICD_p = n_icd_pos / n_total
prev_ICD_n = (n_total - n_icd_pos) / n_total


Total random patients: 10000
Total +ICD: 92
Total -ICD: 9908
Prevalence: 0.0092


In [34]:
#Save the flagged random cohort to a csv (index column omitted).
random_unique_csv = 'BI_random_unique.csv'
random_unique.to_csv(random_unique_csv, index=False)

In [30]:
#Load the testing feature matrix
df_test = pd.read_csv('/home/gregory178/Desktop/NAX project/NAX_SDH/Complete_merged_feature_matrix_notes_CPT_and_ICD_.csv')

#Load the model prediction for the testing feature matrix and attach it to the features.
#validate='1:1' makes the merge raise if BDSPPatientID is duplicated on either side.
df_pred = pd.read_csv('BI_y_and_y_pred.csv')
df_pred = df_test.merge(df_pred, on='BDSPPatientID', how='inner', validate='1:1')

#Group = 1 for patients listed in the ICD-positive file, 0 for everyone else.
#Series.isin is used instead of np.in1d, which is deprecated as of NumPy 2.0.
df_icd_p = pd.read_csv('/home/gregory178/Desktop/NAX project/NAX_SDH/bidmc_pos_icd.csv')
df_pred['Group'] = df_pred['BDSPPatientID'].isin(df_icd_p['BDSPPatientID']).astype(int)

In [31]:
#Print the number of rows in each group: Group 1 first, then Group 0.
for group_label in (1, 0):
    print((df_pred.Group == group_label).sum())

750
750


In [33]:
#Save the merged features, predictions and Group flag to a csv (index column omitted).
df_pred_csv = 'BI_df_pred.csv'
df_pred.to_csv(df_pred_csv, index=False)

In [32]:
#Get the error rate per group (1 - share of rows where y equals y_pred),
#then weight each group's error rate by that group's prevalence in the random cohort.
in_icd_pos = df_pred.Group == 1
in_icd_neg = df_pred.Group == 0
is_correct = df_pred.y == df_pred.y_pred

error_rater_ICD_p = 1 - np.mean(is_correct[in_icd_pos])
error_rater_ICD_n = 1 - np.mean(is_correct[in_icd_neg])

final_error_rate = error_rater_ICD_p*prev_ICD_p + error_rater_ICD_n*prev_ICD_n
print(final_error_rate)

0.003557333333333334
