In [1]:
## Import the modules needed to run the code. thunderpack must be installed first; if it is missing, run: pip install thunderpack
import numpy as np
import pandas as pd
from tqdm import tqdm
from thunderpack import ThunderReader  
import gc
import re

In [2]:
# Plan for this notebook:
#   1. Pull the BDSPPatientID, admission date, and DiagnosisCodeWithDots for each patient.
#   2. Randomly select 10,000 of the patients.
#   3. For each patient, iterate over the ICD codes and check that the code falls
#      within a certain time period of the patient being seen.
#   4. Count the number of patients who have a positive ICD code.
#   5. Estimate the overall error rate.

# Start with BIDMC first
df = pd.read_csv(
    "/home/gregory178/Desktop/NAX project/NAX_dive/complete_df_initial.csv"
)

In [3]:
# Shuffle every row, keep the first row encountered for each patient (i.e. a
# random row per patient), then draw 10,000 patients from the de-duplicated table.
random = df.sample(
    frac=1,
    ignore_index=True,
    random_state=2024,
)
unique = random.drop_duplicates(
    subset='BDSPPatientID',
    ignore_index=True,
    keep='first',
)
random_unique = unique.sample(
    n=10000,
    ignore_index=True,
    random_state=2024,
)

# Plain Python list of the sampled patient IDs, used to filter the ICD partitions.
ids = random_unique['BDSPPatientID'].tolist()



In [4]:
# Collect the ICD rows of the sampled patients from every partition of the
# BIDMC ICD thunderpack and stack them into a single table, `icd_df`.
reader = ThunderReader('/media/gregory178/Thunderpacks/Dropbox/zz_EHR_Thunderpacks/BIDMC/thunderpack_icd_9_10_nax_1m_BIDMC')
key_length = len(list(reader.keys()))
dfs = []
for i in tqdm(range(1, key_length + 1)):
    # Use a dedicated name for the partition. Rebinding `df` here would clobber
    # the notes table loaded earlier, so re-running the sampling cell after this
    # one would silently sample from an ICD partition instead.
    icd_partition = reader[f'ICD_partition_{i}']
    icd_partition = (
        icd_partition[icd_partition['BDSPPatientID'].isin(ids)]
        .drop(columns=['BDSPEncounterID', 'DiagnosisSequenceNumber', 'DiagnosisPoaInd',
                       'ShortDescription', 'LongDescription', 'DiagnosisType',
                       'BDSPLastModifiedDTS', 'code_type', 'DischargeDate'])
        # .assign returns a new frame, so no column is written into a filtered slice.
        .assign(AdmissionDate=lambda part: pd.to_datetime(part['AdmissionDate']))
    )
    dfs.append(icd_partition)
icd_df = pd.concat(dfs, axis=0, ignore_index=True)

100%|██████████| 36/36 [00:37<00:00,  1.04s/it]


In [5]:
# Preview the first five ICD rows collected for the sampled patients.
icd_df.head(n=5)

Unnamed: 0,BDSPPatientID,DiagnosisCode,DiagnosisCodeWithDots,AdmissionDate
0,150071046,41401,414.01,2011-06-15
1,150071046,42983,429.83,2011-06-15
2,150071046,5990,599.0,2011-06-15
3,150071046,25000,250.0,2011-06-15
4,150071046,2724,272.4,2011-06-15


In [6]:
# Preview the first five sampled notes (one row per patient).
random_unique.head(n=5)

Unnamed: 0,BDSPPatientID,NoteTypeFull,Service,CreateDate,DeidentifiedName
0,151310180,Initial note,Cardiology,2023-05-26,Notes_1131168764_3416348191_20230526.txt
1,151188065,Initial note,Pain Management,2014-04-21,Notes_1131046805_426282968_20140421.txt
2,150020366,Initial note,Patient Financial Serives,2023-05-24,Notes_1129879243_3732266378_20230524.txt
3,150622094,Initial note,Nephrology,2020-02-15,Notes_1130480893_1099015134_20200215.txt
4,150954407,Initial note,Gastroenterology,2016-08-01,Notes_1130813198_2132813469_20160801.txt


In [21]:
import pandas as pd
import re

# Diagnosis-code prefixes (written with dots) that count as a positive ICD hit.
# The dots are escaped so they only match a literal '.'; an unescaped '.' matches
# any character, which would also accept strings such as 'I6290' or '43211'.
pattern = re.compile(r'^(?:I62\.0|S06\.5|432\.1|852\.2|852\.3)')

def check_icd_in_period(id_, date, window_days=30):
    """Flag whether a patient has a matching ICD code close to a given date.

    Rows of the module-level ``icd_df`` are selected where ``BDSPPatientID``
    equals ``id_`` and ``AdmissionDate`` lies strictly between
    ``date - window_days`` and ``date + window_days`` (the window extends on
    BOTH sides of ``date``, and both bounds are exclusive). The selected
    ``DiagnosisCodeWithDots`` values are then tested against the module-level
    compiled regex ``pattern``.

    Parameters
    ----------
    id_ : scalar
        Patient identifier, compared for equality with ``icd_df['BDSPPatientID']``.
    date : str or datetime-like
        Reference date (e.g. the note date); parsed with ``pd.to_datetime``.
    window_days : int, default 30
        Half-width of the search window in days.

    Returns
    -------
    int
        1 if at least one selected code matches ``pattern``, otherwise 0.

    Notes
    -----
    If ``icd_df['AdmissionDate']`` is not already a datetime column it is
    converted in place on the shared ``icd_df``.
    """
    # pd.to_datetime accepts strings and datetime-like objects alike, so the
    # reference date is normalised unconditionally.
    date = pd.to_datetime(date)

    # Symmetric window around the reference date.
    half_window = pd.DateOffset(days=window_days)
    start_date = date - half_window
    end_date = date + half_window

    # Ensure AdmissionDate is comparable with the Timestamp bounds.
    if not pd.api.types.is_datetime64_any_dtype(icd_df['AdmissionDate']):
        icd_df['AdmissionDate'] = pd.to_datetime(icd_df['AdmissionDate'])

    # Rows for this patient whose admission date falls inside the open window.
    mask = (
        (icd_df['BDSPPatientID'] == id_)
        & (icd_df['AdmissionDate'] > start_date)
        & (icd_df['AdmissionDate'] < end_date)
    )
    codes = icd_df.loc[mask, 'DiagnosisCodeWithDots']

    # Nothing in the window: no need to run the regex.
    if codes.empty:
        return 0

    # na=False makes missing diagnosis codes count as "no match" instead of
    # propagating NaN into the boolean reduction.
    return int(codes.str.contains(pattern, na=False).any())

def _icd_flag_for_note(note_row):
    """Return the ICD flag (1/0) for one row of `random_unique`."""
    return check_icd_in_period(note_row['BDSPPatientID'], note_row['CreateDate'])


# Label every sampled note with whether a matching ICD code was found around its date.
random_unique['ICD'] = random_unique.apply(_icd_flag_for_note, axis=1)


In [22]:
# Share of sampled patients that are ICD-positive / ICD-negative; these weights
# are reused later when combining the per-group error rates.
prev_ICD_p = sum(random_unique["ICD"]) / len(random_unique)
prev_ICD_n = sum(1-random_unique["ICD"]) / len(random_unique)

# Summary of the random sample.
icd_negative_rows = random_unique[random_unique["ICD"] == 0]
print(f'Total random patients: {len(random_unique)}')
print(f'Total +ICD: {sum(random_unique["ICD"])}')
print(f'Total -ICD: {len(icd_negative_rows)}')
print(f'Prevalence: {prev_ICD_p}')


Total random patients: 10000
Total +ICD: 92
Total -ICD: 9908
Prevalence: 0.0092


In [34]:
# Persist the sampled patients together with their ICD flag (row index not written).
random_unique.to_csv(path_or_buf='BI_random_unique.csv', index=False)

In [30]:
# Get the group variable (ICD+ / ICD-) for the testing feature matrix, so that
# the error rate (1 - accuracy) can be computed within each group.
# TODO: the original author noted being unsure what to do in this step -- review.

# load the testing feature matrix
df_test = pd.read_csv('/home/gregory178/Desktop/NAX project/NAX_SDH/Complete_merged_feature_matrix_notes_CPT_and_ICD_.csv')

# load the model prediction for the testing feature matrix
# TODO (Haoqi -> Greg): do you know where this file is? We generated it in
# (maybe) step14, but it is not there anymore.
df_pred = pd.read_csv('BI_y_and_y_pred.csv')
# validate='1:1' makes the merge raise if BDSPPatientID is duplicated on either side.
df_pred = df_test.merge(df_pred, on='BDSPPatientID', how='inner', validate='1:1')

# get the group

# this is an old method to get ICD+/ICD- group, based on searching ICD codes
#df_test['Group'] = df_test.apply(lambda row: check_icd_in_period(row['BDSPPatientID'], row['ContactDate']), axis=1)

# however, there was some problem with this method
# so we decided to use a list of patient IDs (BDSPPatientID) during sampling cohort construction
df_icd_p = pd.read_csv('/home/gregory178/Desktop/NAX project/NAX_SDH/bidmc_pos_icd.csv')

# Group = 1 when the patient appears in the ICD-positive ID list, else 0.
# Series.isin replaces np.in1d, which is deprecated as of NumPy 2.0.
df_pred['Group'] = df_pred['BDSPPatientID'].isin(df_icd_p['BDSPPatientID']).astype(int)

In [31]:
# Number of test rows in the ICD+ group (Group == 1), then in the ICD- group (Group == 0).
print(df_pred['Group'].eq(1).sum())
print(df_pred['Group'].eq(0).sum())

750
750


In [33]:
# Persist the merged predictions together with their group label (row index not written).
df_pred.to_csv(path_or_buf='BI_df_pred.csv', index=False)

In [32]:
# Error rate (1 - accuracy) within each group.
# An earlier version compared df_test.annotation with y_pred using masks built
# from df_test.Group; this version takes labels, predictions and group from the
# merged df_pred table only.
prediction_is_correct = df_pred.y == df_pred.y_pred
in_icd_pos_group = df_pred.Group == 1
in_icd_neg_group = df_pred.Group == 0

error_rater_ICD_p = 1 - np.mean(prediction_is_correct[in_icd_pos_group])
error_rater_ICD_n = 1 - np.mean(prediction_is_correct[in_icd_neg_group])

# Final error rate: the per-group error rates weighted by the ICD+ / ICD-
# prevalence measured on the random patient sample.
final_error_rate = (
    error_rater_ICD_p * prev_ICD_p
    + error_rater_ICD_n * prev_ICD_n
)

print(final_error_rate)



0.003557333333333334


In [None]:
# TODO: this snippet needs to be moved into the other (model training/evaluation) code.
# NOTE(review): `matrix`, `y_holdout`, `y_pred` and `y_pred_proba` are not defined
# in the visible cells of this notebook -- confirm where they come from before running.

# save model prediction
prediction_columns = {
    'BDSPPatientID': matrix.BDSPPatientID,
    'y': y_holdout,
    'y_pred': y_pred,
    'y_pred_proba': y_pred_proba,
}
df_pred = pd.DataFrame(prediction_columns)
print(df_pred)
df_pred.to_csv('test_both_hospitals_y_and_y_pred.csv', index=False)



















In [9]:

# # Sample 10,000 random rows with a specific random state for reproducibility
# sampled_df = complete_df_initial.sample(n=10000, random_state=101)

# # Find duplicated BDSPPatientID rows
# duplicates_df = sampled_df[sampled_df.duplicated(subset='BDSPPatientID', keep=False)]

# # Identify all duplicate IDs
# duplicate_ids = duplicates_df['BDSPPatientID'].unique()

# # Randomly select which duplicates to keep
# def keep_random_duplicates(df, duplicate_ids):
#     # For each duplicate ID, randomly select one occurrence to keep
#     df_unique = pd.DataFrame()
#     for patient_id in duplicate_ids:
#         df_id = df[df['BDSPPatientID'] == patient_id]
#         df_unique = pd.concat([df_unique, df_id.sample(n=1, random_state=102)])  # Randomly keep one instance
    
#     return df_unique

# # Get one instance of each duplicate BDSPPatientID
# unique_duplicates_df = keep_random_duplicates(sampled_df, duplicate_ids)

# # Filter out the duplicates from the sampled_df
# non_duplicate_df = sampled_df[~sampled_df['BDSPPatientID'].isin(duplicate_ids)]

# # Combine the kept duplicates with non-duplicates
# cleaned_sampled_df = pd.concat([non_duplicate_df, unique_duplicates_df], ignore_index=True)

# # Check how many more rows are needed to reach 10,000 unique BDSPPatientID
# current_unique_count = cleaned_sampled_df['BDSPPatientID'].nunique()
# additional_rows_needed = 10000 - current_unique_count

# if additional_rows_needed > 0:
#     # Identify non-sampled patients
#     remaining_patients = complete_df_initial[~complete_df_initial['BDSPPatientID'].isin(cleaned_sampled_df['BDSPPatientID'])]
    
#     # Sample additional rows to reach the desired count
#     additional_df = remaining_patients.sample(n=additional_rows_needed, random_state=103)
    
#     # Combine cleaned sample with additional rows
#     final_df = pd.concat([cleaned_sampled_df, additional_df], ignore_index=True)
# else:
#     final_df = cleaned_sampled_df

# # Ensure final_df contains 10,000 unique BDSPPatientID values
# final_df = final_df.drop_duplicates(subset='BDSPPatientID', keep='first')
# final_df = final_df.reset_index(drop=True)

# # Check for and remove any additional duplicates, ensuring exactly 10,000 rows
# while final_df['BDSPPatientID'].nunique() < 10000:
#     current_count = final_df['BDSPPatientID'].nunique()
#     additional_needed = 10000 - current_count
    
#     remaining_patients = complete_df_initial[~complete_df_initial['BDSPPatientID'].isin(final_df['BDSPPatientID'])]
#     additional_df = remaining_patients.sample(n=additional_needed, random_state=104)
    
#     final_df = pd.concat([final_df, additional_df], ignore_index=True)
#     final_df = final_df.drop_duplicates(subset='BDSPPatientID', keep='first').reset_index(drop=True)

# # Save the final DataFrame to a new CSV file
# final_df.to_csv("/home/gregory178/Desktop/NAX project/NAX_SDH/10000_rows_BI.csv", index=False)

# print("Final DataFrame with 10,000 unique BDSPPatientID rows saved successfully.")

