In [1]:
## Import the modules needed to run the notebook. thunderpack must be installed first if it is missing: pip install thunderpack
import numpy as np
import pandas as pd
from tqdm import tqdm
from thunderpack import ThunderReader  
import gc
import re

In [2]:
##We are now going to pull the BDSPPatientID, Admission date, and DiagnosisCodeWithDots for each patient.
##We then need to randomly select 10,000 of them
###Then we need to iterate for ICD code and make sure that the ICD code is within a cetain time period of them being seen. 
#Then we need to pull out the number of patients who have a positive ICD code. 
#Then we need to estimate the overall error rate

#Start with MGB 

df=pd.read_csv("/home/gregory178/Desktop/NAX project/NAX_dive/complete_df_discharge.csv")

In [3]:
random = df.sample(frac=1, random_state=2024, ignore_index=True)
unique = random.drop_duplicates(subset='BDSPPatientID', keep='first', ignore_index=True)
random_unique = unique.sample(n=10000, random_state=2024, ignore_index=True)
ids = list(random_unique['BDSPPatientID'])



In [4]:
reader = ThunderReader('/media/gregory178/Thunderpacks/Dropbox/zz_EHR_Thunderpacks/MGB/thunderpack_icd_9_10_1m_MGB')
key_length = len(list(reader.keys()))
dfs = []
for i in tqdm(range(1, key_length + 1)):
    df = reader[f'ICD_partition_{i}']
    df = df[df['BDSPPatientID'].isin(ids)]
    df  = df.drop(columns=['EncounterLineNBR', 'BDSPEncounterID', 'code_type', 'BDSPLastModifiedDTS', 
                                        'DiagnosisLinkedProblemID', 'DiagnosisChronicFLG', 'PrimaryDiagnosisFLG', 
                                        'DiagnosisDSC', 'DiagnosisNM', 'ICDDSC', 'ICDLineNBR', 'ShiftedUpdateDTS'])
    df['ShiftedContactDTS'] = pd.to_datetime(df['ShiftedContactDTS'])
    dfs.append(df)
icd_df = pd.concat(dfs, axis=0, ignore_index=True)

100%|██████████| 511/511 [23:52<00:00,  2.80s/it]


In [5]:
icd_df.head()

Unnamed: 0,BDSPPatientID,ShiftedContactDTS,ICDCD
0,113778980.0,2018-03-04,728.85
1,111737569.0,2021-06-17,719.45
2,114199398.0,2022-09-19,719.45
3,116668668.0,2021-04-01,411.1
4,111640020.0,2021-03-29,296.32


In [6]:
random_unique.head()

Unnamed: 0,BDSPPatientID,ContactDate,InpatientNoteTypeDSC,DeidentifiedName
0,116622596,2019-01-30,Discharge Summary,Notes_13492329505_2726548681_20190130.txt
1,121898365,2019-08-13,Discharge Summary,Notes_13458257346_2404061611_20190813.txt
2,120766189,2021-05-20,Discharge Summary,Notes_13615779497_5860502858_20210520.txt
3,113679546,2022-04-03,Discharge Summary,Notes_13533032694_5821209330_20220403.txt
4,120497157,2022-09-21,Discharge Summary,Notes_13695542357_8521726987_20220921.txt


In [7]:
import pandas as pd
import re

# Updated pattern
pattern = re.compile(r'^(?:I62.0|S06.5|432.1|852.2|852.3)')

def check_icd_in_period(id_, date):
    # Convert date to pandas Timestamp if it's not already
    if isinstance(date, str):
        date = pd.to_datetime(date)
        
       # Define the date range (one month leading up to note)
    start_date = date - pd.DateOffset(days=30)
    end_date = date + pd.DateOffset(days=30)
    
    # Ensure ShiftedContactDTS is a pandas Timestamp
    if not pd.api.types.is_datetime64_any_dtype(icd_df['ShiftedContactDTS']):
        icd_df['ShiftedContactDTS'] = pd.to_datetime(icd_df['ShiftedContactDTS'])
    
    # Filter icd_df for matching ID and date range
    mask = (icd_df['BDSPPatientID'] == id_) & (icd_df['ShiftedContactDTS'] > start_date) & (icd_df['ShiftedContactDTS'] < end_date)
    filtered_df = icd_df.loc[mask]
    
    
    # Check for regex pattern match in 'ICDCD' column
    if filtered_df['ICDCD'].str.match(pattern).any():
        return 1
    return 0

# Apply the function to each row of random_unique
random_unique['ICD'] = random_unique.apply(lambda row: check_icd_in_period(row['BDSPPatientID'], row['ContactDate']), axis=1)


In [8]:
print(f'Total random patients: {len(random_unique)}')
print(f'Total +ICD: {sum(random_unique["ICD"])}')
print(f'Total -ICD: {len(random_unique[random_unique["ICD"] == 0])}')
print(f'Prevalence: {sum(random_unique["ICD"]) / len(random_unique)}')

prev_ICD_p = sum(random_unique["ICD"]) / len(random_unique)
prev_ICD_n = sum(1-random_unique["ICD"]) / len(random_unique)


Total random patients: 10000
Total +ICD: 237
Total -ICD: 9763
Prevalence: 0.0237


In [16]:
random_unique.to_csv('MGB_random_unique.csv', index=False)

In [13]:
# get group variable for the testing feature matrix, so that we can get error rate (1-accuracy) in each group
#Not really sure what to do here. 

# load the testing feature matrix
df_test = pd.read_csv('/home/gregory178/Desktop/NAX project/NAX_SDH/Complete_merged_feature_matrix_notes_CPT_and_ICD_.csv')

# load the model prediction for the testing feature matrix
df_pred = pd.read_csv('MGB_y_and_y_pred.csv') ### Hi Greg, do you know where is file is? We generated it in (maybe) step14, but it's not there anymore. -- Haoqi
df_pred = df_test.merge(df_pred, on='BDSPPatientID', how='inner', validate='1:1')

# get the group

# this is an old method to get ICD+/ICD- group, based on searching ICD codes
#df_test['Group'] = df_test.apply(lambda row: check_icd_in_period(row['BDSPPatientID'], row['ContactDate']), axis=1)

# however, there was some problem with this method
# so we decided to use a list of patient IDs (BDSPPatientID) during sampling cohort construction
df_icd_p = pd.read_csv('/home/gregory178/Desktop/NAX project/NAX_SDH/mgb_pos_icd.csv')

df_pred['Group'] = np.in1d(df_pred.BDSPPatientID, df_icd_p.BDSPPatientID).astype(int)


In [14]:
print((df_pred.Group==1).sum())
print((df_pred.Group==0).sum())

750
749


In [17]:
df_pred.to_csv('MGB_df_pred.csv', index=False)

In [15]:
# get error rate per group

### removed old code
#y_pred = df_pred.y_pred
#error_rater_ICD_p = 1 - np.mean( df_test.annotation[df_test.Group==1] == y_pred[df_test.Group==1] )
#error_rater_ICD_n = 1 - np.mean( df_test.annotation[df_test.Group==0] == y_pred[df_test.Group==0] )
### this is the new code
error_rater_ICD_p = 1 - np.mean( df_pred.y[df_pred.Group==1] == df_pred.y_pred[df_pred.Group==1] )
error_rater_ICD_n = 1 - np.mean( df_pred.y[df_pred.Group==0] == df_pred.y_pred[df_pred.Group==0] )

# get the final error rate!!
final_error_rate = error_rater_ICD_p*prev_ICD_p + error_rater_ICD_n*prev_ICD_n

print(final_error_rate)



0.007196813885180288


In [10]:
# Total random patients: 10000
# Total +ICD: 237
# Total -ICD: 9763
# Prevalence: 0.0237
