In [1]:
## Import the modules needed to run the code. thunderpack must be installed first if it is not already: pip install thunderpack
import numpy as np
import pandas as pd
from tqdm import tqdm
from thunderpack import ThunderReader  
import gc
import re

In [2]:
## Pull the BDSPPatientID, admission date, and DiagnosisCodeWithDots for each patient.
## Then randomly select 10,000 of them.
## For each selected patient, check whether an ICD code of interest was recorded within a certain time period of them being seen.
## Then count the patients who have a positive ICD code.
## Finally, estimate the overall error rate.

# Start with MGB: load the discharge table from CSV into `df`.
# NOTE(review): this is an absolute, machine-specific path — confirm it before running elsewhere.
# The sampled preview further down shows the columns BDSPPatientID, ContactDate,
# InpatientNoteTypeDSC and DeidentifiedName.

df=pd.read_csv("/home/gregory178/Desktop/NAX project/NAX_dive/complete_df_discharge.csv")

In [3]:
# Shuffle every row (frac=1) with a fixed seed, so the row kept per patient below is a random one.
# NOTE(review): the name `random` would shadow the stdlib module if it were ever imported here.
random = df.sample(frac=1, random_state=2024, ignore_index=True)
# Keep the first row per BDSPPatientID after the shuffle -> one row per patient.
unique = random.drop_duplicates(subset='BDSPPatientID', keep='first', ignore_index=True)
# Draw 10,000 of those patients (seeded); sample() raises ValueError if fewer rows are available.
random_unique = unique.sample(n=10000, random_state=2024, ignore_index=True)
# Plain list of the sampled patient IDs; used to filter the ICD partitions.
ids = list(random_unique['BDSPPatientID'])



In [4]:
# Open the MGB ICD thunderpack and collect the diagnosis rows of the sampled patients.
reader = ThunderReader('/media/gregory178/Thunderpacks/Dropbox/zz_EHR_Thunderpacks/MGB/thunderpack_icd_9_10_1m_MGB')
# Assumes every key in the pack is an 'ICD_partition_<i>' entry numbered from 1 — TODO confirm.
key_length = len(list(reader.keys()))
dfs = []
for i in tqdm(range(1, key_length + 1)):
    # Use a dedicated name for the partition so that the discharge frame `df`
    # loaded earlier is NOT overwritten; otherwise re-running the sampling cell
    # after this one would silently sample from the last ICD partition.
    partition_df = reader[f'ICD_partition_{i}']
    # Keep only rows belonging to the sampled patients.
    partition_df = partition_df[partition_df['BDSPPatientID'].isin(ids)]
    # Drop the columns that are not needed downstream.
    partition_df = partition_df.drop(columns=['EncounterLineNBR', 'BDSPEncounterID', 'code_type', 'BDSPLastModifiedDTS', 
                                        'DiagnosisLinkedProblemID', 'DiagnosisChronicFLG', 'PrimaryDiagnosisFLG', 
                                        'DiagnosisDSC', 'DiagnosisNM', 'ICDDSC', 'ICDLineNBR', 'ShiftedUpdateDTS'])
    # Parse contact dates once here so later date comparisons work on datetimes.
    partition_df['ShiftedContactDTS'] = pd.to_datetime(partition_df['ShiftedContactDTS'])
    dfs.append(partition_df)
# Stack the per-partition pieces into one frame with a fresh 0..n-1 index.
icd_df = pd.concat(dfs, axis=0, ignore_index=True)

100%|██████████| 511/511 [22:34<00:00,  2.65s/it]


In [5]:
# Preview the combined ICD rows kept for the sampled patients.
# NOTE(review): BDSPPatientID displays as a float here (e.g. 113778980.0) while the sampled
# frame shows integer IDs — confirm the ID comparisons behave as intended.
icd_df.head()

Unnamed: 0,BDSPPatientID,ShiftedContactDTS,ICDCD
0,113778980.0,2018-03-04,728.85
1,111737569.0,2021-06-17,719.45
2,114199398.0,2022-09-19,719.45
3,116668668.0,2021-04-01,411.1
4,111640020.0,2021-03-29,296.32


In [6]:
# Preview the sampled patients; ContactDate is the date later passed to check_icd_in_period.
random_unique.head()

Unnamed: 0,BDSPPatientID,ContactDate,InpatientNoteTypeDSC,DeidentifiedName
0,116622596,2019-01-30,Discharge Summary,Notes_13492329505_2726548681_20190130.txt
1,121898365,2019-08-13,Discharge Summary,Notes_13458257346_2404061611_20190813.txt
2,120766189,2021-05-20,Discharge Summary,Notes_13615779497_5860502858_20210520.txt
3,113679546,2022-04-03,Discharge Summary,Notes_13533032694_5821209330_20220403.txt
4,120497157,2022-09-21,Discharge Summary,Notes_13695542357_8521726987_20220921.txt


In [8]:
import pandas as pd
import re

# ICD code prefixes of interest, anchored at the start of the code string.
# The dots are escaped so each one matches a literal '.' only; an unescaped '.'
# matches ANY character, which would also accept strings such as 'I6200' or '43211'.
pattern = re.compile(r'^(?:I62\.0|S06\.5|432\.1|852\.2|852\.3)')

def check_icd_in_period(id_, date, window_days=30):
    """Return 1 if the patient has a matching ICD code near ``date``, else 0.

    Looks in the module-level ``icd_df`` for rows whose ``BDSPPatientID`` equals
    ``id_`` and whose ``ShiftedContactDTS`` lies strictly between
    ``date - window_days`` and ``date + window_days`` (both endpoints excluded),
    then tests those rows' ``ICDCD`` values against the module-level ``pattern``.

    Parameters
    ----------
    id_
        Patient identifier compared against ``icd_df['BDSPPatientID']``.
    date : str or datetime-like
        Anchor date; anything ``pd.to_datetime`` accepts.
    window_days : int, default 30
        Number of days searched on EACH side of ``date``.

    Returns
    -------
    int
        1 when at least one code inside the window matches ``pattern``, otherwise 0.

    Notes
    -----
    If ``icd_df['ShiftedContactDTS']`` is not already a datetime column, it is
    converted in place on the shared ``icd_df`` (later calls then skip the conversion).
    """
    # Normalise the anchor date: handles strings and datetime.date objects and
    # leaves pandas Timestamps unchanged.
    date = pd.to_datetime(date)

    # The window spans window_days before AND after the anchor date.
    start_date = date - pd.DateOffset(days=window_days)
    end_date = date + pd.DateOffset(days=window_days)

    # Ensure ShiftedContactDTS holds datetimes so the comparisons below are valid.
    if not pd.api.types.is_datetime64_any_dtype(icd_df['ShiftedContactDTS']):
        icd_df['ShiftedContactDTS'] = pd.to_datetime(icd_df['ShiftedContactDTS'])

    contact_dts = icd_df['ShiftedContactDTS']
    in_window = (
        (icd_df['BDSPPatientID'] == id_)
        & (contact_dts > start_date)
        & (contact_dts < end_date)
    )
    codes = icd_df.loc[in_window, 'ICDCD']

    # na=False: a missing code counts as "no match" instead of propagating NaN.
    has_match = codes.str.match(pattern, na=False).any()
    return 1 if has_match else 0

# Flag each sampled patient: 1 if a matching ICD code falls in the window around ContactDate, else 0.
random_unique['ICD'] = [
    check_icd_in_period(patient_id, contact_date)
    for patient_id, contact_date in zip(random_unique['BDSPPatientID'], random_unique['ContactDate'])
]


In [11]:
# Summarise the ICD flag over the sampled patients.
n_patients = len(random_unique)
n_icd_positive = sum(random_unique["ICD"])
n_icd_negative = len(random_unique[random_unique["ICD"] == 0])

print(f'Total random patients: {n_patients}')
print(f'Total +ICD: {n_icd_positive}')
print(f'Total -ICD: {n_icd_negative}')
print(f'Prevalence: {n_icd_positive / n_patients}')


Total random patients: 10000
Total +ICD: 214
Total -ICD: 9786
Prevalence: 0.0214


In [None]:

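# NOTE: Everything in this cell is commented out and is not executed. It refers to
# `complete_df_initial`, which is not defined in the cells shown in this notebook; the
# sampling cell near the top (shuffle, drop duplicate BDSPPatientID, sample 10,000)
# appears to perform the patient selection instead.
# # Sample 10,000 random rows with a specific random state for reproducibility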
# sampled_df = complete_df_initial.sample(n=10000, random_state=101)

# # Find duplicated BDSPPatientID rows
# duplicates_df = sampled_df[sampled_df.duplicated(subset='BDSPPatientID', keep=False)]

# # Identify all duplicate IDs
# duplicate_ids = duplicates_df['BDSPPatientID'].unique()

# # Randomly select which duplicates to keep
# def keep_random_duplicates(df, duplicate_ids):
#     # For each duplicate ID, randomly select one occurrence to keep
#     df_unique = pd.DataFrame()
#     for patient_id in duplicate_ids:
#         df_id = df[df['BDSPPatientID'] == patient_id]
#         df_unique = pd.concat([df_unique, df_id.sample(n=1, random_state=102)])  # Randomly keep one instance
    
#     return df_unique

# # Get one instance of each duplicate BDSPPatientID
# unique_duplicates_df = keep_random_duplicates(sampled_df, duplicate_ids)

# # Filter out the duplicates from the sampled_df
# non_duplicate_df = sampled_df[~sampled_df['BDSPPatientID'].isin(duplicate_ids)]

# # Combine the kept duplicates with non-duplicates
# cleaned_sampled_df = pd.concat([non_duplicate_df, unique_duplicates_df], ignore_index=True)

# # Check how many more rows are needed to reach 10,000 unique BDSPPatientID
# current_unique_count = cleaned_sampled_df['BDSPPatientID'].nunique()
# additional_rows_needed = 10000 - current_unique_count

# if additional_rows_needed > 0:
#     # Identify non-sampled patients
#     remaining_patients = complete_df_initial[~complete_df_initial['BDSPPatientID'].isin(cleaned_sampled_df['BDSPPatientID'])]
    
#     # Sample additional rows to reach the desired count
#     additional_df = remaining_patients.sample(n=additional_rows_needed, random_state=103)
    
#     # Combine cleaned sample with additional rows
#     final_df = pd.concat([cleaned_sampled_df, additional_df], ignore_index=True)
# else:
#     final_df = cleaned_sampled_df

# # Ensure final_df contains 10,000 unique BDSPPatientID values
# final_df = final_df.drop_duplicates(subset='BDSPPatientID', keep='first')
# final_df = final_df.reset_index(drop=True)

# # Check for and remove any additional duplicates, ensuring exactly 10,000 rows
# while final_df['BDSPPatientID'].nunique() < 10000:
#     current_count = final_df['BDSPPatientID'].nunique()
#     additional_needed = 10000 - current_count
    
#     remaining_patients = complete_df_initial[~complete_df_initial['BDSPPatientID'].isin(final_df['BDSPPatientID'])]
#     additional_df = remaining_patients.sample(n=additional_needed, random_state=104)
    
#     final_df = pd.concat([final_df, additional_df], ignore_index=True)
#     final_df = final_df.drop_duplicates(subset='BDSPPatientID', keep='first').reset_index(drop=True)

# # Save the final DataFrame to a new CSV file
# final_df.to_csv("/home/gregory178/Desktop/NAX project/FM_Draft_15/10000_rows_BI.csv", index=False)

# print("Final DataFrame with 10,000 unique BDSPPatientID rows saved successfully.")



Final DataFrame with 10,000 unique BDSPPatientID rows saved successfully.
